In [None]:
# Step 1: Parse the SRT File
# Step 2: Create Input-Response Pairs
# Step 3: Train a Chatbot Model
# Step 4: Example with HuggingFace GPT-2 (Fine-tuning)
# Step 5: Test the Chatbot

# Step 1: Parse the SRT File
Extract only the dialogue from the .srt file, discarding the timestamps and sequence numbers.

We’ll:

- Ignore numeric indices and timestamps

- Keep only the spoken lines

- Pair each spoken line with the line that follows it (line 1 → line 2, line 2 → line 3, …); the first line of each pair is treated as the user input and the second as the response

In [None]:
import re

def parse_srt(filepath):
    """Return the spoken lines of an SRT subtitle file, in order.

    The file is split into cue blocks at blank lines. Within each block,
    everything up to and including the timestamp line
    (``HH:MM:SS,mmm --> HH:MM:SS,mmm``) is discarded and the remaining
    non-empty lines are collected, one list entry per subtitle line.

    Args:
        filepath: Path to a UTF-8 encoded ``.srt`` file. A leading
            byte-order mark is tolerated.

    Returns:
        list[str]: Subtitle text lines with surrounding whitespace removed.
        Blocks that contain no timestamp line contribute nothing.
    """
    # 'utf-8-sig' decodes plain UTF-8 too, but drops a leading byte-order mark
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        text = file.read()

    timestamp = re.compile(r'\d+:\d{2}:\d{2}[,.]\d+\s*-->\s*\d+:\d{2}:\d{2}[,.]\d+')
    dialogues = []

    # Separator lines may hold stray spaces/tabs, so allow whitespace between the newlines
    for block in re.split(r'\n\s*\n', text.strip()):
        lines = [line.strip() for line in block.split('\n')]
        # Locate the timestamp instead of assuming it is always the second line,
        # so a cue with a missing sequence number does not lose its first text line
        for position, line in enumerate(lines):
            if timestamp.match(line):
                dialogues.extend(spoken for spoken in lines[position + 1:] if spoken)
                break

    return dialogues

# Read the pilot-episode subtitles, then print the first few spoken lines as a sanity check
SRT_PATH = '/mnt/data/suits-1x01-pilot.en.srt'
PREVIEW_COUNT = 10

dialogues = parse_srt(SRT_PATH)
for line in dialogues[:PREVIEW_COUNT]:
    print(line)


# Step 2: Create Input-Response Pairs
To train a chatbot, we need conversation pairs:

In [None]:
# Build (input, response) pairs from every two neighbouring lines,
# skipping any pair in which either side is empty after stripping.
cleaned = [utterance.strip() for utterance in dialogues]
pairs = [
    (prompt, reply)
    for prompt, reply in zip(cleaned, cleaned[1:])
    if prompt and reply
]


# delete everything after here

### What is the Natural Language Toolkit (NLTK)?

**Key Features of NLTK:**

In [None]:
# 1. Tokenization – Breaking text into words or sentences
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data. Fetch it here when it is missing so
# this cell runs on a fresh kernel/machine without relying on a download made elsewhere.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

word_tokenize("Hello world!")

['Hello', 'world', '!']

In [4]:
# 2. Part-of-Speech (POS) Tagging – Labeling words with their grammatical roles
import nltk
from nltk import pos_tag

# Data packages used by the tokenizer and the tagger, fetched in this order
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

pos_tag(word_tokenize("He runs fast."))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rurig\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rurig\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('He', 'PRP'), ('runs', 'VBZ'), ('fast', 'RB'), ('.', '.')]

In [9]:
# 3. Stemming and Lemmatization – Reducing words to their root form
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmer.stem("running")  # Output: 'run'


'run'

In [None]:
# 4. Named Entity Recognition (NER) – Identifying names of people, organizations, etc.
# 5. Parsing and Syntax Trees – Analyzing grammatical structure of sentences
# 6. Sentiment Analysis – Determining the emotion or opinion in text (with training)
# 7. Stop Words Removal – Removing common words (like “the”, “is”, “in”)
# 8. Text Classification and Machine Learning – Building simple NLP models

# Use Cases:
- Chatbot development

- Text summarization

- Information extraction

- Language translation

- Spam filtering

In [13]:
# ! pip install spacy
! pip install --upgrade pip setuptools wheel

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
Collecting setuptools
  Using cached setuptools-75.3.2-py3-none-any.whl (1.3 MB)
Collecting wheel
  Downloading wheel-0.45.1-py3-none-any.whl (72 kB)
Installing collected packages: pip, setuptools, wheel
  Attempting uninstall: pip
    Found existing installation: pip 20.2.3
    Uninstalling pip-20.2.3:
      Successfully uninstalled pip-20.2.3
  Attempting uninstall: setuptools
    Found existing installation: setuptools 50.3.0.post20201103
    Uninstalling setuptools-50.3.0.post20201103:
      Successfully uninstalled setuptools-50.3.0.post20201103
  Attempting uninstall: wheel
    Found existing installation: wheel 0.35.1
    Uninstalling wheel-0.35.1:
      Successfully uninstalled wheel-0.35.1
Successfully installed pip-25.0.1 setuptools-75.3.2 wheel-0.45.1


ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

tensorboard 2.3.0 requires google-auth<2,>=1.6.3, but you'll have google-auth 2.40.3 which is incompatible.


In [16]:
# import spacy
# print(spacy.__version__)

!pip install spacy

Collecting spacy
  Downloading spacy-3.8.2.tar.gz (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 878.0 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [113 lines of output]
      Ignoring numpy: markers 'python_version >= "3.9"' don't match your environment
      Collecting setuptools
        Downloading setuptools-75.3.2-py3-none-any.whl.metadata (6.9 kB)
      Collecting cython<3.0,>=0.25
        Downloading Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Downloading cymem-2.0.11.tar.gz (10 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Preparing metadata (pyproject.toml): started
        Preparing metadata (pyproject.toml): finished with status 'done'
      Collecting preshed<3.1.0,>=3.0.2
        Downloading preshed-3.0.10.tar.gz (15 kB)
    

In [17]:
import spacy

# Named-entity recognition demo: run the small English pipeline over one sentence
# and print every detected entity together with its label.
# NOTE(review): presumably the "en_core_web_sm" model package must already be
# installed for spacy.load to succeed — confirm before running.
nlp = spacy.load("en_core_web_sm")
sample_sentence = "Apple is looking at buying a startup in the UK."
doc = nlp(sample_sentence)
for ent in doc.ents:
    print(ent.text, ent.label_)

ModuleNotFoundError: No module named 'spacy'