In [299]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [300]:
# Show full cell text in DataFrame output. None is the documented "no limit"
# value for this option; a tiny positive width only avoids truncation by accident
# and leaves plain-text columns unpadded.
pd.set_option('display.max_colwidth', None)

## Loading .txt file

In [301]:
# Step 1: Read the file
file_path = "dialogs.txt"  # Replace with your file path
# Decode explicitly as UTF-8 so the parsed text does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Step 2: Process the data
# Each usable line has the form "question<TAB>response". Lines that do not split
# into exactly two tab-separated fields (blank lines, missing or extra tabs) are
# skipped, so the frame below may have fewer rows than the file has lines.
data = []
for line in lines:
    parts = line.strip().split("\t")
    if len(parts) == 2:  # Ensure both question and response are present
        data.append(parts)

# Convert to a DataFrame for easier manipulation
df = pd.DataFrame(data, columns=["Question", "Response"])

# Display the first few rows
df.head()

Unnamed: 0,Question,Response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [302]:
# Sanity check: (number of rows, number of columns) of the parsed pairs.
df.shape

(3725, 2)

## Preprocessing

#### Handling Missing Values

In [303]:
# Count the missing cells in each column and report them.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Nothing is imputed or dropped in this step; the message records that decision.
print("No Handling as not missing values")

Question    0
Response    0
dtype: int64
No Handling as not missing values


#### Handling Duplicates

In [304]:
# Drop exact repeats of a (Question, Response) pair, keeping the first occurrence.
# The result is re-bound rather than mutated with inplace=True, and
# ignore_index=True relabels the rows 0..n-1 so label-based and position-based
# indexing agree in later cells.
df = df.drop_duplicates(subset=['Question', 'Response'], ignore_index=True)
df.describe()

Unnamed: 0,Question,Response
count,3724,3724
unique,3510,3512
top,what do you mean?,what do you mean?
freq,22,22


In [305]:
# Row count after de-duplication (compare with the shape shown right after loading).
df.shape

(3724, 2)

## NLP Text Preprocessing

#### Lower casing

In [306]:
# Lower-case every cell: DataFrame.map applies str.lower element-wise to both columns.
# NOTE(review): str.lower raises TypeError on a non-string cell (e.g. NaN); the
# missing-value check earlier in this notebook reported none.
df = df.map(str.lower)
df.head()

Unnamed: 0,Question,Response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


#### Contractions

In [307]:
import contractions
def expand_contractions(text):
    """Expand contractions in ``text`` one whitespace-separated word at a time.

    Every word is passed through ``contractions.fix`` and the results are
    re-joined with single spaces, so leading/trailing whitespace is dropped and
    internal runs of whitespace collapse as a side effect.
    """
    expanded_words = map(contractions.fix, text.split())
    return ' '.join(expanded_words)

# Expand contractions in the question text and the response text alike.
for column in ('Question', 'Response'):
    df[column] = df[column].map(expand_contractions)
df.head()

Unnamed: 0,Question,Response
0,"hi, how are you doing?",i am fine. how about yourself?
1,i am fine. how about yourself?,i am pretty good. thanks for asking.
2,i am pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i have been great. what about you?
4,i have been great. what about you?,i have been good. i am in school right now.


#### Punctuation Removal

In [308]:
import string
# Built once instead of on every call: maps each character of string.punctuation
# to None, which str.translate treats as "delete this character".
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so "well-known" becomes
    "wellknown". Only the ASCII punctuation listed in ``string.punctuation`` is
    affected; other symbols are left as they are.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from the question text and the response text
for text_column in ('Question', 'Response'):
    df[text_column] = df[text_column].map(remove_punctuation)
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


#### Numbers removal

In [309]:
import string
# Translation table that deletes the characters in string.digits.
translation_table = str.maketrans('', '', string.digits)

# Remove the digits from both text columns.
for column in ('Question', 'Response'):
    df[column] = df[column].map(lambda text: text.translate(translation_table))
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


#### Removing Extra Spaces

In [310]:
import re
def remove_extra_spaces_with_re(text):
    """Trim ``text`` and collapse each run of whitespace into a single space."""
    trimmed = text.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed

# Normalise whitespace in both text columns (earlier removals can leave gaps behind).
for text_column in ['Question', 'Response']:
    df[text_column] = df[text_column].map(remove_extra_spaces_with_re)
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


#### Handling Repeated Punctuation

In [311]:
import re
def replace_repeated_puncs(text):
    """Collapse a run of the same mark among ``!``, ``?``, ``/`` and ``.`` to one.

    Only consecutive repeats of the *same* character are collapsed ("??" -> "?");
    a mix such as "?!" is left unchanged.
    """
    repeated_mark = r'([!?/\.])\1+'
    return re.sub(repeated_mark, lambda match: match.group(1), text)
# NOTE(review): the punctuation-removal step earlier in this notebook already deletes
# every character in string.punctuation (which includes ! ? / .), so at this point
# in the pipeline these substitutions find nothing left to collapse.
for column_name in ('Question', 'Response'):
    df[column_name] = df[column_name].map(replace_repeated_puncs)
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


#### Decoder Input and Label Preparation (currently disabled)

In [312]:
#df['decoder_input'] = df['Response'].apply(lambda x: 'sos ' +x)
#df['decoder_label'] = df['Response'].apply(lambda x: x+" eo>")

In [313]:
# Preview after cleaning. The decoder_input / decoder_label assignments in the previous
# cell are commented out, so the frame still has only the Question and Response columns.
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


## Tokenization

In [314]:
from transformers import AutoTokenizer

# Load the pretrained "bert-base-uncased" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize each row of the cleaned 'Question' and 'Response' columns into a
# list of token strings, stored in new processed_* columns.
for text_col, token_col in (
    ('Question', 'processed_question'),
    ('Response', 'processed_response'),
):
    df[token_col] = df[text_col].apply(tokenizer.tokenize)
df.head()

Unnamed: 0,Question,Response,processed_question,processed_response
0,hi how are you doing,i am fine how about yourself,"[hi, how, are, you, doing]","[i, am, fine, how, about, yourself]"
1,i am fine how about yourself,i am pretty good thanks for asking,"[i, am, fine, how, about, yourself]","[i, am, pretty, good, thanks, for, asking]"
2,i am pretty good thanks for asking,no problem so how have you been,"[i, am, pretty, good, thanks, for, asking]","[no, problem, so, how, have, you, been]"
3,no problem so how have you been,i have been great what about you,"[no, problem, so, how, have, you, been]","[i, have, been, great, what, about, you]"
4,i have been great what about you,i have been good i am in school right now,"[i, have, been, great, what, about, you]","[i, have, been, good, i, am, in, school, right, now]"


## Lemmatization (no stemming is applied)

In [315]:
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, reused by lemmatizer() for every token.
# NOTE(review): NLTK's WordNetLemmatizer needs the WordNet corpus available locally
# (typically via nltk.download('wordnet')); this notebook does not show that step — confirm.
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatization helper
def lemmatizer(text):
    """Return a new list with each token of ``text`` lemmatized.

    ``text`` is an iterable of token strings (not a raw sentence). Each token is
    passed to ``wordnet_lemmatizer.lemmatize`` without a part-of-speech argument,
    so the lemmatizer's default part of speech applies to every token.
    """
    lemmas = []
    for token in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas
# Lemmatize the token lists of both processed columns.
for token_column in ('processed_question', 'processed_response'):
    df[token_column] = df[token_column].apply(lemmatizer)
df.head()

Unnamed: 0,Question,Response,processed_question,processed_response
0,hi how are you doing,i am fine how about yourself,"[hi, how, are, you, doing]","[i, am, fine, how, about, yourself]"
1,i am fine how about yourself,i am pretty good thanks for asking,"[i, am, fine, how, about, yourself]","[i, am, pretty, good, thanks, for, asking]"
2,i am pretty good thanks for asking,no problem so how have you been,"[i, am, pretty, good, thanks, for, asking]","[no, problem, so, how, have, you, been]"
3,no problem so how have you been,i have been great what about you,"[no, problem, so, how, have, you, been]","[i, have, been, great, what, about, you]"
4,i have been great what about you,i have been good i am in school right now,"[i, have, been, great, what, about, you]","[i, have, been, good, i, am, in, school, right, now]"


#### Train Test Split

In [316]:
"""
from sklearn.model_selection import train_test_split

encoder_input = np.array(df['Question'])
decoder_input = np.array(df['decoder_input'])
decoder_label = np.array(df['decoder_label'])

n_rows = df.shape[0]
print(n_rows)

indices = np.arange(n_rows)
np.random.shuffle(indices)

encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_label = decoder_label[indices]

train_size = 0.8

train_encoder_input = encoder_input[:int(n_rows*train_size)]
train_decoder_input = decoder_input[:int(n_rows*train_size)]
train_decoder_label = decoder_label[:int(n_rows*train_size)]

test_encoder_input = encoder_input[int(n_rows*train_size):]
test_decoder_input = decoder_input[int(n_rows*train_size):]
test_decoder_label = decoder_label[int(n_rows*train_size):]


print(train_encoder_input.shape)
print(train_decoder_input.shape)
print(train_decoder_label.shape)

print("================")

print(test_encoder_input.shape)
print(test_decoder_input.shape)
print(test_decoder_label.shape)
"""



#### Vocabulary Building

In [317]:
from collections import Counter

# Flatten every token list into one stream: all question tokens first, then all
# response tokens.
all_tokens = [
    token
    for column_name in ("processed_question", "processed_response")
    for tokens in df[column_name]
    for token in tokens
]

# Count the frequency of each token (keys stay in first-seen order)
token_counts = Counter(all_tokens)

# Create vocabulary: tokens get the ids 1..N in first-seen order, leaving 0 free
vocabulary = dict(zip(token_counts, range(1, len(token_counts) + 1)))

# Add special tokens to the vocabulary
vocabulary["<PAD>"] = 0  # Padding token
vocabulary["<UNK>"] = len(vocabulary)  # Unknown token: N + 1, since <PAD> is already counted

# Display the vocabulary size
print("Vocabulary:", len(vocabulary))

# Reverse vocabulary (for decoding indices back to tokens)
reverse_vocab = dict(zip(vocabulary.values(), vocabulary.keys()))
print("Decoding: ", len(reverse_vocab))

# Optionally save the vocabulary
import json
with open("vocabulary.json", "w") as f:
    json.dump(vocabulary, f)

print("\nVocabulary saved to 'vocabulary.json'")

Vocabulary: 2225
Decoding:  2225

Vocabulary saved to 'vocabulary.json'


#### Padding & Truncating Sequence

In [318]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def tokens_to_sequence(tokens, vocab):
    """Map each token to its id in ``vocab``, using the "<UNK>" id for unknown tokens.

    ``vocab`` must contain the key "<UNK>" whenever ``tokens`` is non-empty,
    because the fallback id is looked up for every token.
    """
    sequence = []
    for token in tokens:
        sequence.append(vocab.get(token, vocab["<UNK>"]))
    return sequence

# Convert the question and response token lists to lists of vocabulary ids
for token_col, seq_col in (
    ("processed_question", "question_seq"),
    ("processed_response", "response_seq"),
):
    df[seq_col] = df[token_col].apply(tokens_to_sequence, args=(vocabulary,))

# Determine the maximum sequence length across both columns
longest_question = df["question_seq"].map(len).max()
longest_response = df["response_seq"].map(len).max()
max_seq_length = int(max(longest_question, longest_response))

print(f"Maximum sequence length: {max_seq_length}")

# Pad the sequences: zeros are appended ("post") up to max_seq_length. Because
# maxlen equals the longest sequence, nothing is truncated here.
for seq_col, padded_col in (
    ("question_seq", "padded_question"),
    ("response_seq", "padded_response"),
):
    padded = pad_sequences(df[seq_col], maxlen=max_seq_length, padding="post")
    df[padded_col] = padded.tolist()

# Display the results
print("\nProcessed DataFrame:")
print(df[["padded_question", "padded_response"]])

# Save padded sequences (optional)
df.to_csv("processed_sequences.csv", index=False)
print("\nPadded sequences saved to 'processed_sequences.csv'")

Maximum sequence length: 20

Processed DataFrame:
                                                                      padded_question  \
0     [1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]                      
1     [6, 7, 8, 2, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]                     
2     [6, 7, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]                 
3     [16, 17, 18, 2, 19, 4, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]                 
4     [6, 19, 20, 21, 22, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]                  
...                                                                ...                  
3720  [58, 35, 37, 12, 522, 235, 33, 35, 53, 599, 1030, 0, 0, 0, 0, 0, 0, 0, 0, 0]      
3721  [3, 4, 25, 2119, 1155, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]               
3722  [97, 224, 69, 624, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]                
3723  [4, 3, 355, 165, 247, 25, 1269, 661, 1095, 33, 18, 163

## Vectorization / Feature Extraction

In [319]:
# Preview the frame with every derived column (tokens, id sequences, padded sequences).
df.head()

Unnamed: 0,Question,Response,processed_question,processed_response,question_seq,response_seq,padded_question,padded_response
0,hi how are you doing,i am fine how about yourself,"[hi, how, are, you, doing]","[i, am, fine, how, about, yourself]","[1, 2, 3, 4, 5]","[6, 7, 8, 2, 9, 10]","[1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[6, 7, 8, 2, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,i am fine how about yourself,i am pretty good thanks for asking,"[i, am, fine, how, about, yourself]","[i, am, pretty, good, thanks, for, asking]","[6, 7, 8, 2, 9, 10]","[6, 7, 11, 12, 13, 14, 15]","[6, 7, 8, 2, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[6, 7, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,i am pretty good thanks for asking,no problem so how have you been,"[i, am, pretty, good, thanks, for, asking]","[no, problem, so, how, have, you, been]","[6, 7, 11, 12, 13, 14, 15]","[16, 17, 18, 2, 19, 4, 20]","[6, 7, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[16, 17, 18, 2, 19, 4, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,no problem so how have you been,i have been great what about you,"[no, problem, so, how, have, you, been]","[i, have, been, great, what, about, you]","[16, 17, 18, 2, 19, 4, 20]","[6, 19, 20, 21, 22, 9, 4]","[16, 17, 18, 2, 19, 4, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[6, 19, 20, 21, 22, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,i have been great what about you,i have been good i am in school right now,"[i, have, been, great, what, about, you]","[i, have, been, good, i, am, in, school, right, now]","[6, 19, 20, 21, 22, 9, 4]","[6, 19, 20, 12, 6, 7, 23, 24, 25, 26]","[6, 19, 20, 21, 22, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[6, 19, 20, 12, 6, 7, 23, 24, 25, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [320]:
# Each cell holds one padded list, so the column is a 1-D Series: its shape is
# (n_rows,), not (n_rows, max_seq_length).
df['padded_question'].shape

(3724,)

#### Train Test Split

#### Model Building

#### Prediction