In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [23]:
pd.set_option('display.max_colwidth', 1)

## Loading .txt file

In [24]:
# Step 1: Read the file
file_path = "dialogs.txt"  # Replace with your file path
with open(file_path, 'r') as file:
    lines = file.readlines()

# Step 2: Process the data
# Splitting each line into question and response based on the tab separator
data = []
for line in lines:
    parts = line.strip().split("\t")
    if len(parts) == 2:  # Ensure both question and response are present
        data.append(parts)

# Convert to a DataFrame for easier manipulation
df = pd.DataFrame(data, columns=["Question", "Response"])

# Display the first few rows
df.head()

Unnamed: 0,Question,Response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [25]:
df.shape

(3725, 2)

## Preprocessing

#### Handling Missing Values

In [26]:
print(df.isna().sum())

print("No Handling as not missing values")

Question    0
Response    0
dtype: int64
No Handling as not missing values


#### Handling Duplicates

In [27]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
3720    False
3721    False
3722    False
3723    False
3724    False
Length: 3725, dtype: bool

## NLP Text Preprocessing

#### Lower casing

In [28]:
df = df.map(str.lower)
df.head()

Unnamed: 0,Question,Response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


#### Contractions

In [29]:
import contractions
def expand_contractions(text):
    return ' '.join([contractions.fix(word) for word in text.split()])

df['Question'] = df['Question'].apply(expand_contractions)
df['Response'] = df['Response'].apply(expand_contractions)
df.head()

Unnamed: 0,Question,Response
0,"hi, how are you doing?",i am fine. how about yourself?
1,i am fine. how about yourself?,i am pretty good. thanks for asking.
2,i am pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i have been great. what about you?
4,i have been great. what about you?,i have been good. i am in school right now.


#### Punctuations Removal

In [30]:
import string
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Apply the function to both columns
df['Question'] = df['Question'].apply(remove_punctuation)
df['Response'] = df['Response'].apply(remove_punctuation)
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


#### Numbers removal

In [31]:
import string
translation_table = str.maketrans('', '', string.digits)
df['Question'] = df['Question'].apply(lambda x: x.translate(translation_table))
df['Response'] = df['Response'].apply(lambda x: x.translate(translation_table))
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


#### Removing Extra Spaces

In [32]:
import re
def remove_extra_spaces_with_re(text):
    return re.sub(r'\s+', ' ', text.strip())

df['Question'] = df['Question'].apply(remove_extra_spaces_with_re)
df['Response'] = df['Response'].apply(remove_extra_spaces_with_re)
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


#### Handling Repeated Punctuations

In [33]:
import re
def replace_repeated_puncs(text):
    return re.sub(r'([!?/\.])\1+', r'\1', text)
df['Question'] = df['Question'].apply(replace_repeated_puncs)
df['Response'] = df['Response'].apply(replace_repeated_puncs)
df.head()

Unnamed: 0,Question,Response
0,hi how are you doing,i am fine how about yourself
1,i am fine how about yourself,i am pretty good thanks for asking
2,i am pretty good thanks for asking,no problem so how have you been
3,no problem so how have you been,i have been great what about you
4,i have been great what about you,i have been good i am in school right now


## Tokenization

In [34]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Step 3: Tokenize each row in the 'question' and 'response' columns
df['question_tokens'] = df['Question'].apply(lambda x: tokenizer.tokenize(x))
df['response_tokens'] = df['Response'].apply(lambda x: tokenizer.tokenize(x))
df.head()

Unnamed: 0,Question,Response,question_tokens,response_tokens
0,hi how are you doing,i am fine how about yourself,"[hi, how, are, you, doing]","[i, am, fine, how, about, yourself]"
1,i am fine how about yourself,i am pretty good thanks for asking,"[i, am, fine, how, about, yourself]","[i, am, pretty, good, thanks, for, asking]"
2,i am pretty good thanks for asking,no problem so how have you been,"[i, am, pretty, good, thanks, for, asking]","[no, problem, so, how, have, you, been]"
3,no problem so how have you been,i have been great what about you,"[no, problem, so, how, have, you, been]","[i, have, been, great, what, about, you]"
4,i have been great what about you,i have been good i am in school right now,"[i, have, been, great, what, about, you]","[i, have, been, good, i, am, in, school, right, now]"


## Stemming & Lemmatization

In [35]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
df['question_tokens']=df['question_tokens'].apply(lambda x:lemmatizer(x))
df['response_tokens'] = df['response_tokens'].apply(lambda x: lemmatizer(x))
df.head()

Unnamed: 0,Question,Response,question_tokens,response_tokens
0,hi how are you doing,i am fine how about yourself,"[hi, how, are, you, doing]","[i, am, fine, how, about, yourself]"
1,i am fine how about yourself,i am pretty good thanks for asking,"[i, am, fine, how, about, yourself]","[i, am, pretty, good, thanks, for, asking]"
2,i am pretty good thanks for asking,no problem so how have you been,"[i, am, pretty, good, thanks, for, asking]","[no, problem, so, how, have, you, been]"
3,no problem so how have you been,i have been great what about you,"[no, problem, so, how, have, you, been]","[i, have, been, great, what, about, you]"
4,i have been great what about you,i have been good i am in school right now,"[i, have, been, great, what, about, you]","[i, have, been, good, i, am, in, school, right, now]"


## Text Representation

In [36]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Load pre-trained transformer model and tokenizer
model_name = "bert-base-uncased"  # You can replace this with any other model like "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Function to get embeddings
def get_embeddings(texts):
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
    return embeddings

# Get embeddings for 'Question' and 'Response'
df["Question_Embeddings"] = list(get_embeddings(df["Question"].tolist()).numpy())
df["Response_Embeddings"] = list(get_embeddings(df["Response"].tolist()).numpy())

In [46]:
df["Question_Embeddings"].head(2)

0    [-0.084322825, -0.29642522, 0.107464604, 0.06761158, 0.045005273, -0.36714563, -0.038330637, 0.4176241, 0.14506508, -0.40889618, -0.025968947, -0.13267325, -0.025408257, 0.05544834, -0.26655722, 0.32937482, 0.058801685, 0.2423604, -0.06465361, 0.46883267, -0.032088682, -0.27868727, 0.032381117, 0.05643771, 0.06908245, -0.067590125, -0.56853, 0.108709075, -0.11642273, -0.21876977, 0.017847342, 0.14752997, -0.20447387, 0.10257865, 0.18059957, 0.1654304, -0.17744726, 0.057701472, -0.3381567, 0.32482374, -0.5958184, -0.43422377, -0.04663925, 0.0012705732, -0.35299122, -0.62987703, 0.051803123, -0.19315194, -0.1997378, -0.21552132, -0.079806454, 0.16827577, -0.50703984, -0.014744756, -0.56430286, 0.2152796, 0.017850423, -0.67047054, -0.69661885, 0.06297649, 0.23837209, -0.28224155, -0.24002466, 0.08230722, 0.12211145, 0.1616512, 0.060312867, 0.1993176, -0.45128775, 0.47030544, -0.44425106, -0.20570198, -0.03653549, -0.15316619, -0.15003, 0.010101399, -0.19414373, 0.20522928, 0.01526631

In [47]:
df["Response_Embeddings"].head(2)

0    [0.08694417, -0.3504896, -0.010461641, -0.0007057637, 0.22674249, -0.2164548, -0.17136958, 0.37046382, -0.03513408, -0.29210237, 0.07565384, 0.21186353, 0.09164712, 0.10223167, -0.19707146, 0.23126665, 0.23198037, 0.256825, 0.089198664, 0.25338736, 0.15155251, -0.17072378, 0.034023475, 0.10554478, 0.22117485, -0.019442948, -0.17113426, 0.3258734, -0.0798314, -0.25878772, -0.09259739, 0.11437352, -0.27960265, 0.2795223, -0.34646255, -0.089238405, -0.2386611, 0.15108551, -0.5196974, 0.12126971, -0.5821555, -0.24259382, -0.03919399, -0.051267087, 0.067544214, -0.24699745, 0.20256238, -0.3565829, 0.05417842, -0.13439111, -0.34780198, 0.15631181, -0.32066688, 0.11451554, -0.3652254, -0.03582444, 0.015126505, -0.43574685, -0.6822088, -0.03926374, 0.23996352, 0.104848556, 0.12602362, 0.17850202, 0.29183543, 0.17611331, 0.17022137, -0.24079514, -0.26498553, 0.4562868, -0.3451291, -0.17088072, -0.47321537, -0.27807066, 0.023660583, 0.31594115, -0.16463369, -0.005742748, -0.10848736, 0.0693

## Model Building