In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [69]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q
    

In [94]:
df = pd.read_excel('/kaggle/input/text-reason/evaluation.xlsx')
# df = pd.read_excel("/kaggle/input/text-reason/train.xlsx")

In [71]:
df.reason = df.reason.apply(preprocess)
df.text = df.text.apply(preprocess)

In [72]:
! pip install nlpaug -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [73]:
import nlpaug
import nlpaug.augmenter.word as naw

In [74]:
aug = naw.SynonymAug(aug_src='wordnet',aug_max=4)
aug2 = naw.AntonymAug()

In [75]:
antonmy = []
y = []

for i in range(len(df)) :
    temp = []
    temp.append(df.text.iloc[i])
    temp.append(aug2.augment(df.reason.iloc[i])[0])
    antonmy.append(temp)
    y.append(0)
    
for i in range(len(df)) :
    temp = []
    temp.append(aug2.augment(df.text.iloc[i])[0])
    temp.append(df.reason.iloc[i])
    antonmy.append(temp)
    y.append(0)
    
for i in range(len(df)) :
    temp = []
    temp.append(aug.augment(df.text.iloc[i])[0])
    temp.append(df.reason.iloc[i])
    antonmy.append(temp)
    y.append(1)
    
antonmy = np.array(antonmy)
y = np.array(y)
print(antonmy.shape, y.shape)

(27000, 2) (27000,)


In [None]:
new = pd.DataFrame(antonmy, columns=['text','reason'])
new['label'] = y
new

In [None]:
temp = pd.concat([df, new], axis=0)
df = temp.sample(frac=1).reset_index(drop=True)

In [None]:
df.columns

In [95]:
new_df = df.rename(columns = {'text':'question1', 'reason':'question2', 'label':'is_duplicate'})

In [96]:
new_df.is_duplicate = new_df.is_duplicate.apply(lambda x : int(x))

In [97]:
new_df.head()

Unnamed: 0,question1,question2,is_duplicate
0,the app is crashing when i play a vedio,app crashes during playback,1
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0
2,very helpful when and home working remotley,good app for work,0
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0
4,one of my favorite apps,good for spending time,0


In [98]:
preprocess("I've already! wasn't <b>done</b>?")

'i have already  was not done'

In [99]:
new_df['question1'] = new_df['question1'].apply(preprocess)
new_df['question2'] = new_df['question2'].apply(preprocess)

In [100]:
new_df.head()

Unnamed: 0,question1,question2,is_duplicate
0,the app is crashing when i play a vedio,app crashes during playback,1
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0
2,very helpful when and home working remotley,good app for work,0
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0
4,one of my favorite apps,good for spending time,0


In [101]:
new_df['q1_len'] = new_df['question1'].str.len() 
new_df['q2_len'] = new_df['question2'].str.len()

In [102]:
new_df['q1_num_words'] = new_df['question1'].apply(lambda row: len(row.split(" ")))
new_df['q2_num_words'] = new_df['question2'].apply(lambda row: len(row.split(" ")))
new_df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words
0,the app is crashing when i play a vedio,app crashes during playback,1,39,27,9,4
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0,61,46,14,6
2,very helpful when and home working remotley,good app for work,0,43,17,7,4
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0,53,40,10,5
4,one of my favorite apps,good for spending time,0,23,22,5,4


In [103]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return len(w1 & w2)

In [104]:
new_df['word_common'] = new_df.apply(common_words, axis=1)
new_df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common
0,the app is crashing when i play a vedio,app crashes during playback,1,39,27,9,4,1
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0,61,46,14,6,1
2,very helpful when and home working remotley,good app for work,0,43,17,7,4,0
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0,53,40,10,5,1
4,one of my favorite apps,good for spending time,0,23,22,5,4,0


In [105]:
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return (len(w1) + len(w2))

In [106]:
new_df['word_total'] = new_df.apply(total_words, axis=1)
new_df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total
0,the app is crashing when i play a vedio,app crashes during playback,1,39,27,9,4,1,13
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0,61,46,14,6,1,18
2,very helpful when and home working remotley,good app for work,0,43,17,7,4,0,11
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0,53,40,10,5,1,14
4,one of my favorite apps,good for spending time,0,23,22,5,4,0,9


In [107]:
new_df['word_share'] = round(new_df['word_common']/new_df['word_total'],2)
new_df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share
0,the app is crashing when i play a vedio,app crashes during playback,1,39,27,9,4,1,13,0.08
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0,61,46,14,6,1,18,0.06
2,very helpful when and home working remotley,good app for work,0,43,17,7,4,0,11,0.0
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0,53,40,10,5,1,14,0.07
4,one of my favorite apps,good for spending time,0,23,22,5,4,0,9,0.0


In [108]:
# Advanced Features
from nltk.corpus import stopwords

def fetch_token_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    SAFE_DIV = 0.0001 

    STOP_WORDS = stopwords.words("english")
    
    token_features = [0.0]*8
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    
    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    
    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))
    
    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    
    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    
    
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    
    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    
    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    
    return token_features


In [109]:
token_features = new_df.apply(fetch_token_features, axis=1)

new_df["cwc_min"]       = list(map(lambda x: x[0], token_features))
new_df["cwc_max"]       = list(map(lambda x: x[1], token_features))
new_df["csc_min"]       = list(map(lambda x: x[2], token_features))
new_df["csc_max"]       = list(map(lambda x: x[3], token_features))
new_df["ctc_min"]       = list(map(lambda x: x[4], token_features))
new_df["ctc_max"]       = list(map(lambda x: x[5], token_features))
new_df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
new_df["first_word_eq"] = list(map(lambda x: x[7], token_features))

In [110]:
new_df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq
0,the app is crashing when i play a vedio,app crashes during playback,1,39,27,9,4,1,13,0.08,0.333322,0.249994,0.0,0.0,0.249994,0.11111,0,0
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0,61,46,14,6,1,18,0.06,0.249994,0.166664,0.0,0.0,0.166664,0.071428,0,0
2,very helpful when and home working remotley,good app for work,0,43,17,7,4,0,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0,53,40,10,5,1,14,0.07,0.199996,0.166664,0.0,0.0,0.199996,0.099999,0,0
4,one of my favorite apps,good for spending time,0,23,22,5,4,0,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [111]:
! pip install distance -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [112]:
import distance

def fetch_length_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    length_features = [0.0]*3
    
    # Converting the Sentence into Tokens: 
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features
    
    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))
    
    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2
    
    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)
    
    return length_features
    

In [113]:
length_features = new_df.apply(fetch_length_features, axis=1)

new_df['abs_len_diff'] = list(map(lambda x: x[0], length_features))
new_df['mean_len'] = list(map(lambda x: x[1], length_features))
new_df['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))

In [114]:
new_df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,...,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio
0,the app is crashing when i play a vedio,app crashes during playback,1,39,27,9,4,1,13,0.08,...,0.249994,0.0,0.0,0.249994,0.11111,0,0,5,6.5,0.214286
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0,61,46,14,6,1,18,0.06,...,0.166664,0.0,0.0,0.166664,0.071428,0,0,8,10.0,0.106383
2,very helpful when and home working remotley,good app for work,0,43,17,7,4,0,11,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,3,5.5,0.277778
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0,53,40,10,5,1,14,0.07,...,0.166664,0.0,0.0,0.199996,0.099999,0,0,5,7.5,0.195122
4,one of my favorite apps,good for spending time,0,23,22,5,4,0,9,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,1,4.5,0.086957


In [115]:
# Fuzzy Features
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):
    
    q1 = row['question1']
    q2 = row['question2']
    
    fuzzy_features = [0.0]*4
    
    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features

In [116]:
fuzzy_features = new_df.apply(fetch_fuzzy_features, axis=1)

# Creating new feature columns for fuzzy features
new_df['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
new_df['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))
new_df['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))
new_df['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))

In [117]:
print(new_df.shape)
new_df.head()

(9000, 25)


Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,the app is crashing when i play a vedio,app crashes during playback,1,39,27,9,4,1,13,0.08,...,0.11111,0,0,5,6.5,0.214286,55,63,52,52
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions,0,61,46,14,6,1,18,0.06,...,0.071428,0,0,8,10.0,0.106383,43,46,37,46
2,very helpful when and home working remotley,good app for work,0,43,17,7,4,0,11,0.0,...,0.0,0,0,3,5.5,0.277778,30,53,37,37
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message,0,53,40,10,5,1,14,0.07,...,0.099999,0,0,5,7.5,0.195122,37,43,41,45
4,one of my favorite apps,good for spending time,0,23,22,5,4,0,9,0.0,...,0.0,0,0,1,4.5,0.086957,40,41,36,36


In [None]:
sns.pairplot(new_df[['ctc_min', 'cwc_min', 'csc_min', 'is_duplicate']],hue='is_duplicate')

In [None]:
sns.pairplot(new_df[['ctc_max', 'cwc_max', 'csc_max', 'is_duplicate']],hue='is_duplicate')

In [None]:
sns.pairplot(new_df[['last_word_eq', 'first_word_eq', 'is_duplicate']],hue='is_duplicate')

In [None]:
sns.pairplot(new_df[['mean_len', 'abs_len_diff','longest_substr_ratio', 'is_duplicate']],hue='is_duplicate')

In [None]:
sns.pairplot(new_df[['fuzz_ratio', 'fuzz_partial_ratio','token_sort_ratio','token_set_ratio', 'is_duplicate']],hue='is_duplicate')

In [None]:
# Using TSNE for Dimentionality reduction for 15 Features(Generated after cleaning the data) to 3 dimention

from sklearn.preprocessing import MinMaxScaler

X = MinMaxScaler().fit_transform(new_df[['cwc_min', 'cwc_max', 'csc_min', 'csc_max' , 'ctc_min' , 'ctc_max' , 'last_word_eq', 'first_word_eq' , 'abs_len_diff' , 'mean_len' , 'token_set_ratio' , 'token_sort_ratio' ,  'fuzz_ratio' , 'fuzz_partial_ratio' , 'longest_substr_ratio']])
y = new_df['is_duplicate'].values

In [None]:
from sklearn.manifold import TSNE

tsne2d = TSNE(
    n_components=2,
    init='random', # pca
    random_state=101,
    method='barnes_hut',
    n_iter=1000,
    verbose=2,
    angle=0.5
).fit_transform(X)

In [None]:
x_df = pd.DataFrame({'x':tsne2d[:,0], 'y':tsne2d[:,1] ,'label':y})

# draw the plot in appropriate place in the grid
sns.lmplot(data=x_df, x='x', y='y', hue='label', fit_reg=False, size=8,palette="Set1",markers=['s','o'])


In [None]:
tsne3d = TSNE(
    n_components=3,
    init='random', # pca
    random_state=101,
    method='barnes_hut',
    n_iter=1000,
    verbose=2,
    angle=0.5
).fit_transform(X)

In [None]:
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.offline as py
py.init_notebook_mode(connected=True)

trace1 = go.Scatter3d(
    x=tsne3d[:,0],
    y=tsne3d[:,1],
    z=tsne3d[:,2],
    mode='markers',
    marker=dict(
        sizemode='diameter',
        color = y,
        colorscale = 'Portland',
        colorbar = dict(title = 'duplicate'),
        line=dict(color='rgb(255, 255, 255)'),
        opacity=0.75
    )
)

data=[trace1]
layout=dict(height=800, width=800, title='3d embedding with engineered features')
fig=dict(data=data, layout=layout)
py.iplot(fig, filename='3DBubble')

In [118]:
ques_df = new_df[['question1','question2']]
ques_df.head()

Unnamed: 0,question1,question2
0,the app is crashing when i play a vedio,app crashes during playback
1,but i want to connect it to the tv from one de...,want compatibility with more smart televisions
2,very helpful when and home working remotley,good app for work
3,this zoom so called and missed call and mobile...,receiving incorrect phone number message
4,one of my favorite apps,good for spending time


In [119]:
final_df = new_df.drop(columns=['question1','question2'])
print(final_df.shape)
final_df.head()

(9000, 23)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,1,39,27,9,4,1,13,0.08,0.333322,0.249994,...,0.11111,0,0,5,6.5,0.214286,55,63,52,52
1,0,61,46,14,6,1,18,0.06,0.249994,0.166664,...,0.071428,0,0,8,10.0,0.106383,43,46,37,46
2,0,43,17,7,4,0,11,0.0,0.0,0.0,...,0.0,0,0,3,5.5,0.277778,30,53,37,37
3,0,53,40,10,5,1,14,0.07,0.199996,0.166664,...,0.099999,0,0,5,7.5,0.195122,37,43,41,45
4,0,23,22,5,4,0,9,0.0,0.0,0.0,...,0.0,0,0,1,4.5,0.086957,40,41,36,36


In [120]:
final_df.shape

(9000, 23)

In [54]:
! pip install -U sentence-transformers -q

[0m

In [121]:
from sentence_transformers import SentenceTransformer

In [56]:
sent_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
sent_model.encode(["hello","second","third"]).shape

In [122]:
ques_df1 = ques_df[:200]
# ques_df1 = ques_df[:44]

output_1 =  sent_model.encode(ques_df1['question1'].tolist())
output_2 =  sent_model.encode(ques_df1['question2'].tolist())

# for i in range(44, 8244, 200) :
for i in range(200, 9000, 200) :
    print(i)
    ques_df1 = ques_df[i:i+200]

    outputs_bert =  sent_model.encode(ques_df1['question1'].tolist())
    output_1 = np.vstack((output_1, outputs_bert))
    outputs_bert =  sent_model.encode(ques_df1['question2'].tolist())   
    output_2 = np.vstack((output_2, outputs_bert))

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

1000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

1200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

1400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

1600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

1800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

3000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

3200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

3400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

3600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

3800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

4000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

4200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

4400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

4600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

4800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

5000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

5200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

5400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

5600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

5800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

6000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

6200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

6400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

6600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

6800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

7000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

7200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

7400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

7600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

7800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

8000


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

8200


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

8400


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

8600


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

8800


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [30]:
! pip install transformers -q

[0m

In [31]:
from transformers import AutoTokenizer,TFBertModel
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

2022-12-31 06:13:45.698452: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-31 06:13:45.699556: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-31 06:13:45.700238: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-31 06:13:45.702417: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [None]:
ques_df1 = ques_df[1000:1200]

In [None]:
ques_df.shape

In [32]:
ques_df1 = ques_df[:200]
# ques_df1 = ques_df[:44]

text = tokenizer(
    text=ques_df1['question1'].tolist(),
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

reason = tokenizer(
    text=ques_df1['question2'].tolist(),
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

output_1 =  np.array(bert(text.input_ids, attention_mask=text.attention_mask)[1])
output_2 =  np.array(bert(reason.input_ids, attention_mask=reason.attention_mask)[1])
# output_bert = np.vstack((output_bert, outputs_bert))
# output_bert.shape

In [33]:
ques_df.shape

(9000, 2)

In [None]:
for i in range(44, 8244, 200) :
    print(i, i+200)

In [34]:
# for i in range(44, 8244, 200) :
for i in range(200, 9000, 200) :
    print(i)
    ques_df1 = ques_df[i:i+200]

    text = tokenizer(
        text=ques_df1['question1'].tolist(),
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids = False,
        return_attention_mask = True,
        verbose = True)

    reason = tokenizer(
        text=ques_df1['question2'].tolist(),
        truncation=True,
        padding=True,
        return_tensors='tf',
        return_token_type_ids = False,
        return_attention_mask = True,
        verbose = True)

    outputs_bert =  bert(text.input_ids, attention_mask=text.attention_mask)[1]
    output_1 = np.vstack((output_1, outputs_bert))
    outputs_bert =  bert(reason.input_ids, attention_mask=reason.attention_mask)[1]    
    output_2 = np.vstack((output_2, outputs_bert))

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800


In [123]:
print(ques_df.shape)
print(output_1.shape)
print(output_2.shape)

(9000, 2)
(9000, 384)
(9000, 384)


In [124]:
temp_df1 = pd.DataFrame(output_1, index= ques_df.index)
temp_df2 = pd.DataFrame(output_2, index= ques_df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(9000, 768)

In [None]:
a = []
for i in range(8244) :
    for j in ques_df.question1.iloc[i].split() :
        a.append(j)
    for j in ques_df.question2.iloc[i].split() :
        a.append(j)
a = set(a)
len(a)

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
# merge texts
questions = list(ques_df['question1']) + list(ques_df['question2'])

cv = TfidfVectorizer(max_features=4500)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(ques_df['question1']) + list(ques_df['question2'])

cv = CountVectorizer(max_features=4500)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [81]:
questions = list(ques_df['question1']) + list(ques_df['question2'])
q1_arr, q2_arr = np.vsplit(cv.transform(questions).toarray(),2)

In [82]:
temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(9000, 9000)

In [125]:
final_df = pd.concat([final_df, temp_df], axis=1)
print(final_df.shape)
final_df.head()

(9000, 791)


Unnamed: 0,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share,cwc_min,cwc_max,...,374,375,376,377,378,379,380,381,382,383
0,1,39,27,9,4,1,13,0.08,0.333322,0.249994,...,0.083752,0.017858,0.021774,-0.006902,0.013482,-0.052682,0.032777,-0.011009,0.001348,0.071552
1,0,61,46,14,6,1,18,0.06,0.249994,0.166664,...,0.013709,-0.06781,0.089256,0.038318,0.010369,0.035648,0.112165,-0.008756,0.047949,0.049029
2,0,43,17,7,4,0,11,0.0,0.0,0.0,...,0.038657,0.073354,0.053726,-0.099888,-0.009144,0.051016,0.106548,0.022287,-0.021756,0.099681
3,0,53,40,10,5,1,14,0.07,0.199996,0.166664,...,0.009429,-0.123919,-0.007291,-0.062368,-0.025236,0.043611,0.021638,0.001129,-0.059774,-0.058623
4,0,23,22,5,4,0,9,0.0,0.0,0.0,...,0.033892,0.037739,0.053023,0.008275,-0.009326,-0.025432,0.030153,0.006286,-0.02011,0.008876


In [126]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(final_df.iloc[:,1:].values,final_df.iloc[:,0].values,test_size=0.7,random_state=1)

In [127]:
X_test = final_df.iloc[:,1:].values
y_test = final_df.iloc[:,0].values

In [128]:
xx = X_test
yy = y_test

In [129]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

RandomForestClassifier()

In [130]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train,y_train)

LogisticRegression()

In [131]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [132]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)

GaussianNB()

In [133]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

KNeighborsClassifier()

In [134]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)

DecisionTreeClassifier()

In [135]:
mod = [rf,xgb, dtc, log, knn, gnb]
mod_name = {rf:'random', xgb:'xgb', dtc:'decision', log:'logistic', knn:'knn', gnb:'gaussian'}

In [136]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, classification_report, roc_auc_score, accuracy_score, balanced_accuracy_score

# Sentence

In [137]:
for i in mod :
    yp = i.predict(xx)
    print(mod_name[i])
    print('accuracy  : ',accuracy_score(yy,yp))
    print('bal accur : ',balanced_accuracy_score(yy,yp))
    print('f1 score  : ',f1_score(yy,yp))
    print('precision : ',precision_score(yy,yp))
    print('roc accur : ',roc_auc_score(yy,yp))
    print('confusion : \n', confusion_matrix(yy, yp))
    print('\n')

random
accuracy  :  0.8717777777777778
bal accur :  0.8293793717368978
f1 score  :  0.7850223546944858
precision :  0.8901563160118293
roc accur :  0.8293793717368979
confusion : 
 [[5739  260]
 [ 894 2107]]


xgb
accuracy  :  0.8777777777777778
bal accur :  0.8468692910553403
f1 score  :  0.8044792036971204
precision :  0.8620952380952381
roc accur :  0.8468692910553403
confusion : 
 [[5637  362]
 [ 738 2263]]


decision
accuracy  :  0.8003333333333333
bal accur :  0.7774523844610557
f1 score  :  0.7030242935052058
precision :  0.6973770491803278
roc accur :  0.7774523844610556
confusion : 
 [[5076  923]
 [ 874 2127]]


logistic
accuracy  :  0.7454444444444445
bal accur :  0.6778275941691715
f1 score  :  0.5543668546975298
precision :  0.6658878504672897
roc accur :  0.6778275941691715
confusion : 
 [[5284  715]
 [1576 1425]]


knn
accuracy  :  0.7174444444444444
bal accur :  0.6536600651924716
f1 score  :  0.5217227760015046
precision :  0.5988773747841105
roc accur :  0.653660065192

# BERT

In [50]:
for i in mod :
    yp = i.predict(xx)
    print(mod_name[i])
    print('accuracy  : ',accuracy_score(yy,yp))
    print('bal accur : ',balanced_accuracy_score(yy,yp))
    print('f1 score  : ',f1_score(yy,yp))
    print('precision : ',precision_score(yy,yp))
    print('roc accur : ',roc_auc_score(yy,yp))
    print('confusion : \n', confusion_matrix(yy, yp))
    print('\n')

random
accuracy  :  0.84
bal accur :  0.7945512300478381
f1 score  :  0.7328385899814472
precision :  0.8267057346169946
roc accur :  0.7945512300478382
confusion : 
 [[5585  414]
 [1026 1975]]


xgb
accuracy  :  0.852
bal accur :  0.8197059278845708
f1 score  :  0.7650793650793651
precision :  0.812663919070813
roc accur :  0.8197059278845708
confusion : 
 [[5499  500]
 [ 832 2169]]


decision
accuracy  :  0.7912222222222223
bal accur :  0.7697852729981266
f1 score  :  0.6926222803860624
precision :  0.6802699228791774
roc accur :  0.7697852729981266
confusion : 
 [[5004  995]
 [ 884 2117]]


logistic
accuracy  :  0.7464444444444445
bal accur :  0.6788275109052664
f1 score  :  0.5558583106267031
precision :  0.6682264857276556
roc accur :  0.6788275109052664
confusion : 
 [[5290  709]
 [1573 1428]]


knn
accuracy  :  0.72
bal accur :  0.6559933708822625
f1 score  :  0.5248868778280543
precision :  0.6044290056448112
roc accur :  0.6559933708822625
confusion : 
 [[5088  911]
 [1609 139

# TF-IDF

In [97]:
for i in mod :
    yp = i.predict(xx)
    print(mod_name[i])
    print('accuracy  : ',accuracy_score(yy,yp))
    print('bal accur : ',balanced_accuracy_score(yy,yp))
    print('f1 score  : ',f1_score(yy,yp))
    print('precision : ',precision_score(yy,yp))
    print('roc accur : ',roc_auc_score(yy,yp))
    print('confusion : \n', confusion_matrix(yy, yp))
    print('\n')

random
accuracy  :  0.8634444444444445
bal accur :  0.8126370778557506
f1 score  :  0.7632440762858794
precision :  0.9045662100456621
roc accur :  0.8126370778557506
confusion : 
 [[5790  209]
 [1020 1981]]


xgb
accuracy  :  0.8675555555555555
bal accur :  0.8364536375300582
f1 score  :  0.7891012031139419
precision :  0.8411920030177291
roc accur :  0.8364536375300582
confusion : 
 [[5578  421]
 [ 771 2230]]


decision
accuracy  :  0.8125555555555556
bal accur :  0.7869536347805163
f1 score  :  0.7164229282232307
precision :  0.7228629579375848
roc accur :  0.7869536347805162
confusion : 
 [[5182  817]
 [ 870 2131]]


logistic
accuracy  :  0.7803333333333333
bal accur :  0.7373041847083366
f1 score  :  0.6486582548427227
precision :  0.694973343488195
roc accur :  0.7373041847083367
confusion : 
 [[5198  801]
 [1176 1825]]


knn
accuracy  :  0.7188888888888889
bal accur :  0.6561590654979206
f1 score  :  0.5260397152491569
precision :  0.6007702182284981
roc accur :  0.6561590654979

# Bag of words 

In [79]:
for i in mod :
    yp = i.predict(xx)
    print(mod_name[i])
    print('accuracy  : ',accuracy_score(yy,yp))
    print('bal accur : ',balanced_accuracy_score(yy,yp))
    print('f1 score  : ',f1_score(yy,yp))
    print('precision : ',precision_score(yy,yp))
    print('roc accur : ',roc_auc_score(yy,yp))
    print('confusion : \n', confusion_matrix(yy, yp))
    print('\n')

random
accuracy  :  0.8618888888888889
bal accur :  0.812469383573259
f1 score  :  0.7622872442149551
precision :  0.894524236983842
roc accur :  0.812469383573259
confusion : 
 [[5764  235]
 [1008 1993]]


xgb
accuracy  :  0.8711111111111111
bal accur :  0.8427843605390413
f1 score  :  0.7967764540995095
precision :  0.8400443295160694
roc accur :  0.8427843605390414
confusion : 
 [[5566  433]
 [ 727 2274]]


decision
accuracy  :  0.8195555555555556
bal accur :  0.7955350661298153
f1 score  :  0.7277908146161581
precision :  0.7322091062394603
roc accur :  0.7955350661298153
confusion : 
 [[5205  794]
 [ 830 2171]]


logistic
accuracy  :  0.7874444444444444
bal accur :  0.7407233372617529
f1 score  :  0.6532535798441182
precision :  0.7162162162162162
roc accur :  0.7407233372617529
confusion : 
 [[5285  714]
 [1199 1802]]


knn
accuracy  :  0.7184444444444444
bal accur :  0.6555758848845129
f1 score  :  0.525112443778111
precision :  0.6
roc accur :  0.6555758848845129
confusion : 
 