### Notebook - Table of Contents

1. [**Importing necessary libraries**](#1.-Importing-necessary-libraries)   
2. [**Loading data using dask**](#2.-Loading-data-using-dask)   
3. [**Basic Data Analysis**](#3.-Basic-Data-Analysis)  
    3.1 [**Checking for class imbalance**](#3.1-Checking-for-class-imbalance)  
    3.2 [**Number of distinct questions**](#3.2-Number-of-distinct-questions)  
4. [**Data preprocessing**](#4.-Data-preprocessing)  
    4.1 [**Checking for duplicates**](#4.1-Checking-for-duplicates)  
    4.2 [**Checking for missing values**](#4.2-Checking-for-missing-values)  
5. [**Basic Feature Extraction**](#5.-Basic-Feature-Extraction)  
    5.1 [**Analysis on few extracted features**](#5.1-Analysis-on-few-extracted-features)  
6. [**Text preprocessing**](#6.-Text-preprocessing)   
    6.1 [**Analysing extracted features**](#6.1-Analysing-extracted-features)  
7. [**Featurization through weighted tf-idf based word vectors**](#7.-Featurization-through-weighted-tf-idf-based-word-vectors) 
8. [**Merging all the extracted features**](#8.-Merging-all-the-extacted-features)
9. [**Machine Learning models**](#9.-Machine-Learning-models)  
    9.1 [**Fitting Logistic Regression**](#9.1-Fitting-Logistic-Regression-model)  

**Additional NOTE**

If you would like to learn more about the importance of feature selection in machine learning, refer to my blog post below.

https://www.analyticsvidhya.com/blog/2020/10/a-comprehensive-guide-to-feature-selection-using-wrapper-methods-in-python/

### 1. Importing necessary libraries

In [None]:
import os
import numpy as np 
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from wordcloud import WordCloud, STOPWORDS
from os import path
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
import spacy

import dask.dataframe as dd
import dask.array as da

### 2. Loading data using dask

In [None]:
train_df = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")
df = dd.from_pandas(train_df, npartitions=5)
df.head()

In [None]:
df.visualize()

### 3. Basic Data Analysis

#### 3.1 Checking for class imbalance

In [None]:
plt.figure(figsize = (10,7))
sns.countplot(df.iloc[:,5].compute())
plt.title("Barplot of is_duplicate")

In [None]:
# len() on a dask DataFrame triggers a computation, so evaluate each count
# once and reuse it instead of recomputing the total for every percentage.
total_pairs = len(df)
similar_pairs = len(df[df["is_duplicate"]==1])
dissimilar_pairs = len(df[df["is_duplicate"]==0])
print("Total number of qustion pairs : ", total_pairs)
print("% of question pairs which are similar : ", (similar_pairs / total_pairs)*100)
print("% of question pairs which are not similar : ", (dissimilar_pairs / total_pairs)*100)

#### 3.2 Number of distinct questions

In [None]:
# Stack both id columns so that every row contributes its two question ids.
# pd.concat replaces Series.append, which mixed a dask Series with a pandas
# Series and no longer exists in pandas 2.0.
appended_series = pd.concat([df['qid1'].compute(), df['qid2'].compute()])
# Occurrence count per question id, computed once and reused below.
qid_counts = appended_series.value_counts()

distinct_qus = appended_series.nunique()
print("Total number of distinct questions : ", distinct_qus)

qus_freq_more_than_one = sum(qid_counts>1)
print("Repeated questions(Number of questions having frequency more than one time) : ", qus_freq_more_than_one)
print("Highest repeat frequency : ", max(qid_counts))

In [None]:
plt.figure(figsize = (10,7))
sns.barplot(["Distinct", "Repeated"], [distinct_qus, qus_freq_more_than_one])
plt.title("Barplot indicating distinct and repeated questions")

### 4. Data preprocessing

#### 4.1 Checking for duplicates

In [None]:
print("Number of duplicate question pairs : ", df[['qid1','qid2']].compute().duplicated().sum())

#### Frequency of each question

In [None]:
plt.figure(figsize = (20,12))
sns.distplot(appended_series.value_counts(),bins = 200, kde = False, color = "blue")
plt.yscale('log', nonposy='clip')

#### 4.2 Checking for missing values

In [None]:
df.isna().compute().sum()

In [None]:
df = df.fillna('')

### 5. Basic Feature Extraction

- freq_qid1 = Frequency of qid1's
- freq_qid2 = Frequency of qid2's
- q1len = Length of q1
- q2len = Length of q2
- q1_n_words = Number of words in Question 1
- q2_n_words = Number of words in Question 2
- word_Common = (Number of common unique words in Question 1 and Question 2)
- word_Total = (Number of unique words in Question 1 + Number of unique words in Question 2)
- word_share = (word_Common)/(word_Total)
- freq_q1+q2 = sum of the frequencies of qid1 and qid2
- freq_q1-q2 = absolute difference of the frequencies of qid1 and qid2

In [None]:
df['freq_qid1'] = df.groupby('qid1')['qid1'].transform('count').compute()
df['freq_qid2'] = df.groupby('qid2')['qid2'].transform('count').compute() 

In [None]:
df['q1len'] = df['question1'].str.len().compute()
df['q2len'] = df['question2'].str.len().compute()

In [None]:
df['q1_n_words'] = df.apply(lambda row: len(row.question1.split(" ")),axis=1).compute()
df['q2_n_words'] = df.apply(lambda row: len(row.question2.split(" ")),axis=1).compute()

In [None]:
def stripped_common_words(row):
    """Count the distinct normalised words shared by both questions of a row.

    Each question is split on single spaces; every token is lower-cased and
    stripped of surrounding whitespace before the comparison.

    Args:
        row: Object exposing ``question1`` and ``question2`` string attributes.

    Returns:
        Size of the shared vocabulary, as a float.
    """
    first_vocab = {token.lower().strip() for token in row.question1.split(" ")}
    second_vocab = {token.lower().strip() for token in row.question2.split(" ")}
    return float(len(first_vocab.intersection(second_vocab)))
df['word_Common'] = df.apply(stripped_common_words, axis=1).compute()

In [None]:
def stripped_word_total(row):
    """Add up the number of distinct normalised words of both questions.

    Each question is split on single spaces; every token is lower-cased and
    stripped of surrounding whitespace, and duplicates within a question are
    counted once.

    Args:
        row: Object exposing ``question1`` and ``question2`` string attributes.

    Returns:
        Sum of the two vocabulary sizes, as a float.
    """
    first_vocab = {token.lower().strip() for token in row.question1.split(" ")}
    second_vocab = {token.lower().strip() for token in row.question2.split(" ")}
    return float(len(first_vocab) + len(second_vocab))
df['word_Total'] = df.apply(stripped_word_total, axis=1).compute()

In [None]:
def stripped_word_share(row):
    """Return the shared-vocabulary count divided by the combined vocabulary size.

    Each question is split on single spaces; every token is lower-cased and
    stripped of surrounding whitespace. ``str.split(" ")`` always yields at
    least one token, so the denominator is never zero.

    Args:
        row: Object exposing ``question1`` and ``question2`` string attributes.

    Returns:
        Number of distinct common words over the sum of both vocabulary sizes.
    """
    first_vocab = {token.lower().strip() for token in row.question1.split(" ")}
    second_vocab = {token.lower().strip() for token in row.question2.split(" ")}
    shared = len(first_vocab.intersection(second_vocab))
    combined = len(first_vocab) + len(second_vocab)
    return shared / combined
df['word_share'] = df.apply(stripped_word_share, axis=1).compute()

In [None]:
# Combine the two lazy dask columns first and materialise the result afterwards.
# Without the parentheses .compute() applied only to the right-hand operand,
# so the sum mixed a dask Series with a pandas Series, unlike the difference.
df['freq_q1+q2'] = (df['freq_qid1']+df['freq_qid2']).compute()
df['freq_q1-q2'] = abs(df['freq_qid1']-df['freq_qid2']).compute()

In [None]:
df.head()

#### 5.1 Analysis on few extracted features

In [None]:
print ("Minimum number of words in question1 : " , df['q1_n_words'].min().compute())
print ("Minimum number of words in question2 : " , df['q2_n_words'].min().compute())
print ("Number of Questions with minimum words [question1] :", len(df[df['q1_n_words']== 1]))
print ("Number of Questions with minimum words [question2] :", len(df[df['q2_n_words']== 1]))

#### 5.1.a Univariate analysis of feature word_share

In [None]:
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(14, 8))
ax1.set_title("Violin plot of word_share across both the duplicacy level")
sns.violinplot(x = df['is_duplicate'].compute(), y = df['word_share'].compute(),ax=ax1)
ax2.set_title("Distribution of word_share across both the duplicacy level")
sns.distplot(df[df['is_duplicate'] == 1.0]['word_share'].compute() , label = "1", ax=ax2)
sns.distplot(df[df['is_duplicate'] == 0.0]['word_share'].compute() , label = "0" , ax=ax2)
plt.show()

#### 5.1.b Univariate analysis of feature word_common

In [None]:
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(14, 8))
ax1.set_title("Violin plot of word_Common across both the duplicacy level")
sns.violinplot(x = df['is_duplicate'].compute(), y = df['word_Common'].compute(),ax=ax1)
ax2.set_title("Distribution of word_Common across both the duplicacy level")
sns.distplot(df[df['is_duplicate'] == 1.0]['word_Common'].compute() , label = "1", color = 'red',ax=ax2)
sns.distplot(df[df['is_duplicate'] == 0.0]['word_Common'].compute() , label = "0" , color = 'blue' ,ax=ax2)

### 6. Text preprocessing

It involves - 
- Removing html tags
- Removing Punctuations
- Removing Stopwords
- Performing stemming
- Expanding contractions etc.

In [None]:
!pip install distance

In [None]:
import distance

In [None]:
# Stored as a set because it is only used for `word in stop_words` checks.
stop_words = set(stopwords.words("english"))

# Literal substitutions, applied in this exact order: specific forms
# ("won't", ",000,000") must be rewritten before the generic ones they
# contain ("n't", ",000").
_TEXT_REPLACEMENTS = (
    (",000,000", "m"), (",000", "k"), ("′", "'"), ("’", "'"),
    ("won't", "will not"), ("cannot", "can not"), ("can't", "can not"),
    ("n't", " not"), ("what's", "what is"), ("it's", "it is"),
    ("'ve", " have"), ("i'm", "i am"), ("'re", " are"),
    ("he's", "he is"), ("she's", "she is"), ("'s", " own"),
    ("%", " percent "), ("₹", " rupee "), ("$", " dollar "),
    ("€", " euro "), ("'ll", " will"),
)
# Built once instead of on every call.
_NON_WORD_PATTERN = re.compile(r"\W")
_PORTER_STEMMER = PorterStemmer()


def text_preprocess(txt):
    """Normalise one question: strip markup, expand contractions, stem.

    Steps, in order:
      1. Convert to ``str``, drop HTML markup and lower-case the text.
      2. Apply the literal replacements (contractions, currency/percent
         symbols, ",000" style number suffixes).
      3. Abbreviate trailing runs of zeros in numbers ("000000" -> "m",
         "000" -> "k").
      4. Replace every non-word character with a space.
      5. Stem each whitespace-separated token and re-join with single spaces.

    Args:
        txt: The raw question; any object is accepted and passed through ``str``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    # Markup has to be removed before punctuation is stripped: once "<" and ">"
    # have been turned into spaces the parser can no longer recognise a tag.
    txt = BeautifulSoup(str(txt), "html.parser").get_text().lower()
    for old, new in _TEXT_REPLACEMENTS:
        txt = txt.replace(old, new)
    txt = re.sub(r"([0-9]+)000000", r"\1m", txt)
    txt = re.sub(r"([0-9]+)000", r"\1k", txt)
    txt = _NON_WORD_PATTERN.sub(" ", txt)
    # stem() takes a single word, so the sentence is stemmed token by token
    # rather than being passed in as one long string.
    return " ".join(_PORTER_STEMMER.stem(token) for token in txt.split())

In [None]:
# Small constant added to every denominator so that a ratio never divides by zero.
safe_div = 0.0001


def fetch_token_features(q1, q2):
    """Compute ten token-overlap features for a pair of questions.

    Both questions are split on whitespace. If either has no tokens, ten
    zeros are returned. Otherwise the result holds, in order:

      0. common non-stop words / smaller non-stop vocabulary
      1. common non-stop words / larger non-stop vocabulary
      2. common stop words / smaller stop-word vocabulary
      3. common stop words / larger stop-word vocabulary
      4. common distinct tokens / shorter token count
      5. common distinct tokens / longer token count
      6. 1 if the last tokens are equal, else 0
      7. 1 if the first tokens are equal, else 0
      8. absolute difference of the token counts
      9. mean of the token counts

    Args:
        q1: First question text.
        q2: Second question text.

    Returns:
        A list of ten numbers as described above.
    """
    tokens_a = q1.split()
    tokens_b = q2.split()
    if not tokens_a or not tokens_b:
        return [0.0] * 10

    # Split each question's vocabulary into content words and stop words.
    content_a = {tok for tok in tokens_a if tok not in stop_words}
    content_b = {tok for tok in tokens_b if tok not in stop_words}
    stops_a = {tok for tok in tokens_a if tok in stop_words}
    stops_b = {tok for tok in tokens_b if tok in stop_words}

    shared_content = len(content_a & content_b)
    shared_stops = len(stops_a & stops_b)
    shared_tokens = len(set(tokens_a) & set(tokens_b))

    # Token ratios use the raw token counts (duplicates included) as denominator.
    count_a = len(tokens_a)
    count_b = len(tokens_b)

    return [
        shared_content / (min(len(content_a), len(content_b)) + safe_div),
        shared_content / (max(len(content_a), len(content_b)) + safe_div),
        shared_stops / (min(len(stops_a), len(stops_b)) + safe_div),
        shared_stops / (max(len(stops_a), len(stops_b)) + safe_div),
        shared_tokens / (min(count_a, count_b) + safe_div),
        shared_tokens / (max(count_a, count_b) + safe_div),
        int(tokens_a[-1] == tokens_b[-1]),
        int(tokens_a[0] == tokens_b[0]),
        abs(count_a - count_b),
        (count_a + count_b) / 2,
    ]

In [None]:
# Length of the longest common substring relative to the shorter input.
def fetch_longest_substr_ratio(a, b):
    """Return the longest-common-substring length over ``min(len(a), len(b)) + 1``.

    Args:
        a: First string.
        b: Second string.

    Returns:
        0 when ``distance.lcsubstrings`` yields nothing, otherwise the length
        of the first substring it yields divided by one more than the length
        of the shorter input.
    """
    common = list(distance.lcsubstrings(a, b))
    if not common:
        return 0
    shorter_len = min(len(a), len(b))
    return len(common[0]) / (shorter_len + 1)

In [None]:
def extract_features(df):
    """Clean both question columns and attach token, fuzzy and substring features.

    Args:
        df: Dask DataFrame with ``question1`` and ``question2`` text columns.

    Returns:
        The same DataFrame, with both question columns replaced by their
        preprocessed text and these columns added: ``cwc_min``, ``cwc_max``,
        ``csc_min``, ``csc_max``, ``ctc_min``, ``ctc_max``, ``last_word_eq``,
        ``first_word_eq``, ``abs_len_diff``, ``mean_len``, ``token_set_ratio``,
        ``token_sort_ratio``, ``fuzz_ratio``, ``fuzz_partial_ratio`` and
        ``longest_substr_ratio``.
    """
    # preprocessing each question
    df["question1"] = df["question1"].apply(text_preprocess).compute()
    df["question2"] = df["question2"].apply(text_preprocess).compute()

    # One list of ten values per row, in the order fetch_token_features emits them.
    token_features = df.apply(lambda row: fetch_token_features(row.question1, row.question2), axis=1).compute()
    token_feature_names = [
        "cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min", "ctc_max",
        "last_word_eq", "first_word_eq", "abs_len_diff", "mean_len",
    ]
    for position, column in enumerate(token_feature_names):
        # Assign the column directly. The earlier df.merge(...) calls returned
        # a new frame that was thrown away, so none of these features ever
        # reached the result. `position=position` binds the current index to
        # the lambda.
        df[column] = token_features.map(lambda feats, position=position: feats[position])

    # String-similarity scores computed on the cleaned question pair.
    df["token_set_ratio"] = df.apply(lambda row: fuzz.token_set_ratio(row.question1, row.question2), axis=1).compute()
    df["token_sort_ratio"] = df.apply(lambda row: fuzz.token_sort_ratio(row.question1, row.question2), axis=1).compute()
    df["fuzz_ratio"] = df.apply(lambda row: fuzz.QRatio(row.question1, row.question2), axis=1).compute()
    df["fuzz_partial_ratio"] = df.apply(lambda row: fuzz.partial_ratio(row.question1, row.question2), axis=1).compute()
    df["longest_substr_ratio"]  = df.apply(lambda row: fetch_longest_substr_ratio(row.question1, row.question2), axis=1).compute()
    return df

In [None]:
dff = dd.from_pandas(train_df, npartitions=5)
dff.head()

In [None]:
df = extract_features(df)

In [None]:
df.head(2)

### 6.1 Analysing extracted features 

#### 6.1.a Word cloud formation

In [None]:
df_duplicate = df[df['is_duplicate'] == 1].compute()
df_nonduplicate = df[df['is_duplicate'] == 0].compute()

In [None]:
np.dstack([df_duplicate["question1"], df_duplicate["question2"]]).flatten()

In [None]:
df_duplicate = df[df['is_duplicate'] == 1].compute()
df_nonduplicate = df[df['is_duplicate'] == 0].compute()

duplicate_flatten = np.dstack([df_duplicate["question1"], df_duplicate["question2"]]).flatten()
nonduplicate_flatten = np.dstack([df_nonduplicate["question1"], df_nonduplicate["question2"]]).flatten()
print ("Number of questions in duplicate pairs set(class 1) : ",duplicate_flatten.shape[0])
print ("Number of questions in non-duplicate pairs set(class 0) : ",nonduplicate_flatten.shape[0])

In [None]:
os.chdir("/kaggle/working/")
np.savetxt('train_duplicate.txt', duplicate_flatten, delimiter=' ', fmt='%s')
np.savetxt('train_nonduplicate.txt', nonduplicate_flatten, delimiter=' ', fmt='%s')
#Reading the text files
# Context managers close the files as soon as their content has been read.
with open(path.join("/kaggle/working/", 'train_duplicate.txt')) as duplicate_file:
    duplicate_w = duplicate_file.read()
with open(path.join("/kaggle/working/", 'train_nonduplicate.txt')) as nonduplicate_file:
    nonduplicate_w = nonduplicate_file.read()
# len() of the raw text counts characters; split on whitespace to count words.
print ("Total number of words in duplicate pair set :",len(duplicate_w.split()))
print ("Total number of words in non duplicate pair set :",len(nonduplicate_w.split()))

In [None]:
stop_words = set(STOPWORDS)
stop_words.add("said")
stop_words.add("br")
stop_words.add(" ")
stop_words.remove("not")
stop_words.remove("no")
stop_words.remove("like")

In [None]:
wc = WordCloud(background_color="white", max_words=len(duplicate_w), stopwords=stop_words)
wc.generate(duplicate_w)
plt.figure(figsize =(10,8))
plt.title("Word cloud for duplicate Question pairs")
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")

In [None]:
wc = WordCloud(background_color="white", max_words=len(nonduplicate_w), stopwords=stop_words)
wc.generate(nonduplicate_w)
plt.figure(figsize =(10,8))
plt.title("Word cloud for non-duplicate Question pairs")
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")

#### 6.1.b Distribution of the token_sort_ratio 

In [None]:
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(14, 8))
# Titles spell the plotted column name correctly (token_sort_ratio).
ax1.set_title("Violin plot of token_sort_ratio across both the duplicacy level")
sns.violinplot(x = df['is_duplicate'].compute(), y = df['token_sort_ratio'].compute(), ax=ax1)
ax2.set_title("Distribution of token_sort_ratio across both the duplicacy level")
sns.distplot(df[df['is_duplicate'] == 1.0]['token_sort_ratio'].compute() , label = "1",ax=ax2)
sns.distplot(df[df['is_duplicate'] == 0.0]['token_sort_ratio'].compute() , label = "0" , ax=ax2)
# The per-class labels are only drawn once a legend is requested.
ax2.legend()

#### 6.1.c Distribution of the fuzz_ratio 

In [None]:
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(14, 8))
# Titles name the feature plotted in this cell (fuzz_ratio); they used to
# repeat the token_sort_ratio titles.
ax1.set_title("Violin plot of fuzz_ratio across both the duplicacy level")
sns.violinplot(x = df['is_duplicate'].compute(), y = df['fuzz_ratio'].compute(), ax=ax1)
ax2.set_title("Distribution of fuzz_ratio across both the duplicacy level")
sns.distplot(df[df['is_duplicate'] == 1.0]['fuzz_ratio'].compute() , label = "1",ax=ax2)
sns.distplot(df[df['is_duplicate'] == 0.0]['fuzz_ratio'].compute() , label = "0" , ax=ax2)
# The per-class labels are only drawn once a legend is requested.
ax2.legend()

### 7. Featurization through weighted tf-idf based word vectors

In [None]:
ddf = dd.from_pandas(train_df, npartitions=5)

In [None]:
ddf['question1'] = ddf.apply(lambda row: str(row.question1), axis=1).compute()
ddf['question2'] = ddf.apply(lambda row: str(row.question2), axis=1).compute()

In [None]:
merge_questions = list(ddf['question1'].compute()) + list(ddf['question2'].compute())
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(merge_questions)

In [None]:
word_to_idf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [None]:
w2v = spacy.load('en_core_web_sm')

In [None]:
# Build one idf-weighted vector per question1 text.
w2v_vec_q1 = []
for qus1 in tqdm(list(ddf['question1'].compute())):
    doc_q1 = w2v(qus1)
    # Accumulate into a single 1-D buffer. The earlier (n_tokens x dim) buffer
    # received the same broadcast update in every row, so its column mean was
    # just this idf-weighted sum of the token vectors.
    mean_vec_q1 = np.zeros(len(doc_q1[0].vector))
    for word in doc_q1:
        # Tokens missing from the TF-IDF vocabulary get weight 0.
        idf = word_to_idf.get(str(word), 0)
        mean_vec_q1 += word.vector * idf
    w2v_vec_q1.append(mean_vec_q1)
q1_feats_m = dd.from_array(np.array(list(w2v_vec_q1))).compute()

In [None]:
# Build one idf-weighted vector per question2 text.
w2v_vec_q2 = []
for qus2 in tqdm(list(ddf['question2'].compute())):
    doc_q2 = w2v(qus2)
    # Accumulate into a single 1-D buffer. The earlier (n_tokens x dim) buffer
    # received the same broadcast update in every row, so its column mean was
    # just this idf-weighted sum of the token vectors.
    mean_vec_q2 = np.zeros(len(doc_q2[0].vector))
    for word in doc_q2:
        # Tokens missing from the TF-IDF vocabulary get weight 0.
        idf = word_to_idf.get(str(word), 0)
        mean_vec_q2 += word.vector * idf
    w2v_vec_q2.append(mean_vec_q2)
q2_feats_m = dd.from_array(np.array(list(w2v_vec_q2))).compute()

### 8. Merging all the extacted features

In [None]:
q1_feats_m["id"] = df["id"]
q2_feats_m["id"] = df["id"]
df_q = q1_feats_m.merge(q2_feats_m,on ="id",how = "left")

In [None]:
df = df.drop(["qid1", "qid2", "question1","question2"], axis=1).compute()
df_final = df.merge(df_q,on ="id",how = "left")

In [None]:
df_final = dd.from_pandas(df_final, npartitions=5)
df_final.head()

### Splitting into train and test set with 70:30 ratio

In [None]:
from dask_ml.model_selection import train_test_split

In [None]:
y = df_final["is_duplicate"].compute()
df_final = df_final.drop(['id', 'is_duplicate'],axis=1).compute()
X_train,X_test, y_train, y_test = train_test_split(df_final, y, test_size=0.3,random_state = 42)

In [None]:
print("Training data size :",X_train.shape)
print("Test data size :",X_test.shape)

### 9. Machine Learning models

#### 9.1 Fitting Logistic Regression model 


In [None]:
from dask_ml.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state = 42)
clf.fit(X_train.values, y_train.values)

In [None]:
from dask_ml.metrics import accuracy_score, log_loss
y_pred = clf.predict_proba(X_test.values)
print("Log loss of the model : ", log_loss(y_test, y_pred))

In [None]:
y_pred = clf.predict(X_test.values)
print("Accuracy of the model : ", accuracy_score(da.from_array(y_test, chunks = 5),da.from_array(y_pred, chunks = 5)))