# Data Loading 

In [79]:
# Mount Google Drive so that files under /content/drive can be read below.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
% pip install -U sentence-transformers



In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score , cohen_kappa_score
import gensim
from sentence_transformers import SentenceTransformer, util
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import torch

In [82]:
##Importing functions to evaluate the clusters
from cluster_evaluate import *

In [83]:
## Load the cleaned training data from the mounted Drive folder
train_df = pd.read_csv(
    "/content/drive/MyDrive/Group_project/final_project/Dataset/training_data_cleaned.csv"
)

In [84]:
# Download the NLTK resources (tokenizer data, stop-word corpus), then load
# the English stop-word list.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Functions

In [85]:
##Elbow method for clustering
def optimum_k(max_k, X):
    """Plot the elbow curve (WCSS vs. k) and print the silhouette score for each k.

    For every k in ``range(2, max_k)`` (``max_k`` itself is excluded) one
    MiniBatchKMeans model is fitted on ``X``. The labels and inertia of that
    single fit are used for both the silhouette score and the WCSS value, so
    the model is no longer trained twice per k.

    Parameters
    ----------
    max_k : int
        Exclusive upper bound on the number of clusters to try.
    X : array-like or sparse matrix
        Feature matrix handed to ``MiniBatchKMeans.fit`` and ``silhouette_score``.

    Returns
    -------
    list
        The inertia (within-cluster sum of squares) for each k, in the order tried.
    """
    candidate_ks = range(2, max_k)
    wcss = []
    for k in candidate_ks:
        # random_state is fixed (same seed as min_batch_kmeans) so repeated
        # runs of the elbow search give the same curve.
        kmeans = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=1,
                                 init_size=1000, batch_size=1000, verbose=False,
                                 max_iter=1000, random_state=0)
        kmeans.fit(X)
        print("model finished training")
        # Reuse the labels computed by fit() instead of refitting via fit_predict().
        silhouette_avg = silhouette_score(X, kmeans.labels_)
        print("For n_clusters =", k,
              "The average silhouette_score is :", silhouette_avg)
        wcss.append(kmeans.inertia_)
    plt.plot(candidate_ks, wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()
    return wcss

In [86]:
##getting length of each question
def ques_len(train_df):
    """Return the number of pieces obtained by splitting a question on single spaces.

    Although the parameter is called ``train_df``, the body calls
    ``.split(" ")`` on it, i.e. it is treated as one string. Splitting on a
    literal space means an empty string counts as one piece and consecutive
    spaces produce empty pieces that are counted too.
    """
    return len(train_df.split(" "))

In [87]:
def read_questions(row, column_name):
    """Tokenize one cell of a row with ``gensim.utils.simple_preprocess``.

    The value at ``row[column_name]`` is converted with ``str`` and UTF-8
    encoded before being handed to the tokenizer; the tokenizer's result is
    returned unchanged.
    """
    question_text = str(row[column_name])
    encoded_text = question_text.encode('utf-8')
    return gensim.utils.simple_preprocess(encoded_text)

In [88]:
##Getting the average vector of all the word embeddings in a sentence
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the embedding vectors of the in-vocabulary words of one sentence.

    Parameters
    ----------
    words : iterable
        Tokens of the sentence.
    model : object
        Source of the vectors. If it has a ``wv`` attribute (as a gensim
        Word2Vec model does) vectors are read from ``model.wv[word]``;
        otherwise ``model[word]`` is used, so KeyedVectors-like objects and
        plain mappings keep working.
    vocabulary : container
        Tokens for which a vector is available; membership is tested with ``in``.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        The mean of the vectors of all tokens found in ``vocabulary``
        (repeated tokens count repeatedly). When no token is found the
        float64 zero vector of length ``num_features`` is returned.
    """
    # Indexing the Word2Vec model directly is deprecated in gensim 3.x and
    # not supported in gensim 4; the vectors live on the `.wv` attribute.
    vectors = getattr(model, "wv", model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, vectors[word])

    # Guard against division by zero for sentences with no known token.
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector

   
def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word-vector per tokenized sentence.

    Parameters
    ----------
    corpus : iterable
        Tokenized sentences (each an iterable of tokens).
    model : object
        Model exposing its keyed vectors as ``model.wv``.
    num_features : int
        Dimensionality passed through to ``average_word_vectors``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-sentence vectors returned by
        ``average_word_vectors``, in corpus order.
    """
    keyed_vectors = model.wv
    # gensim >= 4 names the vocabulary list `index_to_key`; gensim 3.x calls
    # it `index2word`. Prefer the new name and fall back to the old one.
    vocab_list = getattr(keyed_vectors, "index_to_key", None)
    if vocab_list is None:
        vocab_list = keyed_vectors.index2word
    vocabulary = set(vocab_list)

    features = [
        average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
        for tokenized_sentence in corpus
    ]
    return np.array(features)



In [89]:
###Kmeans Model
def min_batch_kmeans(features, n_clusters=2, random_state=0):
    """Cluster ``features`` with MiniBatchKMeans and return the cluster label per row.

    Parameters
    ----------
    features : array-like or sparse matrix
        Feature matrix to fit on and to predict labels for.
    n_clusters : int, default 2
        Number of clusters; the default keeps the previous hard-coded value.
    random_state : int, default 0
        Seed for the estimator; the default keeps the previous hard-coded value.

    Returns
    -------
    The result of ``predict(features)`` on the fitted model.
    """
    kmeans_model = MiniBatchKMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=1,
        init_size=1000,
        batch_size=1000,
        verbose=False,
        max_iter=1000,
        random_state=random_state,
    )
    fitted = kmeans_model.fit(features)
    return fitted.predict(features)


# Feature Engineering

**TFIDF**

In [90]:
# Keep only the first 200,000 rows of the training data.
train_df = train_df[:200_000]

In [91]:
## Stack the question1 column on top of the question2 column
q1_2 = pd.concat([train_df["question1"], train_df["question2"]], axis=0)

In [92]:
## Fit a TF-IDF vectorizer (default settings) on both question columns combined
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(q1_2)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [93]:
# Transform each question column with the fitted TF-IDF vectorizer.
tfidf1 = tfidf_vec.transform(train_df["question1"].values)
tfidf2 = tfidf_vec.transform(train_df["question2"].values)

In [94]:
# Element-wise absolute difference between the two TF-IDF matrices.
tfidf_diff = abs(tfidf1 - tfidf2)

In [95]:
## Dimensionality reduction of the TF-IDF difference features to 50 components
# A fixed random_state (same seed as min_batch_kmeans) makes the reduced
# features, and therefore the clustering built on them, reproducible.
svd = TruncatedSVD(n_components=50, random_state=0)
tfidf_diff = svd.fit_transform(tfidf_diff)


**Word2Vec**

In [96]:
## Collect every distinct question appearing in either question column
all_questions = pd.concat(
    [train_df['question1'], train_df['question2']],
    axis=0,
    ignore_index=True,
).unique()

In [97]:
## Tokenize every unique question with gensim's simple_preprocess
documents = [
    gensim.utils.simple_preprocess(str(question).encode('utf-8'))
    for question in all_questions
]

In [98]:
# Number of tokenized unique questions (the corpus used for Word2Vec below).
len(documents)

300720

In [99]:
### Building the Word2Vec model and its vocabulary from the tokenized questions
model = gensim.models.Word2Vec(
    size=300,
    window=10,
    min_count=10,
    sg=1,
    workers=10,
)
model.build_vocab(documents)

In [100]:
### Training the Word2Vec model
# `model.epochs` replaces the deprecated `model.iter` alias (removed in gensim 4).
model.train(sentences=documents, total_examples=len(documents), epochs=model.epochs)
model.corpus_count

  


300720

In [101]:
## Tokenize both question columns of the training frame, row by row
q1, q2 = [], []
for _idx, pair in train_df.iterrows():
    q1.append(read_questions(pair, "question1"))
    q2.append(read_questions(pair, "question2"))

In [102]:
### Averaged Word2Vec feature vectors for the first question, as a DataFrame
w2v_q1 = pd.DataFrame(
    averaged_word_vectorizer(corpus=q1, model=model, num_features=300)
)
w2v_q1.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,-0.001593,-0.038794,0.024677,-0.102564,-0.009039,-0.030368,0.093767,0.113718,0.207849,-0.196626,0.045823,-0.026632,0.057442,0.032764,-0.011779,-0.158647,-0.057552,-0.066583,0.101354,0.109578,-0.074875,0.121459,-0.046221,-0.0243,-0.118533,-0.024265,0.010992,0.09882,0.068026,-0.079588,-0.081914,-0.072802,-0.054317,0.082919,-0.005844,-0.044471,0.00088,-0.102539,0.217388,-0.120905,...,-0.039034,0.121507,0.057002,-0.05627,-0.174667,0.183474,0.152768,0.059856,-0.082761,0.195758,-0.109678,-0.030437,0.051514,0.070257,0.051799,-0.207297,0.044851,0.188086,-0.180106,-0.064535,-0.089577,0.031865,0.100917,0.100174,0.126637,0.018963,-0.092759,-0.046239,0.022988,-0.043553,-0.06736,-0.134838,0.119411,0.136236,-0.076305,0.005758,0.063647,0.027008,-0.072191,0.039403
1,-0.05013,-0.032574,0.040009,-0.070478,0.01496,-0.016264,0.023681,0.099967,0.076933,-0.159544,0.013706,0.01609,0.134411,0.109207,-0.014994,-0.119911,0.035292,0.002763,0.104111,0.055028,-0.074653,0.105047,-0.139367,0.022343,-0.063903,-0.08701,0.017288,0.0157,0.061284,0.017485,-0.018941,-0.041308,-0.127256,0.099617,-0.025741,-0.035583,0.006283,-0.123724,0.003774,0.00772,...,0.066041,0.051264,0.129276,-0.101303,-0.066006,0.130851,0.122017,0.086055,0.015612,0.06001,-0.105352,0.016289,-0.026795,-0.018099,0.035038,-0.144326,-0.026789,0.040198,-0.154622,0.079914,0.008075,0.149712,0.106122,-0.031059,0.169579,0.015825,-0.025265,-0.06244,0.081382,-0.015202,-0.004839,-0.077604,0.079421,0.151545,0.01741,-0.066915,0.152554,-0.013467,-0.192087,-0.07279
2,-0.062681,-0.001649,-0.065317,-0.069271,-0.099975,0.09711,0.013345,0.12743,0.22829,-0.059728,-0.048351,0.116013,0.131984,-0.025756,-0.076281,0.039065,-0.005354,-0.047132,-0.048377,0.003387,-0.055041,0.250358,-0.013902,0.046574,-0.125487,0.020209,0.041472,-0.028559,-0.056496,-0.165246,-0.142798,-0.113334,-0.151397,0.145488,-0.108705,-0.112679,0.052704,-0.065826,0.090639,-0.065085,...,-0.042932,0.148284,0.105365,-0.111888,0.04903,0.109935,0.134586,0.135957,-0.167896,0.118143,0.008826,-0.072475,-0.120845,0.046872,0.161454,-0.316431,-0.135128,0.096826,-0.048556,-0.114577,0.099434,0.101338,0.102425,0.047874,0.045974,-0.023668,0.043726,0.108648,0.108099,-0.01666,-0.020495,0.045533,0.130025,0.111997,0.051208,-0.007325,0.039539,-0.032001,-0.065817,-0.024569
3,-0.087812,0.070189,0.031436,-0.027102,0.003514,-0.019794,0.065559,0.028836,0.013768,-0.045483,0.023931,0.013375,0.347372,-0.056005,0.056589,-0.043076,-0.164977,-0.051106,0.01802,-0.038279,-0.16232,0.274534,-0.0459,-0.036733,-0.152098,0.174856,0.048618,0.005961,-0.049292,0.008365,-0.083567,-0.085566,-0.024253,-0.038932,-0.091624,-0.028706,-0.015328,-0.111382,0.093598,-0.15159,...,-0.054395,0.106451,0.024779,-0.16705,-0.083785,0.158341,0.019498,0.046263,-0.071816,-0.047805,0.102176,-0.136031,-0.050093,-0.018254,0.141036,-0.3759,-0.137143,0.091926,-0.176439,-0.217191,0.093315,0.078429,-0.052731,0.076376,-5.3e-05,0.010634,-0.273593,0.115974,0.028537,-0.011521,-0.059367,0.105842,0.238214,0.175549,0.112976,0.035935,0.063203,-0.022959,-0.059141,-0.002223
4,0.049042,-0.002081,0.05591,0.128214,-0.08219,0.070018,-0.000118,-0.004932,0.19475,-0.013151,-0.058384,0.028796,0.346712,-0.218021,0.065391,0.017004,-0.05015,-0.116234,-0.042886,0.169525,0.100372,0.168263,-0.123891,-0.073979,-0.068271,0.083101,0.078798,0.012989,0.061512,0.078267,0.006305,-0.095767,-0.128447,0.190162,-0.016007,-0.082453,0.023032,-0.020339,-0.053374,-0.190562,...,0.070832,0.030475,0.173003,-0.124245,-0.033331,0.127756,-0.016138,-0.083493,0.031497,0.068579,-0.124849,-0.014028,-0.098488,0.050588,0.145403,-0.234255,-0.110694,0.015078,-0.012617,-0.205967,0.091717,-0.068639,-0.097945,0.127911,0.016807,0.012469,-0.137795,0.039771,0.072352,0.083041,-0.033834,-0.061666,0.195129,0.275832,0.020365,0.015715,0.004197,-0.042233,-0.16459,-0.018169


In [103]:
### Averaged Word2Vec feature vectors for the second question, as a DataFrame
w2v_q2 = pd.DataFrame(
    averaged_word_vectorizer(corpus=q2, model=model, num_features=300)
)
w2v_q2.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,-0.02078,0.000226,0.028596,-0.11315,0.018041,-0.0312,0.103279,0.112188,0.177706,-0.21549,0.05508,-0.06496,0.059817,0.008904,-0.017757,-0.174449,-0.052835,-0.07747,0.128618,0.105752,-0.094884,0.125241,-0.03365,-0.048005,-0.104928,-0.026371,0.039358,0.117997,0.052597,-0.066972,-0.059888,-0.070863,-0.062115,0.096524,0.011578,-0.044455,-0.021363,-0.111664,0.237323,-0.147613,...,-0.070338,0.13096,0.062514,-0.077551,-0.177929,0.189648,0.145972,0.094921,-0.070172,0.232341,-0.119551,-0.020053,0.04035,0.045495,0.057629,-0.201385,0.042488,0.203202,-0.177672,-0.087942,-0.086384,0.04422,0.121775,0.094553,0.130537,0.02822,-0.076818,-0.033109,-0.01691,-0.086736,-0.05482,-0.148757,0.092712,0.154447,-0.048084,-0.021718,0.03966,0.02387,-0.079574,0.01352
1,-0.018421,-0.054055,-0.00959,-0.031233,-0.015305,-0.063024,0.039219,0.068302,0.198152,-0.064593,-0.066106,0.024692,0.180193,0.057497,0.041321,-0.12232,-0.027411,-0.021559,0.122072,0.068078,-0.030081,0.221032,-0.163476,0.032546,-0.106899,-0.050659,0.075521,0.049354,0.136783,-0.029116,-0.054879,-0.060519,-0.024338,0.043632,0.118859,-0.08082,0.059122,-0.052361,0.038442,-0.006723,...,0.039762,0.154458,0.150043,-0.140503,-0.025894,0.166848,0.053585,0.009905,0.035756,0.059033,-0.069275,-0.124656,0.048707,0.021397,0.082955,-0.103757,-0.002217,0.116994,-0.100512,0.00306,-0.037971,0.164385,0.169832,0.002383,0.125062,-0.015336,0.006355,-0.064315,-0.003893,0.03702,-0.102111,-0.070717,0.221541,0.119917,-0.047902,-0.063577,0.123137,0.041174,-0.009667,0.018163
2,-0.034108,-0.002934,0.02014,-0.06632,-0.064315,0.05701,0.107524,0.069644,0.204522,-0.101833,0.005802,0.074387,0.17289,-0.02247,-0.02495,-0.013027,0.055537,-0.108799,-0.034469,0.0539,-0.101268,0.233355,-0.029163,-0.014662,-0.104661,0.046247,0.070235,0.021658,0.052394,-0.032237,-0.061015,-0.15253,-0.074892,0.121794,0.028961,-0.073562,0.105417,-0.110997,0.148358,-0.129712,...,-0.113566,0.171342,0.131497,-0.115382,-0.056529,0.159201,0.20706,0.100875,-0.094255,0.146007,-0.036137,-0.062055,-0.202766,0.052636,0.152947,-0.354267,-0.011583,0.120248,-0.098023,-0.151461,-0.004603,0.105986,0.120886,0.075473,0.047329,0.035175,-0.095878,0.085315,-0.049522,0.043608,-0.055064,0.048344,0.142632,0.134747,-0.030809,-0.044323,0.04944,-0.131361,-0.022852,-0.094697
3,-0.129162,-0.043701,0.266461,0.012439,-0.018922,-0.121805,-0.067603,0.174208,0.208274,-0.225385,-0.006046,-0.072495,0.275671,0.020101,0.005355,-0.055855,-0.04521,-0.027588,-0.03532,0.077095,-0.173976,-0.056131,-0.242168,0.105384,-0.27967,0.168353,0.098826,0.014233,0.132852,0.045832,0.07524,-0.193808,-0.160698,0.026979,-0.097786,-0.057466,0.14932,-0.180378,0.205421,-0.300669,...,0.027099,0.178827,0.32308,-0.166693,-0.062421,0.325875,-0.125612,0.113128,-0.092733,0.081781,0.092503,0.134569,-0.167216,0.041514,0.021099,-0.055928,-0.077506,0.026115,-0.315271,-0.22619,0.142786,0.183226,0.224804,0.073809,-0.003185,-0.108738,-0.005879,0.275592,0.094548,0.048986,-0.075457,-0.101498,0.380525,0.170223,-0.069955,0.014608,0.030732,-0.122876,-0.169868,-0.175261
4,-0.047014,-0.041704,0.076904,0.11948,-0.033833,0.106551,-0.019618,0.036046,0.151472,-0.036999,-0.105446,0.124607,0.375031,-0.158913,0.078993,-0.022829,-0.205966,-0.078922,-0.010842,0.180942,0.156538,0.186525,-0.148085,-0.056993,-0.12161,0.105257,0.057685,0.048716,0.13342,0.025686,-0.081815,-0.107524,-0.091534,0.17388,-0.053981,-0.046464,0.011668,-0.102709,-0.122817,-0.123431,...,0.017206,0.043253,0.09784,-0.157918,-0.025374,0.206137,0.062664,-0.159454,-0.052651,0.069219,0.009159,-0.068053,-0.052622,0.079057,0.155349,-0.215622,-0.042339,0.196823,0.011991,-0.149172,0.039722,-0.130219,-0.041633,0.140603,0.129569,0.016831,-0.141475,-0.048455,0.069644,0.219641,0.032292,-0.092122,0.313925,0.232194,-0.027054,0.028971,0.022303,-0.028875,-0.09671,0.046049


**Word2Vec Differences**

In [104]:
## Element-wise absolute difference between the two questions' Word2Vec features
w2v_diff = (w2v_q1 - w2v_q2).abs()

**Word2Vec Cosine Similarities**

In [105]:
## Convert both Word2Vec feature frames to float32 TensorFlow tensors
w2v_q1_tf = tf.convert_to_tensor(w2v_q1, dtype=tf.float32)
w2v_q2_tf = tf.convert_to_tensor(w2v_q2, dtype=tf.float32)

In [106]:
## Row-wise cosine similarity between the two questions' Word2Vec features
# tf.keras.losses.cosine_similarity is a loss function: it returns the
# *negated* similarity (values near -1 mean most similar). Negate it so that
# cosine_scores really holds similarities, where higher means more similar.
cosine_scores = -tf.keras.losses.cosine_similarity(w2v_q1_tf, w2v_q2_tf)

In [108]:
## Reshape the scores into a single feature column
# -1 lets TensorFlow infer the number of rows, so this keeps working if the
# number of question pairs is no longer exactly 200000.
cosine_scores = tf.reshape(cosine_scores, [-1, 1])

**Sentence Embeddings**

In [109]:
# Raw question texts as arrays, one per question column.
questions1 = train_df["question1"].values
questions2 = train_df["question2"].values

In [110]:
# Name of the pretrained Sentence-Transformers model. It is stored under its
# own name rather than `dir`, which would shadow Python's builtin dir().
sbert_model_name = 'paraphrase-MiniLM-L3-v2'
# NOTE: this rebinds `model`, which held the Word2Vec model in earlier cells.
model = SentenceTransformer(sbert_model_name)  ## Use GPU to accelerate the model encoding

# Compute embeddings for both question lists
embeddings1 = model.encode(questions1, convert_to_tensor=True)
embeddings2 = model.encode(questions2, convert_to_tensor=True)


In [111]:
# Element-wise product of the two sentence embeddings as a float64 NumPy array.
# Going through .cpu().numpy() avoids building a nested Python list of every
# value with .tolist(); astype(float64) keeps the dtype the list round-trip produced.
emb = (embeddings1 * embeddings2).detach().cpu().numpy().astype(np.float64)

# Clustering

In [32]:
#optimum_k(40,x[:50000])

**TFIDF Differences**

In [115]:
## MiniBatch K-means on the SVD-reduced TF-IDF differences
kmeans_clusters_tfidf = min_batch_kmeans(tfidf_diff)

In [116]:
## Percentage of each actual label inside every TF-IDF cluster
get_clusters_label(kmeans_clusters_tfidf, train_df["is_duplicate"])

Unnamed: 0_level_0,Unnamed: 1_level_0,actual_labels
clusters,actual_labels,Unnamed: 2_level_1
0,0,59.55957
0,1,40.44043
1,0,67.765624
1,1,32.234376


In [59]:
#cluster_labels_dist_plot(kmeans_clusters_tfidf,train_df.is_duplicate.values)

In [127]:
# Clustering metrics for the TF-IDF clusters against the is_duplicate labels.
calculate_metrics(train_df["is_duplicate"], kmeans_clusters_tfidf, tfidf_diff)

Homogeneity score:  0.01
Completeness score:  0.01
V-measure score:  0.01
Adjusted rand score:  -0.0
Kappa score:  -0.08
Silhouette score:  0.08
Correlation:  SpearmanrResult(correlation=-0.08276865334075897, pvalue=6.201587980712732e-301)


In [None]:
# from yellowbrick.text import TSNEVisualizer
# tsne = TSNEVisualizer(decompose= None, random_state=123)
# tsne.fit(kmeans_distances, train_df.is_duplicate)
# tsne.show()

**Word2Vec Differences**

In [117]:
## MiniBatch K-means on the Word2Vec absolute differences
kmeans_clusters_w2v_diff = min_batch_kmeans(w2v_diff)

In [118]:
## Percentage of each actual label inside every Word2Vec-difference cluster
get_clusters_label(kmeans_clusters_w2v_diff, train_df["is_duplicate"])

Unnamed: 0_level_0,Unnamed: 1_level_0,actual_labels
clusters,actual_labels,Unnamed: 2_level_1
0,0,79.044528
0,1,20.955472
1,0,53.997201
1,1,46.002799


In [126]:
# Clustering metrics for the Word2Vec-difference clusters against the is_duplicate labels.
calculate_metrics(train_df["is_duplicate"], kmeans_clusters_w2v_diff, w2v_diff)

Homogeneity score:  0.05
Completeness score:  0.05
V-measure score:  0.05
Adjusted rand score:  0.02
Kappa score:  0.21
Silhouette score:  0.2
Correlation:  SpearmanrResult(correlation=0.24705888835317308, pvalue=0.0)


**Word2Vec Cosine Similarity**

In [119]:
## MiniBatch K-means on the single-column Word2Vec cosine scores
kmeans_clusters_w2v_cosine = min_batch_kmeans(cosine_scores)

In [120]:
## Percentage of each actual label inside every Word2Vec-cosine cluster
get_clusters_label(kmeans_clusters_w2v_cosine, train_df["is_duplicate"])

Unnamed: 0_level_0,Unnamed: 1_level_0,actual_labels
clusters,actual_labels,Unnamed: 2_level_1
0,0,55.089185
0,1,44.910815
1,0,91.149839
1,1,8.850161


In [125]:
# Clustering metrics for the Word2Vec-cosine clusters against the is_duplicate labels.
calculate_metrics(train_df["is_duplicate"], kmeans_clusters_w2v_cosine, cosine_scores)

Homogeneity score:  0.08
Completeness score:  0.11
V-measure score:  0.09
Adjusted rand score:  -0.01
Kappa score:  -0.28
Silhouette score:  0.64
Correlation:  SpearmanrResult(correlation=-0.3051773020068975, pvalue=0.0)


**Sentence Embeddings**

In [121]:
## MiniBatch K-means on the element-wise product of the sentence embeddings
kmeans_clusters_emb = min_batch_kmeans(emb)

In [123]:
## Percentage of each actual label inside every sentence-embedding cluster
get_clusters_label(kmeans_clusters_emb, train_df["is_duplicate"])

Unnamed: 0_level_0,Unnamed: 1_level_0,actual_labels
clusters,actual_labels,Unnamed: 2_level_1
0,1,96.8
0,0,3.2
1,0,62.830038
1,1,37.169962


In [124]:
# Clustering metrics for the sentence-embedding clusters against the is_duplicate labels.
calculate_metrics(train_df["is_duplicate"], kmeans_clusters_emb, emb)

Homogeneity score:  0.0
Completeness score:  0.11
V-measure score:  0.0
Adjusted rand score:  0.0
Kappa score:  -0.0
Silhouette score:  0.17
Correlation:  SpearmanrResult(correlation=-0.043580430321308324, pvalue=1.121630903903153e-84)
