# Data Loading

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm, preprocessing 
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# nltk.download('punkt')
# nltk.download('stopwords')
# stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
train_df= pd.read_csv("/content/drive/MyDrive/DEBI/nlp/Group_project/final_project/Dataset/training_data_cleaned.csv")

In [None]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,quest1_len,quest2_len
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,14,12
1,1,3,4,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,0,8,13
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0,14,10
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,0,11,9
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0,13,7


# Text Features

In [None]:
##Concatenate the two questions
q1_2 = pd.concat([train_df.question1, train_df.question2], axis = 0)

In [None]:
##TFIDF Feature Vector
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(q1_2)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
tfidf1 = tfidf_vec.transform(train_df.question1.values)
tfidf2 = tfidf_vec.transform(train_df.question2.values)

In [None]:
tfidf_diff=abs(tfidf1-tfidf2)

In [None]:
##Dimensionality Reduction of Features
svd = TruncatedSVD(n_components=50)
x=svd.fit_transform(tfidf_diff)


# Modeling


In [None]:
# Method contains all the classification algorithms
def estimators(features, labels, estimator):
  X_train, X_test, y_train, y_test = train_test_split(
     features, labels, test_size=0.2, random_state=0)
  if (estimator == 'SVM'):
    model = svm.SVC().fit(X_train, y_train )
    text = 'SVM'
  if (estimator == 'DecisionTree'):
    model = DecisionTreeClassifier(max_depth=10).fit(X_train, y_train)
    text = 'Decision Tree'
  if (estimator == 'KNN'):
    model = KNeighborsClassifier(n_neighbors = 5).fit(X_train, y_train)
    text = 'KNN'
  if (estimator == 'RandomForest'):
    text = 'RandomForest'
    model=RandomForestClassifier(max_depth=15, random_state=0).fit(X_train, y_train)
  if (estimator == "LR"):
    text= "Linear Regression"
    model = LogisticRegression(random_state=0).fit(X_train, y_train)


  train_prediction = model.predict(X_train)
  prediction = model.predict(X_test)
  print(text, " Train Accuracy : ", accuracy_score(y_train,train_prediction)*100)
  print(text, " Test Accuracy : ", accuracy_score(y_test,prediction)*100)
  print("\n\t\tTEST DATA METRICS")
  print(text, " Confusion Matrix: ",confusion_matrix(y_test, prediction))
  print(text, " Report : ")
  print(classification_report(y_test,prediction))



In [None]:
print('Decision Tree on TFiDF')
estimators(x[:200000], train_df.is_duplicate.values[:200000], 'DecisionTree')

Decision Tree on TFiDF
Decision Tree  Train Accuracy :  71.245625
Decision Tree  Test Accuracy :  67.18499999999999

		TEST DATA METRICS
Decision Tree  Confusion Matrix:  [[20888  4179]
 [ 8947  5986]]
Decision Tree  Report : 
              precision    recall  f1-score   support

           0       0.70      0.83      0.76     25067
           1       0.59      0.40      0.48     14933

    accuracy                           0.67     40000
   macro avg       0.64      0.62      0.62     40000
weighted avg       0.66      0.67      0.65     40000



In [None]:
print('Random Forest on TFiDF')
estimators(x[:200000], train_df.is_duplicate.values[:200000], 'RandomForest')

Random Forest on TFiDF
RandomForest  Train Accuracy :  84.606875
RandomForest  Test Accuracy :  71.52

		TEST DATA METRICS
RandomForest  Confusion Matrix:  [[22728  2339]
 [ 9053  5880]]
RandomForest  Report : 
              precision    recall  f1-score   support

           0       0.72      0.91      0.80     25067
           1       0.72      0.39      0.51     14933

    accuracy                           0.72     40000
   macro avg       0.72      0.65      0.65     40000
weighted avg       0.72      0.72      0.69     40000



In [None]:
print('SVM on TFiDF')
estimators(x[:200000], train_df.is_duplicate.values[:200000], 'SVM')

SVM on TFiDF
SVM  Train Accuracy :  75.514375
SVM  Test Accuracy :  72.2275

		TEST DATA METRICS
SVM  Confusion Matrix:  [[21939  3128]
 [ 7981  6952]]
SVM  Report : 
              precision    recall  f1-score   support

           0       0.73      0.88      0.80     25067
           1       0.69      0.47      0.56     14933

    accuracy                           0.72     40000
   macro avg       0.71      0.67      0.68     40000
weighted avg       0.72      0.72      0.71     40000



In [None]:
print('KNN on TFiDF')
estimators(x[:200000], train_df.is_duplicate.values[:200000], 'KNN')

KNN on TFiDF
KNN  Train Accuracy :  79.335
KNN  Test Accuracy :  68.8975

		TEST DATA METRICS
KNN  Confusion Matrix:  [[18710  6357]
 [ 6084  8849]]
KNN  Report : 
              precision    recall  f1-score   support

           0       0.75      0.75      0.75     25067
           1       0.58      0.59      0.59     14933

    accuracy                           0.69     40000
   macro avg       0.67      0.67      0.67     40000
weighted avg       0.69      0.69      0.69     40000

