In [74]:
#Imports
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [75]:
#Load the dataset
df= pd.read_csv('data/quora-question-pairs/train.csv.zip')

In [76]:
#Dataset Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [77]:
#Dataset head
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [78]:
#SubSampling
df = df.sample(5000)

In [79]:
#Removing the na values
df.dropna(inplace=True)

In [80]:
#Function for preprocessing text
def preprocess(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove other symbols
    #text = re.sub(r'\d+', '', text)
    
    # Remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    #Tokenize text into words
    words = nltk.word_tokenize(text)
    
    #Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    #Lemmatize words using WordNet
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    #Join words
    text = ' '.join(words)

    return text

In [81]:
#INitializing the TFIDF vectorizer 
vectorizer = TfidfVectorizer(analyzer = 'word',
                        stop_words = 'english',
                        lowercase = True,
                        max_features = 300,
                        norm = 'l1')

In [82]:
#Function to transform text
def T2V(text):
    z= vectorizer.transform([text])
    return z.toarray()

In [83]:
#Applying the preprocessing on text
df['question1'] = df['question1'].apply(preprocess)
df['question2'] = df['question2'].apply(preprocess)

In [84]:
#Concating the textual data 
BagOfWords = pd.concat([df.question1, df.question2], axis = 0)

In [85]:
#Fitting the Vectorizer
vectorizer.fit(BagOfWords)

TfidfVectorizer(max_features=300, norm='l1', stop_words='english')

In [86]:
#Transforming the text
train_q1_tfidf = vectorizer.transform(df.question1)
train_q2_tfidf = vectorizer.transform(df.question2)

In [87]:
#Defining X and Y data sets
X = abs(train_q1_tfidf - train_q2_tfidf)
y = df['is_duplicate']

In [88]:
#Train and Test Splits
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=66)

In [89]:
#Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [90]:
#Evaluation
pred_lr = lr.predict_proba(X_test)[:,1]
logloss_lr = log_loss(y_test, pred_lr)
logloss_lr

0.6157023456752038

In [91]:
#Random Forest
rf = RandomForestClassifier(n_estimators = 200,min_samples_leaf = 10,n_jobs = -1)
rf.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=10, n_estimators=200, n_jobs=-1)

In [92]:
#Evaluation
pred_rf = rf.predict_proba(X_test)[:,1]
logloss_rf = log_loss(y_test, pred_rf)
logloss_rf

0.620994786831478

## TEST :

In [134]:
qu01 = "Do you believe there is life after death?"
qu02 = "Is it true that there is life after death?"

In [135]:
textqu01 = T2V(qu01)
textqu02 = T2V(qu02)
x_ex = abs(textqu01+textqu02)

In [136]:
lr.predict(x_ex)

array([1])