In [1]:
import opendatasets as od

In [2]:
od.download('https://www.kaggle.com/c/quora-question-pairs/data')

Skipping, found downloaded files in "./quora-question-pairs" (use force=True to force download)


In [3]:
!ls quora-question-pairs/

sample_submission.csv.zip test.csv.zip              train.csv.zip
test.csv                  train.csv


In [37]:
import pandas as pd
import numpy as np
import string

In [5]:
df = pd.read_csv('quora-question-pairs/train.csv', index_col='id')

In [6]:
df.sample(10)


Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
247673,52268,360948,How do you ask someone if they are as serious ...,"What did mean when someone says ""there are two...",0
38734,9739,57817,How do I get rid of scalp acne?,What are the best ways to get rid of acne?,1
403209,19158,213440,How do a calico cat and a tortoiseshell cat di...,What are the difference between tortoiseshell ...,1
128977,207299,207300,Which is the best Area to open cafe in Jaipur?,"Which is the best place (cafés, restaurants) t...",0
295376,52298,6100,How can I earn money part time online?,What is the easiest way to earn money from onl...,1
189370,288104,288105,What is the difference between TTA(JE) and JTO...,Is BSNL JE (or TTA) a good job for a fresher? ...,0
397535,530661,71659,Will Mark Zuckerberg run for political office?,Will Mark Zuckerberg run for president — in 20...,0
139874,222385,222386,"If dinosaurs were reptiles, then how come bird...",Were dinosaurs endothermic like their closest ...,0
206937,310492,310493,"What are the best & innovative strategies, ide...",What are some tips for starting a retail chain...,1
2350,4673,4674,I found these needle cases with Arabic on them...,I'm a translator and want to partner up with p...,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   qid1          404290 non-null  int64 
 1   qid2          404290 non-null  int64 
 2   question1     404289 non-null  object
 3   question2     404288 non-null  object
 4   is_duplicate  404290 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [47]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [63]:
stop_words = set(stopwords.words('english'))
def remove_stopwords_puncs(str_):
    str_ = str_.translate(str.maketrans('', '', string.punctuation)).split()
    return ' '.join([word for word in str_ if word not in stop_words])

def apply_preprocessing(str_):
    # lower
    str_ = str(str_).lower()
    # stopwords
    return remove_stopwords_puncs(str_)


In [64]:
# initially use CountVector
cv = CountVectorizer(max_features=3000)
tf_idf = TfidfVectorizer(max_features=3000)
def return_tf_vector(df1):
    combined_li = list(df1['question1'].apply(apply_preprocessing)) + list(df1['question2'].apply(apply_preprocessing))
    cv_ = cv.fit_transform(combined_li).toarray()
    print(cv_.shape)
    cv_ = np.vsplit(cv_, 2)
    return cv_
    
def return_tf_idf(df1):
    combined_li = list(df1['question1'].apply(apply_preprocessing)) + list(df1['question2'].apply(apply_preprocessing))
    tf_idf_ = tf_idf.fit_transform(combined_li).toarray()
    print(tf_idf_.shape)
    return np.vsplit(tf_idf_, 2)

In [113]:
samp = df.sample(40000)
print(samp.shape)
tf = return_tf_vector(samp)
tf_idf = return_tf_vector(samp)

(40000, 5)
(80000, 3000)
(80000, 3000)


In [114]:
# we will be getting pair of q1 (sample_size, 3000) & q2 (sample_size, 3000) join them on column size
df_tf = pd.DataFrame(np.hstack(tf))
df_tf_idf = pd.DataFrame(np.hstack(tf_idf))

In [115]:
y = samp.iloc[:, -1]

In [116]:
y.shape, df_tf.shape, df_tf_idf.shape

((40000,), (40000, 6000), (40000, 6000))

In [117]:
from sklearn.model_selection import train_test_split

In [118]:
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(df_tf, y)
X_train_idf, X_test_idf, y_train_idf, y_test_idf = train_test_split(df_tf_idf, y)

In [119]:
from sklearn.naive_bayes import MultinomialNB

In [120]:
mnb = MultinomialNB()
mnb_idf = MultinomialNB()

In [121]:
mnb.fit(X_train_tf, y_train_tf)
mnb_idf.fit(X_train_idf, y_train_idf)


In [122]:
y_pred = mnb.predict(X_test_tf)
y_pred_idf = mnb_idf.predict(X_test_idf)

In [123]:
from sklearn.metrics import classification_report, accuracy_score

In [124]:
print(classification_report(y_pred, y_test_tf))
print(accuracy_score(y_pred, y_test_tf))
print(classification_report(y_pred_idf, y_test_idf))
print(accuracy_score(y_pred_idf, y_test_idf))

              precision    recall  f1-score   support

           0       0.80      0.75      0.77      6698
           1       0.55      0.62      0.58      3302

    accuracy                           0.71     10000
   macro avg       0.67      0.68      0.68     10000
weighted avg       0.72      0.71      0.71     10000

0.7055
              precision    recall  f1-score   support

           0       0.80      0.74      0.77      6705
           1       0.54      0.62      0.58      3295

    accuracy                           0.70     10000
   macro avg       0.67      0.68      0.68     10000
weighted avg       0.72      0.70      0.71     10000

0.7032


In [125]:
from sklearn.ensemble import RandomForestClassifier

In [127]:
rfc = RandomForestClassifier()
rfc_idf = RandomForestClassifier()

In [None]:
rfc.fit(X_train_tf, y_train_tf)
rfc_idf.fit(X_train_idf, y_train_idf)

In [None]:
y_pred = rfc.predict(X_test_tf)

In [26]:
print(classification_report(y_pred, y_test))
print(accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.74      0.81      3874
           1       0.45      0.73      0.56      1126

    accuracy                           0.74      5000
   macro avg       0.68      0.74      0.69      5000
weighted avg       0.80      0.74      0.76      5000

0.738


In [27]:
from xgboost import XGBClassifier

In [28]:
xgb_c = XGBClassifier()

In [29]:
xgb_c.fit(X_train, y_train)

In [30]:
y_pred = xgb_c.predict(X_test)

In [31]:
print(classification_report(y_pred, y_test))
print(accuracy_score(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.89      0.73      0.80      3876
           1       0.43      0.70      0.53      1124

    accuracy                           0.72      5000
   macro avg       0.66      0.72      0.67      5000
weighted avg       0.79      0.72      0.74      5000

0.724


In [35]:
lo = 'sai^&^#@#@$@)'


In [36]:
lo.translate(str.maketrans('', '', string.punctuation))

'sai'