In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup

In [24]:
df = pd.read_csv('train.csv')

In [25]:
df.shape

(404290, 6)

In [26]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [27]:
new_df = df.sample(50000)
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [28]:
new_df.duplicated().sum()

0

In [29]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')
    
    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')
    
    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)
    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    
    return q

In [30]:
new_df['question1'] = new_df['question1'].apply(preprocess)
new_df['question2'] = new_df['question2'].apply(preprocess)

  q = BeautifulSoup(q)


In [31]:
new_df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
39640,39640,71841,71842,how do you get netflix on directv,how can i get netflix for free without a credi...,0
64756,64756,112480,11786,is there any marathi speech to text,how do i disable voice data for text to speec...,0
190655,190655,43241,222441,what is displaystyle lim_ x to 0 frac x tan...,what is the value of lim_ x to infty frac...,0
272449,272449,197001,2646,what is the smoothest pick up line you have ev...,what are the best pick up lines,0
16320,16320,31120,31121,how is donald trump a better choice than hilla...,who will be the better president trump or clinton,1


In [32]:
new_df['q1_len'] = new_df['question1'].str.len() 
new_df['q2_len'] = new_df['question2'].str.len()

In [33]:
new_df['q1_num_words'] = new_df['question1'].apply(lambda row: len(row.split(" ")))
new_df['q2_num_words'] = new_df['question2'].apply(lambda row: len(row.split(" ")))

In [46]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return len(w1 & w2)

In [48]:
new_df['word_common'] = new_df.apply(common_words, axis=1)
new_df.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common
39640,39640,71841,71842,how do you get netflix on directv,how can i get netflix for free without a credi...,0,33,52,7,11,3
64756,64756,112480,11786,is there any marathi speech to text,how do i disable voice data for text to speec...,0,35,72,7,15,3
190655,190655,43241,222441,what is displaystyle lim_ x to 0 frac x tan...,what is the value of lim_ x to infty frac...,0,67,64,24,24,9
272449,272449,197001,2646,what is the smoothest pick up line you have ev...,what are the best pick up lines,0,54,31,11,7,4
16320,16320,31120,31121,how is donald trump a better choice than hilla...,who will be the better president trump or clinton,1,56,49,10,9,3
191694,191694,291103,291104,is it a fact that from 2017 onwards du is goi...,is it a fact that from 2016 onwards du is goin...,0,108,95,23,19,17
156400,156400,244784,2967,i have a cooking pot that is made from 100 per...,how can i make a healthy home made pizza,0,76,40,17,9,5
363041,363041,492984,340193,is there any way to watch turkish tv show with...,where can we watch tv shows with english subti...,1,64,50,12,9,5
347063,347063,475487,475488,i just turned 21 and i am tired of being fat ...,i am a 14 year old male that is tired of being...,1,85,80,19,19,9
176413,176413,271429,271430,what are the basic knowledge required while st...,what is the basic requirement of accounting kn...,0,119,53,17,8,4


In [49]:
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))    
    return (len(w1) + len(w2))

In [50]:
new_df['word_total'] = new_df.apply(total_words, axis=1)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total
39640,39640,71841,71842,how do you get netflix on directv,how can i get netflix for free without a credi...,0,33,52,7,11,3,18
64756,64756,112480,11786,is there any marathi speech to text,how do i disable voice data for text to speec...,0,35,72,7,15,3,21
190655,190655,43241,222441,what is displaystyle lim_ x to 0 frac x tan...,what is the value of lim_ x to infty frac...,0,67,64,24,24,9,25
272449,272449,197001,2646,what is the smoothest pick up line you have ev...,what are the best pick up lines,0,54,31,11,7,4,18
16320,16320,31120,31121,how is donald trump a better choice than hilla...,who will be the better president trump or clinton,1,56,49,10,9,3,19


In [52]:
new_df['word_share'] = round(new_df['word_common']/new_df['word_total'],2)
new_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share
39640,39640,71841,71842,how do you get netflix on directv,how can i get netflix for free without a credi...,0,33,52,7,11,3,18,0.17
64756,64756,112480,11786,is there any marathi speech to text,how do i disable voice data for text to speec...,0,35,72,7,15,3,21,0.14
190655,190655,43241,222441,what is displaystyle lim_ x to 0 frac x tan...,what is the value of lim_ x to infty frac...,0,67,64,24,24,9,25,0.36
272449,272449,197001,2646,what is the smoothest pick up line you have ev...,what are the best pick up lines,0,54,31,11,7,4,18,0.22
16320,16320,31120,31121,how is donald trump a better choice than hilla...,who will be the better president trump or clinton,1,56,49,10,9,3,19,0.16


In [36]:
ques_df = new_df[['question1','question2']]

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(ques_df['question1']) + list(ques_df['question2'])
cv = CountVectorizer(max_features=3000)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [38]:
temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(50000, 6000)

In [39]:
temp_df['is_duplicate'] = new_df['is_duplicate']

In [40]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,is_duplicate
39640,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190655,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272449,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16320,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [41]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(temp_df.iloc[:,0:-1].values,temp_df.iloc[:,-1].values,test_size=0.2,random_state=1)

In [42]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.7518