# Kaggle Quora Challenge

This is a project by Seth Rabin and Mukul Ram.

The aim is to determine whether two questions on Quora possess similar intent.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data_dir = '../../data/'

In [3]:
# Load the labelled question pairs, using the `id` column as the row index,
# and preview the first rows.
train = pd.read_csv(data_dir + 'train.csv', index_col='id')
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# Load the question pairs to be scored, using the `test_id` column as the row
# index, and preview the first rows.
test = pd.read_csv(data_dir + 'test.csv', index_col='test_id')
test.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,What but is the best way to send money from Ch...,What you send money to China?
3,Which food not emulsifiers?,What foods fibre?
4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [5]:
train.is_duplicate.value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [6]:
train.iloc[0].question1

'What is the step by step guide to invest in share market in india?'

In [7]:
train.iloc[0].question2

'What is the step by step guide to invest in share market?'

In [8]:
train.iloc[0].is_duplicate

0

In [9]:
train.iloc[1].question1

'What is the story of Kohinoor (Koh-i-Noor) Diamond?'

In [10]:
train.iloc[1].question2

'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'

In [11]:
train.iloc[1].is_duplicate

0

## Exploratory Data Analysis

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [13]:
# Drop every row that contains a missing value (the summary above reports two
# fewer non-null `question2` entries than rows), then re-check the column counts.
train = train.dropna()
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404288 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2345796 entries, 0 to 2345795
Data columns (total 2 columns):
question1    object
question2    object
dtypes: object(2)
memory usage: 53.7+ MB


## Feature Engineering

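The features I plan to engineer are:

- `process1` / `process2`: the distinct, lemmatized tokens of each question after removing English stop words and punctuation.
- `unique`: 1 if the two processed token sets differ, 0 if they are identical.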

In [15]:
import nltk
from tqdm import tqdm
import string
from nltk.corpus import stopwords

tqdm.pandas(desc='progress-bar')

In [16]:
dummy1 = train.iloc[0].question1
dummy2 = train.iloc[0].question2

In [17]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _process_resources():
    """Build the NLTK helpers used by `process` once and reuse them on every call.

    Returns
    -------
    tuple
        ``(word_tokenize, stop_words, lemmatizer)`` where ``stop_words`` is a
        frozenset of NLTK's English stop words plus ASCII punctuation.
    """
    # Imported lazily so that `process` can reject empty/missing input without
    # touching NLTK at all.
    import string
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer

    stop_words = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
    return word_tokenize, stop_words, WordNetLemmatizer()


def process(x):
    """Tokenize a question and return its distinct, lemmatized content words.

    Parameters
    ----------
    x : str
        Raw question text. Anything that is not a non-empty string (``None``,
        ``''``, or the float NaN pandas uses for a missing value) is treated
        as "no text".

    Returns
    -------
    list of str
        Lemmatized tokens with English stop words and punctuation removed.
        Tokens are de-duplicated before lemmatization and kept in first-seen
        order; two different tokens that share a lemma can still both appear.
        Returns ``[]`` when there is no text.
    """
    # Guard before anything else: NaN is truthy, so a plain `if x:` would let a
    # missing question through to the tokenizer, which only accepts strings.
    if not isinstance(x, str) or not x:
        return []

    word_tokenize, stop_words, lemma = _process_resources()

    # dict.fromkeys de-duplicates like a set but keeps a deterministic order.
    distinct_tokens = dict.fromkeys(word_tokenize(x))

    # NLTK's stop-word list is lowercase, so compare case-insensitively;
    # otherwise sentence-initial words such as "What" or "How" are never removed.
    content_tokens = [tok for tok in distinct_tokens if tok.lower() not in stop_words]

    return [lemma.lemmatize(tok) for tok in content_tokens]

In [18]:
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [19]:
# Run `process` over each question column of the training frame and store the
# resulting token lists next to the raw text; `progress_apply` shows a tqdm bar.
train['process1'] = train['question1'].progress_apply(process)
train['process2'] = train['question2'].progress_apply(process)

progress-bar:   3%|▎         | 11720/404288 [00:05<02:21, 2774.71it/s]

KeyboardInterrupt: 

          progress-bar:   3%|▎         | 11845/404288 [00:20<02:21, 2774.71it/s]

In [None]:
# Same preprocessing as for the training frame, applied to the test questions.
test['process1'] = test['question1'].progress_apply(process)
test['process2'] = test['question2'].progress_apply(process)

In [None]:
def create_unique(row):
    """Flag whether the two processed questions in `row` use different words.

    Parameters
    ----------
    row : object with `process1` and `process2` attributes
        Each attribute is an iterable of tokens (e.g. a DataFrame row).

    Returns
    -------
    int
        1 if at least one token occurs in only one of the two questions,
        0 if both questions contain exactly the same set of tokens.
    """
    first_words = set(row.process1)
    second_words = set(row.process2)

    # The symmetric difference is non-empty exactly when either one-sided
    # difference is non-empty.
    return int(bool(first_words ^ second_words))

In [None]:
# Create the column with bracket assignment. Attribute-style assignment
# (`train.unique = ...`) only updates a column that already exists -- otherwise
# it silently sets a plain Python attribute -- which is why a placeholder column
# of zeros used to be created first. Bracket assignment needs no placeholder.
train['unique'] = train.progress_apply(create_unique, axis=1)
train.head()

In [None]:
# Create the column with bracket assignment. Attribute-style assignment
# (`test.unique = ...`) only updates a column that already exists -- otherwise
# it silently sets a plain Python attribute -- which is why a placeholder column
# of zeros used to be created first. Bracket assignment needs no placeholder.
test['unique'] = test.progress_apply(create_unique, axis=1)
test.head()