# Kaggle Quora Challenge

by Seth Rabin and Mukul Ramm

The goal is to determine whether two Quora questions have the same intent, i.e. whether they are duplicates of each other.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data_dir = '../../data/'

In [3]:
# Labelled training pairs, indexed by the CSV's `id` column.
train = pd.read_csv('{}train.csv'.format(data_dir), index_col='id')
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# Unlabelled question pairs to score, indexed by the CSV's `test_id` column.
test = pd.read_csv('{}test.csv'.format(data_dir), index_col='test_id')
test.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,What but is the best way to send money from Ch...,What you send money to China?
3,Which food not emulsifiers?,What foods fibre?
4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [5]:
train.iloc[0]

qid1                                                            1
qid2                                                            2
question1       What is the step by step guide to invest in sh...
question2       What is the step by step guide to invest in sh...
is_duplicate                                                    0
Name: 0, dtype: object

In [6]:
train.iloc[0].question1

'What is the step by step guide to invest in share market in india?'

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [8]:
# train.info() above reports 404288 non-null question2 values out of 404290
# rows, so this removes the two pairs that have a missing question.
train = train.dropna(axis=0, how='any')

In [9]:
# Missing test questions become empty strings instead of being dropped, so the
# number of rows in `test` is unchanged. No `axis` is passed: it has no effect
# on the result of a scalar fill.
test = test.fillna('')

In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2345796 entries, 0 to 2345795
Data columns (total 2 columns):
question1    object
question2    object
dtypes: object(2)
memory usage: 53.7+ MB


# Feature Engineering

The features we plan to engineer are:

1. Frequency of words
2. Synonyms
3. Similar words
4. Unique words
5. Part-of-speech tagging



In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from tqdm import tqdm

tqdm.pandas(desc='progress-bar')

In [12]:
dummy = train.iloc[0].question1

In [13]:
dummy1 = train.iloc[0].question1

In [14]:
dummy2 = train.iloc[0].question2

In [15]:
# Stop-word set and lemmatizer, built on the first call that needs them and
# reused afterwards so they are not rebuilt for every row.
_PROCESS_RESOURCES = {}


def process(x):
    """Turn a question string into a list of lemmatized content words.

    The text is lower-cased and tokenized, duplicate tokens are dropped,
    English stop words and punctuation tokens are removed, and each remaining
    token is lemmatized with WordNet.

    Parameters
    ----------
    x : str
        The question text. Anything that is not a non-empty string
        (``''``, ``None``, NaN, ...) is treated as "no text".

    Returns
    -------
    list of str
        Lemmatized tokens in arbitrary order (they pass through a set).
        Two distinct tokens can lemmatize to the same word, so the list may
        contain repeats. Returns ``[]`` when there is no text.
    """
    # Guard first: missing values have no tokens, and NaN is truthy, so a
    # plain truthiness check would let it through to `.lower()`.
    if not isinstance(x, str) or not x:
        return []

    import string
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer

    if not _PROCESS_RESOURCES:
        _PROCESS_RESOURCES['stop_words'] = (
            set(stopwords.words('english')) | set(string.punctuation)
        )
        _PROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()

    stop_words = _PROCESS_RESOURCES['stop_words']
    lemmatizer = _PROCESS_RESOURCES['lemmatizer']

    remaining = set(word_tokenize(x.lower())) - stop_words

    return [lemmatizer.lemmatize(word) for word in remaining]


In [16]:
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [17]:
# Tokenize both question columns of the training set with process().
# In the recorded run each column took a little over three minutes.
train['process1'] = train['question1'].progress_apply(process)
train['process2'] = train['question2'].progress_apply(process)

progress-bar: 100%|██████████████████████████████████████████| 404288/404288 [03:14<00:00, 2083.94it/s]
progress-bar: 100%|██████████████████████████████████████████| 404288/404288 [03:12<00:00, 2102.91it/s]


In [18]:
# Tokenize both question columns of the test set with process().
# In the recorded run each column took roughly nineteen minutes.
test['process1'] = test['question1'].progress_apply(process)
test['process2'] = test['question2'].progress_apply(process)

progress-bar: 100%|████████████████████████████████████████| 2345796/2345796 [19:09<00:00, 2040.36it/s]
progress-bar: 100%|████████████████████████████████████████| 2345796/2345796 [19:32<00:00, 2000.28it/s]


In [19]:
def _question_tokens(row, cached_col, text_col):
    """Return the row's precomputed token list if it has one, else tokenize the text now."""
    cached = getattr(row, cached_col, None)
    if isinstance(cached, (list, tuple, set)):
        return cached
    return process(getattr(row, text_col))


def create_unique(row):
    """Count the tokens that appear in exactly one of the two questions.

    Parameters
    ----------
    row : object with attribute access (e.g. a DataFrame row)
        Must expose ``question1`` and ``question2``. If it also exposes
        ``process1`` / ``process2`` token lists, those are reused instead of
        running ``process`` again; they are expected to have been produced by
        the same ``process`` function.

    Returns
    -------
    int
        Size of the symmetric difference between the two token sets.
    """
    tokens1 = set(_question_tokens(row, 'process1', 'question1'))
    tokens2 = set(_question_tokens(row, 'process2', 'question2'))

    # Symmetric difference == (tokens1 - tokens2) plus (tokens2 - tokens1).
    return len(tokens1 ^ tokens2)

In [20]:
# Assign with ['unique']: attribute-style assignment (`train.unique = ...`)
# only updates a column that already exists, which is why a placeholder
# column had to be created first. Bracket assignment needs no placeholder.
train['unique'] = train.progress_apply(create_unique, axis=1)
train.head()

progress-bar: 100%|███████████████████████████████████████████| 404288/404288 [07:13<00:00, 931.85it/s]


Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,process1,process2,unique
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[share, invest, step, What, market, india, guide]","[share, invest, step, market, What, guide]",1
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[Diamond, Koh-i-Noor, story, Kohinoor, What]","[would, government, Koh-i-Noor, diamond, India...",1
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[using, internet, VPN, increase, speed, connec...","[hacking, Internet, speed, increased, DNS, How]",1
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[Why, mentally, lonely, solve, I, How]","[divided, 23^, Find, 24,23, math, remainder, /...",1
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[one, quikly, di, oxide, sugar, water, methane...","[fish, would, survive, water, Which, salt]",1


In [21]:
# Assign with ['unique']: attribute-style assignment (`test.unique = ...`)
# only updates a column that already exists, which is why a placeholder
# column had to be created first. Bracket assignment needs no placeholder.
test['unique'] = test.progress_apply(create_unique, axis=1)
test.head()

progress-bar:  20%|████████▍                                 | 470013/2345796 [08:31<34:35, 903.59it/s]

KeyboardInterrupt: 

progress-bar:  20%|████████▍                                 | 470098/2345796 [08:50<34:35, 903.59it/s]

In [None]:
def create_synonym(row):
    """Score how many word pairs across the two questions are WordNet synonyms.

    Both questions are tokenized with ``process``. For every word in the
    longer token list, each word of the shorter list is counted once if it
    appears among the lemma names of any WordNet synset of the longer-list
    word. The count is divided by the length of the longer list.

    Parameters
    ----------
    row : object with attribute access (e.g. a DataFrame row)
        Must expose ``question1`` and ``question2``.

    Returns
    -------
    float
        ``matches / len(longer list)``, or ``0.0`` when either token list is
        empty.

    NOTE(review): one word of the longer list can match several words of the
    shorter list, so the score is not bounded by 1. Confirm whether a
    fraction in [0, 1] was intended.
    """
    proc1 = process(row.question1)
    proc2 = process(row.question2)

    # On equal lengths question2's tokens are treated as the longer list.
    if len(proc1) > len(proc2):
        larger, smaller = proc1, proc2
    else:
        larger, smaller = proc2, proc1

    # No possible matches, and avoids dividing by zero.
    if not larger or not smaller:
        return 0.0

    num_syns = 0
    for word1 in larger:
        # Query WordNet once per word rather than once per (word1, word2) pair.
        lemma_names = set()
        for synset in wordnet.synsets(word1):
            lemma_names.update(synset.lemma_names())

        num_syns += sum(1 for word2 in smaller if word2 in lemma_names)

    return num_syns / len(larger)

In [None]:
# Scratch cell: tokenize the first training question, then display the WordNet
# synsets returned for a sample word ('fan').
dum = train.iloc[0].question1
dum = process(dum)
wordnet.synsets('fan')


In [None]:
# Row-wise synonym score for every test pair (see create_synonym).
# NOTE(review): no matching 'synonyms' column is computed for `train` in the
# visible part of the notebook - confirm before using this as a model feature.
test['synonyms'] = test.progress_apply(create_synonym, axis=1)

# Scraping

In [4]:
from bs4 import BeautifulSoup
import requests


In [12]:
def scrape_for_synonyms(word_of_interest):
    """Fetch the synonym list for a word from its thesaurus.com page.

    Parameters
    ----------
    word_of_interest : str
        Word (or phrase) to look up; it is URL-quoted before being appended
        to the browse URL.

    Returns
    -------
    list of str
        Text of every ``<li>`` inside the first element with class
        ``relevancy-list``; ``[]`` if the page has no such element.

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request fails or times out.

    NOTE(review): the output recorded in the notebook shows entries such as
    'crazystar', which suggests ``.text`` also picks up a nested "star" label.
    Verify against the page markup before using the results.
    """
    from urllib.parse import quote

    from bs4 import BeautifulSoup
    import requests

    url = 'http://www.thesaurus.com/browse/' + quote(word_of_interest)
    # A timeout keeps a stalled connection from hanging the notebook.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.content, 'html.parser')

    relevant = soup.find(class_='relevancy-list')
    if relevant is None:
        # The page has no synonym list, e.g. for an unknown word.
        return []

    synonyms = [item.text for item in relevant.find_all('li')]
    return synonyms

SyntaxError: invalid syntax (<ipython-input-12-f730824a4d51>, line 1)

['crazystar',
 'lunaticstar',
 'cuckoostar',
 'maniacstar',
 'nutsstar',
 'sickstar',
 'psychostar',
 'crackersstar',
 'bonkersstar',
 'brainsickstar',
 'daftstar',
 'dementedstar',
 'derangedstar',
 'distraughtstar',
 'disturbedstar',
 'dottystar',
 'kookystar',
 'locostar',
 'madstar',
 'maniacalstar',
 'mentally illstar',
 'moonstruckstar',
 'touchedstar',
 'unbalancedstar',
 'unhingedstar',
 'unsoundstar',
 'wackystar',
 'crazy as a loonstar',
 'mad as a hatterstar',
 'not all therestar',
 'nutty as a fruitcakestar',
 'out to lunchstar',
 'sick in the headstar',
 'stark raving madstar',
 "off one's rockerstar",
 "out of one's mindstar"]