# Kaggle Quora Challenge

This is a project by Seth Rabin and Mukul Ram.

The aim is to determine whether two questions on Quora possess similar intent.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
data_dir = '../../data/'

In [3]:
train = pd.read_csv(data_dir + 'train.csv', index_col='id')
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [99]:
test = pd.read_csv(data_dir + 'test.csv', index_col='test_id')
test.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,question1,question2
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,What but is the best way to send money from Ch...,What you send money to China?
3,Which food not emulsifiers?,What foods fibre?
4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [5]:
train.is_duplicate.value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [6]:
train.iloc[0].question1

'What is the step by step guide to invest in share market in india?'

In [7]:
train.iloc[0].question2

'What is the step by step guide to invest in share market?'

In [8]:
train.iloc[0].is_duplicate

0

In [9]:
train.iloc[1].question1

'What is the story of Kohinoor (Koh-i-Noor) Diamond?'

In [10]:
train.iloc[1].question2

'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?'

In [11]:
train.iloc[1].is_duplicate

0

## Exploratory Data Analysis

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404290 non-null int64
qid2            404290 non-null int64
question1       404290 non-null object
question2       404288 non-null object
is_duplicate    404290 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [13]:
train = train.dropna()
train.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404288 entries, 0 to 404289
Data columns (total 5 columns):
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [100]:
test = test.fillna('', axis=1)
test.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2345796 entries, 0 to 2345795
Data columns (total 2 columns):
question1    2345796 non-null object
question2    2345796 non-null object
dtypes: object(2)
memory usage: 53.7+ MB


## Feature Engineering

The features I plan to engineer are - 

In [15]:
import nltk
from tqdm import tqdm
import string
from nltk.corpus import stopwords

tqdm.pandas(desc='progress-bar')

In [16]:
dummy1 = train.iloc[0].question1
dummy2 = train.iloc[0].question2

In [17]:
def process(x):
    import nltk
    import string
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer
    
    if x:
        additional_words = 'like '.split()
        stop_words = set(stopwords.words('english') + list(string.punctuation) + additional_words)
    
        tokens = word_tokenize(x.lower())
        updated = list(set(tokens) - stop_words)

        lemma = WordNetLemmatizer()
        lemmatized = [lemma.lemmatize(word) for word in updated]

        return lemmatized
    
    else:
        return []

In [18]:
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [19]:
train['process1'] = train.question1.progress_apply(process)
train['process2'] = train.question2.progress_apply(process)

progress-bar: 100%|██████████| 404288/404288 [02:16<00:00, 2961.14it/s]
progress-bar: 100%|██████████| 404288/404288 [02:13<00:00, 3039.28it/s]


In [20]:
# test['process1'] = test.question1.progress_apply(process)
# test['process2'] = test.question2.progress_apply(process)

In [21]:
def create_unique(row):
    proc1 = row.process1
    proc2 = row.process2
    
    return len(list(set(proc1) - set(proc2)) + list(set(proc2) - set(proc1)))

In [22]:
train['unique'] = 0

train.unique = train.progress_apply(create_unique, axis=1)
train.head()

progress-bar: 100%|██████████| 404288/404288 [00:13<00:00, 29719.84it/s]


Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,process1,process2,unique
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[guide, india, invest, market, share, step]","[guide, invest, market, share, step]",1
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[story, kohinoor, koh-i-noor, diamond]","[would, government, back, diamond, koh-i-noor,...",7
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[using, speed, internet, increase, vpn, connec...","[speed, internet, dns, hacking, increased]",7
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[lonely, mentally, solve]","[/math, 24, find, 23^, remainder, 24,23, divid...",11
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[methane, carbon, salt, oxide, water, one, sug...","[salt, fish, would, survive, water]",11


In [23]:
# test['unique'] = 0

# test.unique = test.progress_apply(create_unique, axis=1)
# test.head()

In [26]:
def create_synonym(row):
    from nltk.corpus import wordnet
    
    proc1 = row.process1
    proc2 = row.process2
    
    if len(proc1) > len(proc2):
        larger =  proc1 
        smaller = proc2
    else:
        larger = proc2
        smaller = proc1
        
    num_syns = 0
    
    for word1 in larger:
        for word2 in smaller:
            for i,j in enumerate(wordnet.synsets(word1)):
                if word2 in j.lemma_names():
                    num_syns += 1
                    break
                    
    if len(larger):
        return num_syns / len(larger)
    else:
        return 0

In [27]:
train['syn_rat'] = train.progress_apply(create_synonym, axis=1)


progress-bar:   0%|          | 0/404288 [00:00<?, ?it/s][A
progress-bar:   0%|          | 1/404288 [00:00<19:36:26,  5.73it/s][A
progress-bar:   0%|          | 149/404288 [00:00<13:44:34,  8.17it/s][A
progress-bar:   0%|          | 292/404288 [00:00<9:38:25, 11.64it/s] [A
progress-bar:   0%|          | 442/404288 [00:00<6:46:05, 16.57it/s][A
progress-bar:   0%|          | 610/404288 [00:00<4:45:21, 23.58it/s][A
progress-bar:   0%|          | 763/404288 [00:00<3:20:59, 33.46it/s][A
progress-bar:   0%|          | 892/404288 [00:00<2:22:13, 47.27it/s][A
progress-bar:   0%|          | 1050/404288 [00:00<1:40:47, 66.68it/s][A
progress-bar:   0%|          | 1188/404288 [00:00<1:12:01, 93.27it/s][A
progress-bar:   0%|          | 1325/404288 [00:01<52:13, 128.60it/s] [A
progress-bar:   0%|          | 1459/404288 [00:01<38:02, 176.45it/s][A
progress-bar:   0%|          | 1605/404288 [00:01<28:01, 239.52it/s][A
progress-bar:   0%|          | 1740/404288 [00:01<21:08, 317.41it/s][A

progress-bar:   4%|▍         | 16043/404288 [00:11<04:34, 1413.18it/s][A
progress-bar:   4%|▍         | 16185/404288 [00:11<04:45, 1360.25it/s][A
progress-bar:   4%|▍         | 16322/404288 [00:12<04:56, 1309.26it/s][A
progress-bar:   4%|▍         | 16480/404288 [00:12<04:41, 1378.01it/s][A
progress-bar:   4%|▍         | 16620/404288 [00:12<04:44, 1363.49it/s][A
progress-bar:   4%|▍         | 16762/404288 [00:12<04:41, 1374.59it/s][A
progress-bar:   4%|▍         | 16901/404288 [00:12<04:49, 1337.67it/s][A
progress-bar:   4%|▍         | 17036/404288 [00:12<04:59, 1294.04it/s][A
progress-bar:   4%|▍         | 17185/404288 [00:12<04:48, 1341.71it/s][A
progress-bar:   4%|▍         | 17342/404288 [00:12<04:37, 1396.84it/s][A
progress-bar:   4%|▍         | 17484/404288 [00:12<04:39, 1386.05it/s][A
progress-bar:   4%|▍         | 17637/404288 [00:12<04:31, 1425.93it/s][A
progress-bar:   4%|▍         | 17797/404288 [00:13<04:22, 1472.34it/s][A
progress-bar:   4%|▍         | 17946/4

progress-bar:   8%|▊         | 32032/404288 [00:23<05:13, 1187.20it/s][A
progress-bar:   8%|▊         | 32160/404288 [00:23<05:06, 1212.25it/s][A
progress-bar:   8%|▊         | 32308/404288 [00:23<04:50, 1280.08it/s][A
progress-bar:   8%|▊         | 32440/404288 [00:23<04:48, 1290.76it/s][A
progress-bar:   8%|▊         | 32581/404288 [00:23<04:41, 1321.36it/s][A
progress-bar:   8%|▊         | 32723/404288 [00:23<04:35, 1347.89it/s][A
progress-bar:   8%|▊         | 32870/404288 [00:24<04:29, 1380.64it/s][A
progress-bar:   8%|▊         | 33025/404288 [00:24<04:20, 1425.51it/s][A
progress-bar:   8%|▊         | 33174/404288 [00:24<04:17, 1443.81it/s][A
progress-bar:   8%|▊         | 33320/404288 [00:24<04:19, 1431.21it/s][A
progress-bar:   8%|▊         | 33471/404288 [00:24<04:15, 1453.38it/s][A
progress-bar:   8%|▊         | 33617/404288 [00:24<04:26, 1393.05it/s][A
progress-bar:   8%|▊         | 33758/404288 [00:24<04:42, 1313.49it/s][A
progress-bar:   8%|▊         | 33908/4

progress-bar:  12%|█▏        | 47746/404288 [00:34<04:20, 1370.90it/s][A
progress-bar:  12%|█▏        | 47888/404288 [00:35<04:17, 1384.15it/s][A
progress-bar:  12%|█▏        | 48027/404288 [00:35<04:50, 1227.34it/s][A
progress-bar:  12%|█▏        | 48154/404288 [00:35<04:57, 1195.24it/s][A
progress-bar:  12%|█▏        | 48285/404288 [00:35<04:50, 1227.18it/s][A
progress-bar:  12%|█▏        | 48428/404288 [00:35<04:38, 1279.42it/s][A
progress-bar:  12%|█▏        | 48566/404288 [00:35<04:32, 1306.13it/s][A
progress-bar:  12%|█▏        | 48699/404288 [00:35<04:34, 1294.68it/s][A
progress-bar:  12%|█▏        | 48858/404288 [00:35<04:19, 1370.65it/s][A
progress-bar:  12%|█▏        | 48998/404288 [00:35<04:18, 1376.85it/s][A
progress-bar:  12%|█▏        | 49138/404288 [00:35<04:22, 1352.64it/s][A
progress-bar:  12%|█▏        | 49278/404288 [00:36<04:19, 1365.77it/s][A
progress-bar:  12%|█▏        | 49416/404288 [00:36<04:24, 1340.50it/s][A
progress-bar:  12%|█▏        | 49551/4

progress-bar:  16%|█▌        | 63890/404288 [00:46<05:21, 1059.81it/s][A
progress-bar:  16%|█▌        | 64011/404288 [00:46<05:10, 1097.37it/s][A
progress-bar:  16%|█▌        | 64153/404288 [00:46<04:49, 1174.72it/s][A
progress-bar:  16%|█▌        | 64289/404288 [00:46<04:37, 1224.72it/s][A
progress-bar:  16%|█▌        | 64417/404288 [00:46<04:35, 1234.69it/s][A
progress-bar:  16%|█▌        | 64545/404288 [00:46<04:32, 1245.87it/s][A
progress-bar:  16%|█▌        | 64696/404288 [00:47<04:18, 1313.32it/s][A
progress-bar:  16%|█▌        | 64830/404288 [00:47<04:27, 1271.30it/s][A
progress-bar:  16%|█▌        | 64961/404288 [00:47<04:30, 1252.17it/s][A
progress-bar:  16%|█▌        | 65090/404288 [00:47<04:30, 1253.33it/s][A
progress-bar:  16%|█▌        | 65218/404288 [00:47<04:30, 1254.76it/s][A
progress-bar:  16%|█▌        | 65345/404288 [00:47<04:29, 1258.82it/s][A
progress-bar:  16%|█▌        | 65482/404288 [00:47<04:22, 1288.70it/s][A
progress-bar:  16%|█▌        | 65636/4

progress-bar:  20%|█▉        | 79548/404288 [00:57<03:51, 1403.99it/s][A
progress-bar:  20%|█▉        | 79690/404288 [00:58<03:56, 1374.16it/s][A
progress-bar:  20%|█▉        | 79829/404288 [00:58<04:00, 1349.35it/s][A
progress-bar:  20%|█▉        | 79986/404288 [00:58<03:50, 1404.03it/s][A
progress-bar:  20%|█▉        | 80135/404288 [00:58<03:46, 1428.06it/s][A
progress-bar:  20%|█▉        | 80286/404288 [00:58<03:43, 1450.46it/s][A
progress-bar:  20%|█▉        | 80432/404288 [00:58<03:46, 1430.74it/s][A
progress-bar:  20%|█▉        | 80579/404288 [00:58<03:44, 1441.86it/s][A
progress-bar:  20%|█▉        | 80724/404288 [00:58<03:50, 1401.73it/s][A
progress-bar:  20%|██        | 80865/404288 [00:58<03:57, 1359.19it/s][A
progress-bar:  20%|██        | 81026/404288 [00:58<03:46, 1424.09it/s][A
progress-bar:  20%|██        | 81191/404288 [00:59<03:37, 1482.93it/s][A
progress-bar:  20%|██        | 81341/404288 [00:59<03:42, 1453.64it/s][A
progress-bar:  20%|██        | 81488/4

progress-bar:  24%|██▎       | 95720/404288 [01:09<03:32, 1452.07it/s][A
progress-bar:  24%|██▎       | 95866/404288 [01:09<03:37, 1420.35it/s][A
progress-bar:  24%|██▎       | 96009/404288 [01:09<03:36, 1421.29it/s][A
progress-bar:  24%|██▍       | 96179/404288 [01:09<03:26, 1491.31it/s][A
progress-bar:  24%|██▍       | 96330/404288 [01:10<03:32, 1447.79it/s][A
progress-bar:  24%|██▍       | 96476/404288 [01:10<03:35, 1426.90it/s][A
progress-bar:  24%|██▍       | 96620/404288 [01:10<03:44, 1372.31it/s][A
progress-bar:  24%|██▍       | 96762/404288 [01:10<03:42, 1385.03it/s][A
progress-bar:  24%|██▍       | 96911/404288 [01:10<03:37, 1413.50it/s][A
progress-bar:  24%|██▍       | 97066/404288 [01:10<03:31, 1451.56it/s][A
progress-bar:  24%|██▍       | 97216/404288 [01:10<03:29, 1464.20it/s][A
progress-bar:  24%|██▍       | 97386/404288 [01:10<03:21, 1522.31it/s][A
progress-bar:  24%|██▍       | 97546/404288 [01:10<03:18, 1543.48it/s][A
progress-bar:  24%|██▍       | 97710/4

progress-bar:  28%|██▊       | 112187/404288 [01:21<03:19, 1465.83it/s][A
progress-bar:  28%|██▊       | 112335/404288 [01:21<03:18, 1469.64it/s][A
progress-bar:  28%|██▊       | 112486/404288 [01:21<03:17, 1479.41it/s][A
progress-bar:  28%|██▊       | 112640/404288 [01:21<03:14, 1496.84it/s][A
progress-bar:  28%|██▊       | 112790/404288 [01:21<03:18, 1466.24it/s][A
progress-bar:  28%|██▊       | 112937/404288 [01:21<03:28, 1399.17it/s][A
progress-bar:  28%|██▊       | 113078/404288 [01:21<03:30, 1384.80it/s][A
progress-bar:  28%|██▊       | 113218/404288 [01:21<03:31, 1378.87it/s][A
progress-bar:  28%|██▊       | 113358/404288 [01:21<03:30, 1384.67it/s][A
progress-bar:  28%|██▊       | 113501/404288 [01:21<03:28, 1394.38it/s][A
progress-bar:  28%|██▊       | 113644/404288 [01:22<03:27, 1403.57it/s][A
progress-bar:  28%|██▊       | 113790/404288 [01:22<03:25, 1416.86it/s][A
progress-bar:  28%|██▊       | 113937/404288 [01:22<03:22, 1432.28it/s][A
progress-bar:  28%|██▊   

progress-bar:  32%|███▏      | 128063/404288 [01:32<03:20, 1375.06it/s][A
progress-bar:  32%|███▏      | 128202/404288 [01:32<03:30, 1313.64it/s][A
progress-bar:  32%|███▏      | 128344/404288 [01:32<03:25, 1343.51it/s][A
progress-bar:  32%|███▏      | 128495/404288 [01:32<03:18, 1387.71it/s][A
progress-bar:  32%|███▏      | 128635/404288 [01:32<03:33, 1290.75it/s][A
progress-bar:  32%|███▏      | 128767/404288 [01:32<03:32, 1294.84it/s][A
progress-bar:  32%|███▏      | 128898/404288 [01:33<03:43, 1234.00it/s][A
progress-bar:  32%|███▏      | 129024/404288 [01:33<03:52, 1184.80it/s][A
progress-bar:  32%|███▏      | 129170/404288 [01:33<03:39, 1253.75it/s][A
progress-bar:  32%|███▏      | 129324/404288 [01:33<03:27, 1327.75it/s][A
progress-bar:  32%|███▏      | 129472/404288 [01:33<03:22, 1355.26it/s][A
progress-bar:  32%|███▏      | 129610/404288 [01:33<03:44, 1225.31it/s][A
progress-bar:  32%|███▏      | 129743/404288 [01:33<03:39, 1251.02it/s][A
progress-bar:  32%|███▏  

progress-bar:  36%|███▌      | 143585/404288 [01:43<03:05, 1407.85it/s][A
progress-bar:  36%|███▌      | 143738/404288 [01:44<03:01, 1436.38it/s][A
progress-bar:  36%|███▌      | 143883/404288 [01:44<03:11, 1356.84it/s][A
progress-bar:  36%|███▌      | 144021/404288 [01:44<03:12, 1351.08it/s][A
progress-bar:  36%|███▌      | 144158/404288 [01:44<03:12, 1353.95it/s][A
progress-bar:  36%|███▌      | 144295/404288 [01:44<03:14, 1335.50it/s][A
progress-bar:  36%|███▌      | 144430/404288 [01:44<03:19, 1301.46it/s][A
progress-bar:  36%|███▌      | 144563/404288 [01:44<03:18, 1306.78it/s][A
progress-bar:  36%|███▌      | 144695/404288 [01:44<03:19, 1301.83it/s][A
progress-bar:  36%|███▌      | 144856/404288 [01:44<03:07, 1381.04it/s][A
progress-bar:  36%|███▌      | 145012/404288 [01:45<03:01, 1428.65it/s][A
progress-bar:  36%|███▌      | 145157/404288 [01:45<03:00, 1434.17it/s][A
progress-bar:  36%|███▌      | 145311/404288 [01:45<02:57, 1462.81it/s][A
progress-bar:  36%|███▌  

progress-bar:  39%|███▉      | 159346/404288 [01:55<02:59, 1361.59it/s][A
progress-bar:  39%|███▉      | 159483/404288 [01:55<03:01, 1347.16it/s][A
progress-bar:  39%|███▉      | 159623/404288 [01:55<03:00, 1354.98it/s][A
progress-bar:  40%|███▉      | 159759/404288 [01:55<03:01, 1350.02it/s][A
progress-bar:  40%|███▉      | 159901/404288 [01:55<02:58, 1368.80it/s][A
progress-bar:  40%|███▉      | 160040/404288 [01:55<02:58, 1367.32it/s][A
progress-bar:  40%|███▉      | 160205/404288 [01:56<02:49, 1439.51it/s][A
progress-bar:  40%|███▉      | 160351/404288 [01:56<02:51, 1425.90it/s][A
progress-bar:  40%|███▉      | 160498/404288 [01:56<02:49, 1435.26it/s][A
progress-bar:  40%|███▉      | 160643/404288 [01:56<02:54, 1398.95it/s][A
progress-bar:  40%|███▉      | 160784/404288 [01:56<03:01, 1343.72it/s][A
progress-bar:  40%|███▉      | 160920/404288 [01:56<03:01, 1339.24it/s][A
progress-bar:  40%|███▉      | 161084/404288 [01:56<02:51, 1416.82it/s][A
progress-bar:  40%|███▉  

progress-bar:  43%|████▎     | 175315/404288 [02:06<02:37, 1458.30it/s][A
progress-bar:  43%|████▎     | 175462/404288 [02:07<02:44, 1393.52it/s][A
progress-bar:  43%|████▎     | 175609/404288 [02:07<02:41, 1415.57it/s][A
progress-bar:  43%|████▎     | 175753/404288 [02:07<02:40, 1421.67it/s][A
progress-bar:  44%|████▎     | 175896/404288 [02:07<02:42, 1408.10it/s][A
progress-bar:  44%|████▎     | 176039/404288 [02:07<02:41, 1411.53it/s][A
progress-bar:  44%|████▎     | 176181/404288 [02:07<03:03, 1244.97it/s][A
progress-bar:  44%|████▎     | 176310/404288 [02:07<03:41, 1027.03it/s][A
progress-bar:  44%|████▎     | 176458/404288 [02:07<03:21, 1129.92it/s][A
progress-bar:  44%|████▎     | 176599/404288 [02:07<03:09, 1199.19it/s][A
progress-bar:  44%|████▎     | 176751/404288 [02:08<02:57, 1279.76it/s][A
progress-bar:  44%|████▍     | 176910/404288 [02:08<02:47, 1356.85it/s][A
progress-bar:  44%|████▍     | 177052/404288 [02:08<02:47, 1359.44it/s][A
progress-bar:  44%|████▍ 

progress-bar:  47%|████▋     | 191637/404288 [02:18<02:26, 1446.95it/s][A
progress-bar:  47%|████▋     | 191783/404288 [02:18<02:28, 1432.04it/s][A
progress-bar:  47%|████▋     | 191930/404288 [02:18<02:27, 1442.57it/s][A
progress-bar:  48%|████▊     | 192081/404288 [02:18<02:25, 1461.99it/s][A
progress-bar:  48%|████▊     | 192237/404288 [02:18<02:22, 1489.30it/s][A
progress-bar:  48%|████▊     | 192387/404288 [02:18<02:22, 1484.11it/s][A
progress-bar:  48%|████▊     | 192538/404288 [02:18<02:22, 1488.48it/s][A
progress-bar:  48%|████▊     | 192688/404288 [02:19<02:26, 1440.92it/s][A
progress-bar:  48%|████▊     | 192835/404288 [02:19<02:25, 1449.22it/s][A
progress-bar:  48%|████▊     | 192994/404288 [02:19<02:21, 1488.34it/s][A
progress-bar:  48%|████▊     | 193150/404288 [02:19<02:19, 1508.60it/s][A
progress-bar:  48%|████▊     | 193308/404288 [02:19<02:18, 1528.70it/s][A
progress-bar:  48%|████▊     | 193462/404288 [02:19<02:19, 1510.71it/s][A
progress-bar:  48%|████▊ 

progress-bar:  52%|█████▏    | 208326/404288 [02:29<02:08, 1521.11it/s][A
progress-bar:  52%|█████▏    | 208479/404288 [02:29<02:13, 1462.34it/s][A
progress-bar:  52%|█████▏    | 208627/404288 [02:29<02:14, 1451.61it/s][A
progress-bar:  52%|█████▏    | 208773/404288 [02:29<02:19, 1398.23it/s][A
progress-bar:  52%|█████▏    | 208929/404288 [02:30<02:15, 1442.22it/s][A
progress-bar:  52%|█████▏    | 209090/404288 [02:30<02:11, 1487.38it/s][A
progress-bar:  52%|█████▏    | 209246/404288 [02:30<02:09, 1506.70it/s][A
progress-bar:  52%|█████▏    | 209398/404288 [02:30<02:12, 1471.41it/s][A
progress-bar:  52%|█████▏    | 209546/404288 [02:30<02:17, 1416.81it/s][A
progress-bar:  52%|█████▏    | 209700/404288 [02:30<02:14, 1448.59it/s][A
progress-bar:  52%|█████▏    | 209846/404288 [02:30<02:16, 1425.54it/s][A
progress-bar:  52%|█████▏    | 209990/404288 [02:30<02:18, 1404.32it/s][A
progress-bar:  52%|█████▏    | 210154/404288 [02:30<02:13, 1459.05it/s][A
progress-bar:  52%|█████▏

progress-bar:  56%|█████▌    | 224861/404288 [02:41<02:05, 1433.34it/s][A
progress-bar:  56%|█████▌    | 225006/404288 [02:41<02:07, 1401.67it/s][A
progress-bar:  56%|█████▌    | 225174/404288 [02:41<02:01, 1474.26it/s][A
progress-bar:  56%|█████▌    | 225325/404288 [02:41<02:00, 1483.17it/s][A
progress-bar:  56%|█████▌    | 225475/404288 [02:41<02:04, 1441.63it/s][A
progress-bar:  56%|█████▌    | 225621/404288 [02:41<02:08, 1392.44it/s][A
progress-bar:  56%|█████▌    | 225774/404288 [02:41<02:04, 1430.45it/s][A
progress-bar:  56%|█████▌    | 225919/404288 [02:41<02:11, 1353.43it/s][A
progress-bar:  56%|█████▌    | 226056/404288 [02:41<02:17, 1295.10it/s][A
progress-bar:  56%|█████▌    | 226235/404288 [02:41<02:06, 1409.11it/s][A
progress-bar:  56%|█████▌    | 226381/404288 [02:42<02:05, 1417.56it/s][A
progress-bar:  56%|█████▌    | 226534/404288 [02:42<02:02, 1445.58it/s][A
progress-bar:  56%|█████▌    | 226681/404288 [02:42<02:04, 1431.07it/s][A
progress-bar:  56%|█████▌

progress-bar:  60%|█████▉    | 241449/404288 [02:52<01:48, 1507.34it/s][A
progress-bar:  60%|█████▉    | 241606/404288 [02:52<01:46, 1524.29it/s][A
progress-bar:  60%|█████▉    | 241759/404288 [02:52<01:53, 1426.69it/s][A
progress-bar:  60%|█████▉    | 241904/404288 [02:52<02:02, 1327.49it/s][A
progress-bar:  60%|█████▉    | 242040/404288 [02:52<02:02, 1321.18it/s][A
progress-bar:  60%|█████▉    | 242182/404288 [02:52<02:00, 1346.32it/s][A
progress-bar:  60%|█████▉    | 242318/404288 [02:53<02:05, 1295.64it/s][A
progress-bar:  60%|█████▉    | 242470/404288 [02:53<01:59, 1355.47it/s][A
progress-bar:  60%|██████    | 242627/404288 [02:53<01:54, 1413.28it/s][A
progress-bar:  60%|██████    | 242771/404288 [02:53<01:55, 1402.29it/s][A
progress-bar:  60%|██████    | 242913/404288 [02:53<01:56, 1389.04it/s][A
progress-bar:  60%|██████    | 243053/404288 [02:53<01:56, 1387.67it/s][A
progress-bar:  60%|██████    | 243193/404288 [02:53<01:56, 1383.53it/s][A
progress-bar:  60%|██████

progress-bar:  64%|██████▍   | 257785/404288 [03:03<01:44, 1398.25it/s][A
progress-bar:  64%|██████▍   | 257929/404288 [03:04<01:47, 1364.41it/s][A
progress-bar:  64%|██████▍   | 258083/404288 [03:04<01:43, 1410.47it/s][A
progress-bar:  64%|██████▍   | 258227/404288 [03:04<01:44, 1393.49it/s][A
progress-bar:  64%|██████▍   | 258369/404288 [03:04<01:47, 1356.39it/s][A
progress-bar:  64%|██████▍   | 258526/404288 [03:04<01:43, 1412.90it/s][A
progress-bar:  64%|██████▍   | 258669/404288 [03:04<01:43, 1407.20it/s][A
progress-bar:  64%|██████▍   | 258822/404288 [03:04<01:41, 1439.29it/s][A
progress-bar:  64%|██████▍   | 258967/404288 [03:04<01:45, 1381.12it/s][A
progress-bar:  64%|██████▍   | 259107/404288 [03:04<01:46, 1358.02it/s][A
progress-bar:  64%|██████▍   | 259247/404288 [03:05<01:45, 1368.80it/s][A
progress-bar:  64%|██████▍   | 259398/404288 [03:05<01:42, 1407.16it/s][A
progress-bar:  64%|██████▍   | 259540/404288 [03:05<01:47, 1352.09it/s][A
progress-bar:  64%|██████

progress-bar:  68%|██████▊   | 274089/404288 [03:15<01:31, 1424.56it/s][A
progress-bar:  68%|██████▊   | 274232/404288 [03:15<01:34, 1383.44it/s][A
progress-bar:  68%|██████▊   | 274381/404288 [03:15<01:32, 1410.08it/s][A
progress-bar:  68%|██████▊   | 274541/404288 [03:15<01:28, 1461.54it/s][A
progress-bar:  68%|██████▊   | 274688/404288 [03:15<01:30, 1430.44it/s][A
progress-bar:  68%|██████▊   | 274864/404288 [03:15<01:25, 1514.69it/s][A
progress-bar:  68%|██████▊   | 275018/404288 [03:15<01:25, 1506.05it/s][A
progress-bar:  68%|██████▊   | 275170/404288 [03:16<01:25, 1510.06it/s][A
progress-bar:  68%|██████▊   | 275322/404288 [03:16<01:25, 1507.76it/s][A
progress-bar:  68%|██████▊   | 275474/404288 [03:16<01:27, 1469.45it/s][A
progress-bar:  68%|██████▊   | 275622/404288 [03:16<01:31, 1410.45it/s][A
progress-bar:  68%|██████▊   | 275774/404288 [03:16<01:29, 1441.10it/s][A
progress-bar:  68%|██████▊   | 275944/404288 [03:16<01:25, 1509.60it/s][A
progress-bar:  68%|██████

progress-bar:  72%|███████▏  | 290682/404288 [03:26<01:18, 1453.21it/s][A
progress-bar:  72%|███████▏  | 290842/404288 [03:26<01:16, 1490.50it/s][A
progress-bar:  72%|███████▏  | 291001/404288 [03:26<01:14, 1518.30it/s][A
progress-bar:  72%|███████▏  | 291154/404288 [03:27<01:19, 1430.97it/s][A
progress-bar:  72%|███████▏  | 291299/404288 [03:27<01:20, 1399.26it/s][A
progress-bar:  72%|███████▏  | 291456/404288 [03:27<01:18, 1446.04it/s][A
progress-bar:  72%|███████▏  | 291602/404288 [03:27<01:17, 1446.46it/s][A
progress-bar:  72%|███████▏  | 291755/404288 [03:27<01:16, 1470.27it/s][A
progress-bar:  72%|███████▏  | 291913/404288 [03:27<01:14, 1500.13it/s][A
progress-bar:  72%|███████▏  | 292067/404288 [03:27<01:14, 1511.68it/s][A
progress-bar:  72%|███████▏  | 292239/404288 [03:27<01:11, 1566.98it/s][A
progress-bar:  72%|███████▏  | 292397/404288 [03:27<01:13, 1528.72it/s][A
progress-bar:  72%|███████▏  | 292551/404288 [03:27<01:13, 1519.21it/s][A
progress-bar:  72%|██████

progress-bar:  76%|███████▌  | 307523/404288 [03:38<01:08, 1419.60it/s][A
progress-bar:  76%|███████▌  | 307681/404288 [03:38<01:05, 1464.10it/s][A
progress-bar:  76%|███████▌  | 307829/404288 [03:38<01:06, 1459.52it/s][A
progress-bar:  76%|███████▌  | 307987/404288 [03:38<01:04, 1491.45it/s][A
progress-bar:  76%|███████▌  | 308151/404288 [03:38<01:02, 1531.23it/s][A
progress-bar:  76%|███████▋  | 308305/404288 [03:38<01:04, 1492.97it/s][A
progress-bar:  76%|███████▋  | 308456/404288 [03:38<01:04, 1494.84it/s][A
progress-bar:  76%|███████▋  | 308606/404288 [03:38<01:04, 1476.87it/s][A
progress-bar:  76%|███████▋  | 308757/404288 [03:38<01:04, 1484.49it/s][A
progress-bar:  76%|███████▋  | 308906/404288 [03:38<01:05, 1446.86it/s][A
progress-bar:  76%|███████▋  | 309058/404288 [03:39<01:05, 1460.77it/s][A
progress-bar:  76%|███████▋  | 309205/404288 [03:39<01:07, 1408.70it/s][A
progress-bar:  77%|███████▋  | 309347/404288 [03:39<01:09, 1363.46it/s][A
progress-bar:  77%|██████

progress-bar:  80%|████████  | 323610/404288 [03:49<00:56, 1434.42it/s][A
progress-bar:  80%|████████  | 323760/404288 [03:49<00:55, 1449.96it/s][A
progress-bar:  80%|████████  | 323907/404288 [03:49<00:57, 1388.05it/s][A
progress-bar:  80%|████████  | 324048/404288 [03:49<00:58, 1364.31it/s][A
progress-bar:  80%|████████  | 324187/404288 [03:49<00:58, 1367.35it/s][A
progress-bar:  80%|████████  | 324334/404288 [03:50<00:57, 1396.57it/s][A
progress-bar:  80%|████████  | 324490/404288 [03:50<00:55, 1437.79it/s][A
progress-bar:  80%|████████  | 324640/404288 [03:50<00:54, 1454.95it/s][A
progress-bar:  80%|████████  | 324787/404288 [03:50<00:56, 1400.76it/s][A
progress-bar:  80%|████████  | 324934/404288 [03:50<00:55, 1417.42it/s][A
progress-bar:  80%|████████  | 325077/404288 [03:50<00:55, 1420.58it/s][A
progress-bar:  80%|████████  | 325220/404288 [03:50<00:56, 1406.61it/s][A
progress-bar:  80%|████████  | 325362/404288 [03:50<00:57, 1377.75it/s][A
progress-bar:  81%|██████

progress-bar:  84%|████████▍ | 339312/404288 [04:00<00:47, 1355.45it/s][A
progress-bar:  84%|████████▍ | 339449/404288 [04:00<00:48, 1341.83it/s][A
progress-bar:  84%|████████▍ | 339584/404288 [04:01<00:48, 1329.35it/s][A
progress-bar:  84%|████████▍ | 339718/404288 [04:01<00:48, 1322.82it/s][A
progress-bar:  84%|████████▍ | 339851/404288 [04:01<00:48, 1320.83it/s][A
progress-bar:  84%|████████▍ | 340001/404288 [04:01<00:47, 1367.74it/s][A
progress-bar:  84%|████████▍ | 340150/404288 [04:01<00:45, 1396.05it/s][A
progress-bar:  84%|████████▍ | 340307/404288 [04:01<00:44, 1443.68it/s][A
progress-bar:  84%|████████▍ | 340453/404288 [04:01<00:45, 1404.70it/s][A
progress-bar:  84%|████████▍ | 340595/404288 [04:01<00:46, 1360.72it/s][A
progress-bar:  84%|████████▍ | 340740/404288 [04:01<00:45, 1386.26it/s][A
progress-bar:  84%|████████▍ | 340880/404288 [04:02<00:45, 1386.69it/s][A
progress-bar:  84%|████████▍ | 341020/404288 [04:02<00:47, 1326.41it/s][A
progress-bar:  84%|██████

progress-bar:  88%|████████▊ | 355249/404288 [04:12<00:34, 1415.11it/s][A
progress-bar:  88%|████████▊ | 355392/404288 [04:12<00:34, 1414.17it/s][A
progress-bar:  88%|████████▊ | 355535/404288 [04:12<00:34, 1400.85it/s][A
progress-bar:  88%|████████▊ | 355676/404288 [04:12<00:35, 1378.76it/s][A
progress-bar:  88%|████████▊ | 355830/404288 [04:12<00:34, 1421.48it/s][A
progress-bar:  88%|████████▊ | 355973/404288 [04:12<00:35, 1361.70it/s][A
progress-bar:  88%|████████▊ | 356111/404288 [04:12<00:35, 1358.77it/s][A
progress-bar:  88%|████████▊ | 356258/404288 [04:12<00:34, 1389.58it/s][A
progress-bar:  88%|████████▊ | 356409/404288 [04:13<00:33, 1415.09it/s][A
progress-bar:  88%|████████▊ | 356576/404288 [04:13<00:32, 1482.51it/s][A
progress-bar:  88%|████████▊ | 356726/404288 [04:13<00:33, 1406.25it/s][A
progress-bar:  88%|████████▊ | 356869/404288 [04:13<00:33, 1397.74it/s][A
progress-bar:  88%|████████▊ | 357010/404288 [04:13<00:33, 1400.05it/s][A
progress-bar:  88%|██████

progress-bar:  92%|█████████▏| 370809/404288 [04:23<00:24, 1368.97it/s][A
progress-bar:  92%|█████████▏| 370948/404288 [04:23<00:24, 1372.88it/s][A
progress-bar:  92%|█████████▏| 371104/404288 [04:24<00:23, 1423.23it/s][A
progress-bar:  92%|█████████▏| 371270/404288 [04:24<00:22, 1484.13it/s][A
progress-bar:  92%|█████████▏| 371420/404288 [04:24<00:22, 1454.05it/s][A
progress-bar:  92%|█████████▏| 371567/404288 [04:24<00:22, 1455.05it/s][A
progress-bar:  92%|█████████▏| 371714/404288 [04:24<00:22, 1453.47it/s][A
progress-bar:  92%|█████████▏| 371865/404288 [04:24<00:22, 1466.79it/s][A
progress-bar:  92%|█████████▏| 372013/404288 [04:24<00:22, 1442.74it/s][A
progress-bar:  92%|█████████▏| 372158/404288 [04:24<00:22, 1439.24it/s][A
progress-bar:  92%|█████████▏| 372318/404288 [04:24<00:21, 1482.92it/s][A
progress-bar:  92%|█████████▏| 372467/404288 [04:24<00:21, 1470.62it/s][A
progress-bar:  92%|█████████▏| 372615/404288 [04:25<00:21, 1472.69it/s][A
progress-bar:  92%|██████

progress-bar:  96%|█████████▌| 386930/404288 [04:35<00:13, 1255.90it/s][A
progress-bar:  96%|█████████▌| 387073/404288 [04:35<00:13, 1301.86it/s][A
progress-bar:  96%|█████████▌| 387216/404288 [04:35<00:12, 1336.61it/s][A
progress-bar:  96%|█████████▌| 387351/404288 [04:35<00:13, 1287.45it/s][A
progress-bar:  96%|█████████▌| 387513/404288 [04:35<00:12, 1371.08it/s][A
progress-bar:  96%|█████████▌| 387683/404288 [04:35<00:11, 1451.70it/s][A
progress-bar:  96%|█████████▌| 387831/404288 [04:35<00:11, 1420.08it/s][A
progress-bar:  96%|█████████▌| 387983/404288 [04:35<00:11, 1446.15it/s][A
progress-bar:  96%|█████████▌| 388130/404288 [04:36<00:11, 1434.06it/s][A
progress-bar:  96%|█████████▌| 388275/404288 [04:36<00:11, 1417.18it/s][A
progress-bar:  96%|█████████▌| 388418/404288 [04:36<00:11, 1351.66it/s][A
progress-bar:  96%|█████████▌| 388555/404288 [04:36<00:11, 1338.81it/s][A
progress-bar:  96%|█████████▌| 388690/404288 [04:36<00:11, 1312.57it/s][A
progress-bar:  96%|██████

progress-bar: 100%|█████████▉| 402723/404288 [04:46<00:01, 1285.85it/s][A
progress-bar: 100%|█████████▉| 402868/404288 [04:46<00:01, 1330.88it/s][A
progress-bar: 100%|█████████▉| 403004/404288 [04:46<00:00, 1339.24it/s][A
progress-bar: 100%|█████████▉| 403162/404288 [04:46<00:00, 1403.11it/s][A
progress-bar: 100%|█████████▉| 403304/404288 [04:47<00:00, 1380.72it/s][A
progress-bar: 100%|█████████▉| 403444/404288 [04:47<00:00, 1376.67it/s][A
progress-bar: 100%|█████████▉| 403600/404288 [04:47<00:00, 1425.44it/s][A
progress-bar: 100%|█████████▉| 403744/404288 [04:47<00:00, 1394.23it/s][A
progress-bar: 100%|█████████▉| 403885/404288 [04:47<00:00, 1342.15it/s][A
progress-bar: 100%|█████████▉| 404021/404288 [04:47<00:00, 1314.92it/s][A
progress-bar: 100%|█████████▉| 404155/404288 [04:47<00:00, 1318.40it/s][A
progress-bar: 100%|██████████| 404288/404288 [04:47<00:00, 1404.18it/s][A

In [28]:
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,process1,process2,unique,syn_rat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[guide, india, invest, market, share, step]","[guide, invest, market, share, step]",1,0.833333
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[story, kohinoor, koh-i-noor, diamond]","[would, government, back, diamond, koh-i-noor,...",7,0.111111
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[using, speed, internet, increase, vpn, connec...","[speed, internet, dns, hacking, increased]",7,0.333333
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[lonely, mentally, solve]","[/math, 24, find, 23^, remainder, 24,23, divid...",11,0.0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[methane, carbon, salt, oxide, water, one, sug...","[salt, fish, would, survive, water]",11,0.2


In [101]:
train['text'] = train.question1 + ' ' + train.question2
test['text'] = test.question1 + ' ' + test.question2
train.text = train.text.apply(lambda x : list(set(x.split())))
test.text = test.text.apply(lambda x : list(set(x.split())))

In [102]:
train.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,process1,process2,unique,syn_rat,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[guide, india, invest, market, share, step]","[guide, invest, market, share, step]",1,0.833333,"[is, guide, invest, market, share, the, to, in..."
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[story, kohinoor, koh-i-noor, diamond]","[would, government, back, diamond, koh-i-noor,...",7,0.111111,"[story, (Koh-i-Noor), Indian, is, would, gover..."
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[using, speed, internet, increase, vpn, connec...","[speed, internet, dns, hacking, increased]",7,0.333333,"[a, How, Internet, can, while, using, of, VPN?..."
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[lonely, mentally, solve]","[/math, 24, find, 23^, remainder, 24,23, divid...",11,0.0,"[solve, I, Why, 24,23?, is, very, the, [math]2..."
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[methane, carbon, salt, oxide, water, one, sug...","[salt, fish, would, survive, water]",11,0.2,"[methane, carbon, salt, fish, would, salt,, su..."


In [103]:
text.head()

AttributeError: 'list' object has no attribute 'head'

In [48]:
from nltk.corpus import stopwords

additional_words = 'like '.split()
from nltk.stem.wordnet import WordNetLemmatizerstop_words = set(stopwords.words('english') + list(string.punctuation) + additional_words)

In [76]:
import string

In [95]:
from nltk.stem.wordnet import WordNetLemmatizer

lemma = WordNetLemmatizer()
text = list(set([item for sublist in train.text for item in sublist]) - stop_words)
text = list(set([lemma.lemmatize(item.lower()) for item in text]))
text = [item for item in text if item[0].islower()]

for punct in list(string.punctuation + string.digits):
    text = [item.replace(punct, '') for item in text]
    
text = list(set(text))

In [96]:
len(text)

95044

In [97]:
text

['conceiving',
 'delivery',
 'betnovate',
 'saha',
 'gapless',
 'prasad',
 'roughly',
 'texter',
 'expiraton',
 'rvalue',
 'rupture',
 'herself',
 'outgoingincoming',
 'toget',
 'lydia',
 'chchchcho',
 'centennial',
 'teased',
 'straina',
 'redeemgift',
 'zapata',
 'experiential',
 'cupel',
 'avocado',
 'partsinterior',
 'polished',
 'sdslabs',
 'solarplanetary',
 'comic',
 'polycount',
 'khazakstan',
 'cecp',
 'amaron',
 'otters',
 'adeptia',
 'langar',
 'highcrime',
 'sterotyphy',
 'testingwhich',
 'engel',
 'harish',
 'weyl',
 'sweatshop',
 'adapatations',
 'apparatus',
 'kasinova',
 'vani',
 'trearment',
 'polyhouse',
 'palermo',
 'liquidatederadicatedabolished',
 'consistent”',
 'buildingtower',
 'apartmentsplots',
 'casillas',
 'tyrus',
 'pomt',
 'overdrafted',
 'darker',
 'indivisible',
 'shining',
 'chalisa',
 'retake',
 'bodymind',
 'locales',
 'sedol',
 'resentment',
 'nation',
 'emailwhat',
 'lanterna',
 'tandel',
 'topsycom',
 'cynosure',
 'escobars',
 'cinco',
 'implicit',

In [29]:
X_train, X_test, y_train, y_test = train_test_split(train[['unique', 'syn_rat']], train.is_duplicate)

In [30]:
from xgboost import XGBClassifier

In [31]:
xgb = XGBClassifier()

In [32]:
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [33]:
preds = [item[1] for item in xgb.predict_proba(X_test)]
preds

[0.8253122,
 0.4660283,
 0.21073259,
 0.21615756,
 0.092065305,
 0.51299167,
 0.034543518,
 0.51299167,
 0.4660283,
 0.32989115,
 0.48104659,
 0.0084774401,
 0.45685795,
 0.23367345,
 0.11176248,
 0.36926135,
 0.46582162,
 0.32655433,
 0.5532006,
 0.52365363,
 0.0016057441,
 0.024454763,
 0.19960853,
 0.27941841,
 0.46476477,
 0.52365363,
 0.075379476,
 0.41359359,
 0.50027615,
 0.31297556,
 0.26801762,
 0.53880066,
 0.5387677,
 0.50838858,
 0.10357816,
 0.50027615,
 0.0031514738,
 0.48417506,
 0.19960853,
 0.50411808,
 0.43671221,
 0.50838858,
 0.34233785,
 0.53580081,
 0.51449907,
 0.51449907,
 0.51299167,
 0.48216143,
 0.28961265,
 0.53880066,
 0.60340798,
 0.45685795,
 0.36926135,
 0.80340326,
 0.36926135,
 0.5387677,
 0.40010589,
 0.48313409,
 0.34960264,
 0.33373389,
 0.5387677,
 0.43014708,
 0.48104659,
 0.47781014,
 0.42752001,
 0.0654881,
 0.46582162,
 0.41198227,
 0.50411808,
 0.26303002,
 0.50251234,
 0.44288805,
 0.53477174,
 0.039292075,
 0.31471345,
 0.52365363,
 0.486777

In [34]:
pred_act = xgb.predict(X_test)
pred_act

array([1, 0, 0, ..., 0, 0, 0])

In [35]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [36]:
roc_auc_score(y_test, preds)

0.75776557742108186

In [37]:
accuracy_score(y_test, pred_act)

0.68228589520341931

In [None]:
def write_csv(model_name, model):
    
    model_data = [item[1] for item in model.predict_proba(test)]
    
    pd.DataFrame(data=model_data).to_csv(model_name + '.csv')
    results = pd.read_csv(model_name + '.csv')
    results.columns = ['Id', 'WnvPresent']
    results.Id = results.Id.apply(lambda x : x + 1)
    results.to_csv(model_name + '.csv', index=False)