In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Importing the dataset

In [14]:
train_data = pd.read_csv('E:/downloads from chrome/quora dataset/train.csv')


In [15]:
train_data.iloc[5,3]

'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?'

In [16]:
train_data.iloc[5,4]

"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"

In [17]:
train_data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


### Data Preprocessing 

In [18]:
def preprocess_string(x):
    punctuations = '''!()-[]{};:\n'\t"\,<>./?@#+$%^&*_~'''
    no_punct = ""
    for char in str(x):
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct.lower()

In [19]:
train_data['question1']  = train_data.question1.apply(lambda x: preprocess_string(x))

### Checking the preprocessed data 

In [20]:
train_data['question1'].head(1)

0    what is the step by step guide to invest in sh...
Name: question1, dtype: object

In [21]:
train_data['question2']  = train_data.question1.apply(lambda x: preprocess_string(x))

### Dropping the columns not required for model fitting

In [22]:
train_data = train_data.drop(["id","qid1","qid2"],axis=1)

In [23]:
train_data

Unnamed: 0,question1,question2,is_duplicate
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,what is the story of kohinoor kohinoor diamond,what is the story of kohinoor kohinoor diamond,0
2,how can i increase the speed of my internet co...,how can i increase the speed of my internet co...,0
3,why am i mentally very lonely how can i solve it,why am i mentally very lonely how can i solve it,0
4,which one dissolve in water quikly sugar salt ...,which one dissolve in water quikly sugar salt ...,0
5,astrology i am a capricorn sun cap moon and ca...,astrology i am a capricorn sun cap moon and ca...,1
6,should i buy tiago,should i buy tiago,0
7,how can i be a good geologist,how can i be a good geologist,1
8,when do you use シ instead of し,when do you use シ instead of し,0
9,motorola company can i hack my charter motorol...,motorola company can i hack my charter motorol...,0


### Training and Test split 

In [27]:
X_train, X_val, y_train, y_val = train_test_split(train_data.drop("is_duplicate",axis=1), train_data["is_duplicate"], 
                                                  test_size = 0.2, random_state = 99)

In [28]:
X_train

Unnamed: 0,question1,question2
204875,who discovered the rhinovirus how dangerous is...,who discovered the rhinovirus how dangerous is...
152381,how the black money be recovered by simultaneo...,how the black money be recovered by simultaneo...
316816,how do i read someones whatsapp messages witho...,how do i read someones whatsapp messages witho...
123368,how will scraping of 500 and 1000 rupees notes...,how will scraping of 500 and 1000 rupees notes...
282130,what are some of the best spin off series in t...,what are some of the best spin off series in t...
235290,what proves that history repeats itself,what proves that history repeats itself
27335,how can i write a letter to cbse online,how can i write a letter to cbse online
92786,whats in it for julian assange,whats in it for julian assange
299188,are there security cameras in us hotel rooms why,are there security cameras in us hotel rooms why
232369,what does the ending of short film teaspoon mean,what does the ending of short film teaspoon mean


In [29]:
y_train

204875    0
152381    1
316816    1
123368    1
282130    0
235290    0
27335     0
92786     0
299188    0
232369    0
294995    0
403050    0
208392    0
88346     0
65074     1
362093    0
80565     0
55682     1
144446    0
366447    0
316146    0
205826    0
339462    0
48271     0
64604     1
250446    0
118996    0
45682     1
246797    0
43336     0
         ..
72420     0
326373    1
2810      0
49743     0
340775    0
144443    1
293369    0
29324     1
366784    1
270580    0
386411    0
75750     0
311355    1
336048    1
115137    1
278583    0
384803    1
64766     1
228247    0
281985    0
325556    1
285893    0
349252    1
319186    0
231144    0
239305    1
167080    0
177337    0
23587     0
160385    0
Name: is_duplicate, Length: 323432, dtype: int64

### Creating the vectorizer for converting the Input for Model Fitting 

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()

### Combining the question1 and question2 of each data point

In [35]:
# Based on train set, combine q1 + q2, and build a term-doc matrix
train_list1 = X_train['question1'].apply(lambda x: x).tolist()
train_list2 = X_train['question2'].apply(lambda x: x).tolist()
train_list = []
for i in range(len(train_list1)):
    train_list.append(train_list1[i] + " " + train_list2[i])

In [36]:
print(train_list[0])

who discovered the rhinovirus how dangerous is it to humans who discovered the rhinovirus how dangerous is it to humans


In [37]:
train_doc = vec.fit_transform(train_list)
train_doc.shape

(323432, 72661)

### Calculating the count vector for both columns question1 and question2 for training data 

In [38]:
train_doc1 = vec.transform(X_train['question1'].apply(lambda x: x).tolist())
train_doc2 = vec.transform(X_train['question2'].apply(lambda x: x).tolist())

In [39]:
print(train_doc2.shape)

(323432, 72661)


In [41]:
x = train_doc1 + train_doc2
y = y_train
x.shape

(323432, 72661)

### Calculating the count vector for both columns question1 and question2 for validation data  

In [42]:
val_doc1 = vec.transform(X_val['question1'].apply(lambda x: x).tolist())
val_doc2 = vec.transform(X_val['question2'].apply(lambda x: x).tolist())

In [54]:
from sklearn.model_selection import GridSearchCV
parameters = {'C':[0.1, 0.4]}
model = LogisticRegression()
clf = GridSearchCV(model, parameters)
clf.fit(x, y)
# model.fit(x, y)


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1, param_grid={'C': [0.1, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [55]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_C',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [57]:
clf.cv_results_



{'mean_fit_time': array([18.55922278, 47.85734296]),
 'mean_score_time': array([0.03537798, 0.03826229]),
 'mean_test_score': array([0.73885392, 0.73803458]),
 'mean_train_score': array([0.76847684, 0.79027895]),
 'param_C': masked_array(data=[0.1, 0.4],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1}, {'C': 0.4}],
 'rank_test_score': array([1, 2]),
 'split0_test_score': array([0.73998943, 0.74028624]),
 'split0_train_score': array([0.76772207, 0.78935725]),
 'split1_test_score': array([0.73634416, 0.73427572]),
 'split1_train_score': array([0.77012443, 0.79239963]),
 'split2_test_score': array([0.74022818, 0.73954179]),
 'split2_train_score': array([0.76758401, 0.78907996]),
 'std_fit_time': array([0.5431189 , 4.35695746]),
 'std_score_time': array([0.00587553, 0.01041668]),
 'std_test_score': array([0.00177735, 0.00267524]),
 'std_train_score': array([0.00116639, 0.00150381])}

In [61]:
preds_train = clf.predict(train_doc1 + train_doc2)
preds_prob_train = clf.predict_proba(train_doc1 + train_doc2)
preds_val = clf.predict(val_doc1 + val_doc2)
preds_prob_val = clf.predict_proba(val_doc1 + val_doc2)
print("Accuracy of training : ", (preds_train == y_train).mean())
print("Accuracy of validation : ", (preds_val == y_val).mean())

Accuracy of training :  0.7661208538425388
Accuracy of validation :  0.7431299314848252
