In [1]:
# Import numpy, pandas and matplotlib for loading, transforming and visualizing the data

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from subprocess import check_output
# Print the names of the files found in the ../input directory.
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [2]:
# Load the training set from the input directory.
train = pd.read_csv("../input/train.csv")
# Load the test set from the input directory.
test = pd.read_csv("../input/test.csv")
# Show the first rows of the training set as the cell output.
train.head()

This is how the training data is given. 

In [3]:
test.head()

As you can see above, the test data contains only the question text; unlike the training data, it does not include the question ids. 

In [4]:
train.info()

The training data has 404290 instances. 

In [5]:
test.info()

The test data has 2345796 instances.

In [6]:
# Mean of the `is_duplicate` label column over the whole training set.
train_duplicate_mean = train['is_duplicate'].mean()
print ("mean of train data is_duplicate column",train_duplicate_mean)

Taking the mean of the is_duplicate field of the training data shows that about 37% of the question pairs are labeled as duplicates (is_duplicate = 1). 

In [7]:
# Count the rows for each `is_duplicate` value (counting the `id` column per group).
pt = train.groupby('is_duplicate')['id'].count()
# Draw the per-label counts as a bar chart.
pt.plot.bar()

The plot shows the is_duplicate distribution in the train data. 

In [8]:

# Pool the question ids from both sides of every pair so that each appearance
# of a question is counted, whichever column it occurred in.
question_id_1 = train['qid1'].tolist()
question_id_2 = train['qid2'].tolist()
question_id = pd.Series(question_id_1+question_id_2)
plt.figure(figsize=(15,6))
# value_counts() gives the number of appearances per question id; the
# histogram therefore shows how many questions appear a given number of times.
plt.hist(question_id.value_counts(), bins= 30)
# Log-scale the y axis. The former `nonposy='clip'` keyword was deprecated in
# Matplotlib 3.3 (renamed `nonpositive`) and later removed, so passing it fails
# on current releases. It is omitted here: that works on every Matplotlib
# version, and clipping non-positive values is the current default.
plt.yscale('log')

Plotting the number of questions against the number of occurrences of each question shows that most questions appear only a few times; very few appear often. 

In [9]:
from nltk.corpus import stopwords as st
# English stopword list from nltk, stored as a set; word_dict() tests tokens
# for membership in it.
stopwords_set = set(st.words("english"))

def word_dict(sentence):
    """Return a dict mapping each non-stopword token of `sentence` to 1.

    The sentence is lower-cased and split on whitespace; tokens found in the
    module-level `stopwords_set` are discarded. The result is effectively an
    insertion-ordered set of the remaining tokens.
    """
    tokens = sentence.lower().split()
    return {token: 1 for token in tokens if token not in stopwords_set}
def common_words_percentage(entry):
    """Share of non-stopword tokens that the two questions of `entry` have in common.

    `entry` must support lookup by the keys 'question1' and 'question2'; both
    values are converted with str() before tokenizing via word_dict().

    Returns 0 when either question has no non-stopword tokens, otherwise
    2 * |shared| / (|q1 tokens| + |q2 tokens|).
    """
    q1_tokens = word_dict(str(entry['question1']))
    q2_tokens = word_dict(str(entry['question2']))

    # Nothing to compare if either side is empty after stopword removal.
    if not q1_tokens or not q2_tokens:
        return 0

    shared_count = sum(1 for token in q1_tokens if token in q2_tokens)
    return (2 * shared_count) / (len(q1_tokens) + len(q2_tokens))

In [10]:
def tfidf_weights(entry):
    """Weighted word-share of the two questions in `entry`.

    Both questions are tokenized with word_dict(); each token is weighted by
    its value in the module-level `weights` dict (0 when the token is absent).

    Returns the summed weight of the tokens that occur in BOTH questions
    (counted once per side) divided by the summed weight of all tokens of both
    questions. Returns 0 when either question has no non-stopword tokens or
    when the total weight is zero.
    """
    question_1_words = word_dict(str(entry['question1']))
    question_2_words = word_dict(str(entry['question2']))
    if len(question_1_words) == 0 or len(question_2_words) == 0:
        return 0

    common_wts_1 = [weights.get(w, 0) for w in question_1_words if w in question_2_words]
    # Question-2 tokens must be checked against question 1. The previous
    # self-comparison (`w in question_2_words`) was always true, so every
    # question-2 token was counted as shared and the ratio was inflated.
    common_wts_2 = [weights.get(w, 0) for w in question_2_words if w in question_1_words]
    common_wts = common_wts_1 + common_wts_2
    whole_wts = [weights.get(w, 0) for w in question_1_words] + [weights.get(w, 0) for w in question_2_words]

    total_weight = np.sum(whole_wts)
    # No token of either question has a weight: avoid the 0/0 NaN and return
    # the same "no overlap information" value as the empty-question case.
    if total_weight == 0:
        return 0
    return np.sum(common_wts) / total_weight

In [11]:
# Corpus for the vectorizer: every question1 followed by every question2 of the
# training set, lower-cased and converted to unicode strings.
list_of_questions = (train['question1'].str.lower().astype('U').tolist() + train['question2'].str.lower().astype('U').tolist())

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 50,max_features = 3000000,ngram_range = (1,10))
X = vectorizer.fit_transform(list_of_questions)
idf = vectorizer.idf_
# Map every vocabulary term to its IDF value. `vocabulary_` gives the column
# index of each term, which is the position of that term's entry in `idf_`.
# This builds the same mapping as zipping `get_feature_names()` with `idf_`,
# but does not rely on `get_feature_names()`, which newer scikit-learn
# releases no longer provide.
weights = {term: idf[column] for term, column in vectorizer.vocabulary_.items()}

In [12]:
# Feature frames for the training and test sets, filled column by column.
X_TrainData = pd.DataFrame()
X_TestData = pd.DataFrame()
# The feature functions look rows up by column name (entry['question1']), so
# each row must be passed as a Series. `raw=True` is therefore not used: on
# current pandas it hands the function a bare ndarray, which cannot be indexed
# by column name.
X_TrainData['common_word_percent'] = train.apply(common_words_percentage, axis=1)
X_TrainData['feature_ifidf'] = train.apply(tfidf_weights, axis=1)
# Target labels for training.
Y_TrainData = train['is_duplicate'].values
X_TestData['common_word_percent'] = test.apply(common_words_percentage, axis=1)
X_TestData['feature_ifidf'] = test.apply(tfidf_weights, axis=1)

In [13]:
import nltk
def jaccard_similarity_coefficient(row):
    """Jaccard similarity of the word sets of row['question1'] and row['question2'].

    When both values are `str`, they are lower-cased and split on whitespace.
    Otherwise both values are converted with str() and tokenized with
    nltk.word_tokenize (no lower-casing in that branch).

    Returns |intersection| / |union| of the two word sets, or 0.0 when neither
    question contains any word (e.g. both are empty strings), instead of
    raising ZeroDivisionError.
    """
    if (type(row['question1']) is str) and (type(row['question2']) is str):
        words_1 = row['question1'].lower().split()
        words_2 = row['question2'].lower().split()
    else:
        words_1 = nltk.word_tokenize(str(row['question1']))
        words_2 = nltk.word_tokenize(str(row['question2']))

    set_1, set_2 = set(words_1), set(words_2)
    joint_words = set_1 | set_2
    # Two empty questions have an empty union; define their similarity as 0.
    if not joint_words:
        return 0.0
    return len(set_1 & set_2) / len(joint_words)

In [None]:
# Replace missing values in the training frame with empty strings and rebind
# `train` to the result. `test` is not modified by this cell.
train = train.fillna("")

In [None]:
# jaccard_similarity_coefficient looks rows up by column name, so rows are
# passed as Series. `raw=True` is not used: on current pandas it would pass
# bare ndarrays, which cannot be indexed by column name.
X_TrainData['Jacard_Distance'] = train.apply(jaccard_similarity_coefficient, axis=1)
X_TestData['Jacard_Distance'] = test.apply(jaccard_similarity_coefficient, axis=1)

In [None]:

from sklearn.metrics.pairwise import cosine_similarity as cs
import re, math
from collections import Counter

# Compiled pattern matching runs of word characters; sentence_transform() uses
# it to extract tokens.
WORD = re.compile(r'\w+')
def _cosine_similarity(vector_1, vector_2):
     intersection = set(vector_1.keys()) & set(vector_2.keys())
     numerator = sum([vector_1[x] * vector_2[x] for x in intersection])

     sum1 = sum([vector_1[x]**2 for x in vector_1.keys()])
     sum2 = sum([vector_2[x]**2 for x in vector_2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

def sentence_transform(sentence):
    """Return a Counter of the tokens that the module-level WORD pattern finds in `sentence`."""
    return Counter(WORD.findall(sentence))

def cosine_sim(row):
    """Cosine similarity between the token counts of row['question1'] and row['question2'].

    Both values are converted with str(), turned into token counters by
    sentence_transform(), and compared with _cosine_similarity().
    """
    return _cosine_similarity(
        sentence_transform(str(row['question1'])),
        sentence_transform(str(row['question2'])),
    )

# cosine_sim looks rows up by column name, so rows are passed as Series.
# `raw=True` is not used: on current pandas it would pass bare ndarrays, which
# cannot be indexed by column name.
X_TrainData['cosine_sim'] = train.apply(cosine_sim, axis=1)

In [None]:
# cosine_sim looks rows up by column name, so rows are passed as Series.
# `raw=True` is not used: on current pandas it would pass bare ndarrays, which
# cannot be indexed by column name.
X_TestData['cosine_sim'] = test.apply(cosine_sim, axis=1)

In [None]:

X_TrainData

In [None]:
# `sklearn.cross_validation` no longer exists in current scikit-learn;
# train_test_split lives in `sklearn.model_selection`. The fallback keeps the
# cell working on very old installations that only have the former module.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
# NOTE: X_TrainData / Y_TrainData are rebound to the 80% training part, so
# re-running this cell without rebuilding the features splits the data again.
X_TrainData, X_ValidData, Y_TrainData, Y_ValidData = train_test_split(X_TrainData, Y_TrainData, test_size=0.20, random_state=4242)

In [None]:
import xgboost as xgb

# Wrap the training and validation splits, with their labels, as DMatrix objects.
xg_TrainData = xgb.DMatrix(X_TrainData, label=Y_TrainData)
xg_ValidData = xgb.DMatrix(X_ValidData, label=Y_ValidData)

# Datasets passed to xgb.train for evaluation, each with a display name.
watchlist = [(xg_TrainData, 'train'), (xg_ValidData, 'valid')]

# Train a binary logistic model evaluated with log loss: up to 500 rounds,
# early stopping after 50 rounds, progress printed every 10 rounds.
bst = xgb.train(
    {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'eta': 0.02,
        'max_depth': 5,
    },
    xg_TrainData,
    500,
    watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)

In [None]:
X_TestData.info()

In [None]:
# Wrap the test features as a DMatrix (no labels).
xg_TestData = xgb.DMatrix(X_TestData)
# Rebinds xg_ValidData to a DMatrix built from X_ValidData without labels.
xg_ValidData = xgb.DMatrix(X_ValidData)

# Model predictions for the test and validation features.
Predict_TestData = bst.predict(xg_TestData)
Predict_ValidData = bst.predict(xg_ValidData)



In [None]:
from sklearn.metrics import precision_recall_curve, auc, roc_curve
# ROC curve of the validation predictions against the validation labels.
fpr, tpr, _ = roc_curve(Y_ValidData, Predict_ValidData)
# Area under the ROC curve.
roc_area = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1)
# Cell output: the area rounded to 10 decimals.
np.round(roc_area, 10)

In [None]:
# Precision/recall pairs of the validation predictions against the validation labels.
precison, recall, _ = precision_recall_curve(Y_ValidData, Predict_ValidData)
plt.figure(figsize=(10,5))

# Plot precision (y axis) against recall (x axis).
plt.plot(recall, precison)
plt.xlabel('Recall')
plt.ylabel('Precision')
# Cell output: area under the precision-recall curve.
auc(recall, precison)

In [None]:
# Assemble the output table: the test ids next to the model's predictions
# for the test set.
result = pd.DataFrame()
result['test_id'] = test['test_id']
result['is_duplicate'] = Predict_TestData
# Write the table to result.csv in the current directory, without the index column.
result.to_csv('result.csv', index=False)

In [None]:
Predict_TestData