In [1]:
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seaborn's current color palette; presumably used by plotting cells not visible here.
pal = sns.color_palette()

In [2]:
# Folder holding the CSV inputs, relative to the notebook's working directory.
input_folder = "input/"

# Load the training and test tables into DataFrames.
df_train = pd.read_csv(os.path.join(input_folder, 'train.csv'))
df_test = pd.read_csv(os.path.join(input_folder, 'test.csv'))

# Display the first rows of the training table as the cell output.
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


id: A simple row ID for each question pair.
qid{1, 2}: The unique ID of each question in the pair
question{1, 2}: The actual textual contents of the questions.
is_duplicate: The label that we are trying to predict - whether the two questions are duplicates of each other.

In [4]:
# TFIDF
from collections import Counter
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words from NLTK, held as a set so they can be used in set
# difference/intersection operations (see word_shares).
# Requires the NLTK "stopwords" corpus to be available locally.
stops = set(stopwords.words("english"))

def get_weight(count, eps=5000.0, min_count=2.0):
    """Turn a word's occurrence count into a weight.

    Words seen fewer than ``min_count`` times (with the default, words that
    appear only once - likely typos) are ignored and get weight 0.0.
    Every other word gets ``1 / (count + eps)``; ``eps`` is a smoothing
    constant that makes the effect of extremely rare words smaller.
    """
    # Guard clause: words that are too rare contribute nothing.
    if count < min_count:
        return 0.0
    smoothed_count = count + eps
    return 1.0 / smoothed_count
# Empty placeholder for the word -> weight mapping; it is rebuilt from the
# training-corpus word counts in a later cell.
weights = dict()

def word_shares(row, wei, stop):
    """Compute word-overlap features for one question pair.

    Each question is converted with ``str``, lower-cased and split on
    whitespace into a set of distinct tokens. Tokens contained in ``stop``
    count as stop words, all remaining tokens as content words.

    Parameters
    ----------
    row : mapping
        Must support ``row['question1']`` and ``row['question2']``.
    wei : dict
        Token -> weight mapping; tokens missing from it weigh 0.
    stop : set
        Stop-word tokens.

    Returns
    -------
    tuple of five numbers ``(R1, R2, n_shared, R31, R32)``
        R1       - summed weight of the shared content words divided by the
                   summed weight of the content words of both questions
                   (0.0 when that total weight is 0).
        R2       - number of shared content words divided by the combined
                   number of content words of both questions.
        n_shared - number of shared content words, as a float.
        R31, R32 - number of distinct stop words divided by the number of
                   distinct content words, for question 1 and question 2.

        When either question has no content words the result is
        ``(0.0, 0.0, 0.0, 0.0, 0.0)``. This is always a tuple, so callers can
        index the result positionally regardless of which path produced it.
    """
    q1 = set(str(row['question1']).lower().split())
    q2 = set(str(row['question2']).lower().split())
    q1words = q1.difference(stop)
    q2words = q2.difference(stop)

    # Nothing to compare: return zeros with the same shape as the normal
    # result (a string here would break positional indexing downstream).
    if len(q1words) == 0 or len(q2words) == 0:
        return (0.0, 0.0, 0.0, 0.0, 0.0)

    q1stops = q1.intersection(stop)
    q2stops = q2.intersection(stop)

    shared_words = q1words.intersection(q2words)
    shared_weights = [wei.get(w, 0) for w in shared_words]
    total_weights = [wei.get(w, 0) for w in q1words] + [wei.get(w, 0) for w in q2words]

    # tfidf share; avoid 0/0 (NaN + RuntimeWarning) when every word weighs 0
    total_weight = np.sum(total_weights)
    if total_weight != 0:
        R1 = np.sum(shared_weights) / total_weight
    else:
        R1 = 0.0
    # count share
    R2 = float(len(shared_words)) / (float(len(q1words)) + float(len(q2words)))
    # stops in q1 (relative to its content words)
    R31 = float(len(q1stops)) / float(len(q1words))
    # stops in q2 (relative to its content words)
    R32 = float(len(q2stops)) / float(len(q2words))
    return R1, R2, float(len(shared_words)), R31, R32

In [5]:
def _joined_pair_text(frame):
    """Return question1 + " " + question2 for every row, as an array of str."""
    # NOTE(review): if either question is missing (NaN) the concatenation is
    # NaN, which astype(str) turns into the string 'nan' - confirm this is
    # acceptable for the consumers of train_mix / test_mix.
    joined = frame['question1'] + " " + frame['question2']
    return joined.astype(str).values


train_mix = _joined_pair_text(df_train)
test_mix = _joined_pair_text(df_test)

# Every training question (all question1 values followed by all question2
# values) as one string Series.
all_train_questions = df_train['question1'].tolist() + df_train['question2'].tolist()
train_qs = pd.Series(all_train_questions).astype(str)

# Lower-cased, whitespace-split tokens of the whole training corpus.
corpus_text = " ".join(train_qs)
words = corpus_text.lower().split()

# Occurrence count per token, then the corresponding weight per token.
counts = Counter(words)
weights = dict((word, get_weight(count)) for word, count in counts.items())

In [16]:
# Compute the word-share result for every question pair in one pass and store
# it in the 'tfidf' column.
# `raw=True` is intentionally not passed: with raw=True current pandas hands
# each row to the function as a bare ndarray, so the label-based lookups used
# here (row['question1'], row['tfidf']) would fail.
df_train['tfidf'] = df_train.apply(word_shares, args=(weights, stops), axis=1)

# Accessors that pull one positional element out of a row's 'tfidf' entry.
def tfidf_share(row):
    """Return element 0 of row['tfidf']."""
    return row['tfidf'][0]
def count_share(row):
    """Return element 1 of row['tfidf']."""
    return row['tfidf'][1]
def len_count_share(row):
    """Return element 2 of row['tfidf']."""
    return row['tfidf'][2]
def stops_Q1(row):
    """Return element 3 of row['tfidf']."""
    return row['tfidf'][3]
def stops_Q2(row):
    """Return element 4 of row['tfidf']."""
    return row['tfidf'][4]

# Expand the 'tfidf' entries into one column per feature.
df_train['tfidf_share'] = df_train.apply(tfidf_share, axis=1)
df_train['count_share'] = df_train.apply(count_share, axis=1)
df_train['len_count_share'] = df_train.apply(len_count_share, axis=1)
df_train['stops_Q1'] = df_train.apply(stops_Q1, axis=1)
df_train['stops_Q2'] = df_train.apply(stops_Q2, axis=1)

# Show the frame without the intermediate 'tfidf' column. The `columns=`
# keyword is used because pandas 2.x no longer accepts the axis positionally.
# NOTE(review): the result is not assigned, so df_train itself still contains
# 'tfidf' after this cell - assign it back if the column should really go.
df_train.drop(columns='tfidf')

In [20]:
# Preview the first rows of df_train, including the feature columns added to it.
df_train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,tfidf,tfidf_share,count_share,len_count_share,stops_Q1,stops_Q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"(0.393242535477, 0.36363636363636365, 4.0, 1.0...",0.393243,0.363636,4,1.0,1.2
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"(0.194182090666, 0.15384615384615385, 2.0, 1.0...",0.194182,0.153846,2,1.0,0.333333
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"(0.170461289322, 0.18181818181818182, 2.0, 1.3...",0.170461,0.181818,2,1.33333,1.0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"(0.0, 0.0, 0.0, 1.5, 0.8)",0.0,0.0,0,1.5,0.8
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"(0.0, 0.0, 0.0, 0.3, 0.4)",0.0,0.0,0,0.3,0.4
