In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) prints the names of the
# entries found directly inside that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
#
# Load the train and test CSVs into dataframes.
# (When running locally, read the training set from 'data/train.csv' instead.)
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')
#
# Overview of the training set: size, column dtypes and null counts per column.
# Only the training set is summarised here; df_test is loaded but not inspected.
n_pairs, n_columns = df_train.shape
print("there are {0} question pairs in training and {1} columns".format(n_pairs, n_columns))
print("-------- TRAIN DATA TYPES --------")
print(df_train.dtypes)
print("-------- TRAIN MISSING VALUES --------")
missing_per_column = df_train.isnull().sum()
print(missing_per_column)

In [None]:
#
# There are a small number of missing values. Before dropping them, check whether the
# missing question 1 text could be recovered from another row that uses the same qid1.
missing_q1_rows = df_train.loc[df_train['question1'].isnull()]
missing_q2_rows = df_train.loc[df_train['question2'].isnull()]
missing_q1_qids = missing_q1_rows['qid1'].tolist()
missing_q2_qids = missing_q2_rows['qid2'].tolist()
# Only the first missing id of each column is inspected. Fall back to None when nothing
# is missing (e.g. when this cell is re-run after the dropna below) instead of raising
# IndexError on an empty list.
missingq1_qid1 = missing_q1_qids[0] if missing_q1_qids else None
missingq2_qid2 = missing_q2_qids[0] if missing_q2_qids else None
print("id for missing q1: {0}, id for missing q2: {1}".format(missingq1_qid1, missingq2_qid2))
#
# Look for other rows that carry the same qid1 and still have their question text.
if missingq1_qid1 is None:
    replacement_rows = df_train.iloc[0:0]
else:
    missing_q1_row_id = missing_q1_rows['id'].tolist()[0]
    replacement_rows = df_train.loc[
        (df_train['qid1'] == missingq1_qid1)
        & (df_train['id'] != missing_q1_row_id)
        # a row whose own question1 is null cannot serve as a replacement
        & df_train['question1'].notnull()
    ]
if replacement_rows.empty:
    print("no question to replace missing q1")
else:
    # print the plain list of candidate row ids rather than a full Series repr
    print("replace missing q1 with question id: {}".format(replacement_rows['id'].tolist()))
#
# Drop every row that has a missing value in any column and renumber the index.
df_train = df_train.dropna().reset_index(drop=True)
print("-------- TRAIN MISSING VALUES --------")
print(df_train.isnull().sum())
print(df_train.shape)

In [None]:
#
# is_duplicate is the binary target, so its mean is the share of duplicate pairs
# in the dataset.
print("percentage of duplicate questions in dataset: {0}%".format(df_train['is_duplicate'].mean()*100))
# Describe the question ids as strings so that describe() reports
# count / unique / top / freq. Each summary is computed once and reused below.
qid1_as_str = df_train.qid1.astype(str)
qid2_as_str = df_train.qid2.astype(str)
qid1_summary = qid1_as_str.describe()
qid2_summary = qid2_as_str.describe()
print("-------- COUNT, UNIQUE, TOP AND FREQUENCY FOR QUESTIONS 1 --------")
print(qid1_summary)
print("-------- COUNT, UNIQUE, TOP AND FREQUENCY FOR QUESTIONS 2 --------")
print(qid2_summary)
# Take the most frequent ids from the summaries printed above instead of hard-coding
# them, so the text shown always matches the reported 'top' value.
top_question1 = df_train.loc[qid1_as_str == qid1_summary['top'], 'question1'].iloc[0]
top_question2 = df_train.loc[qid2_as_str == qid2_summary['top'], 'question2'].iloc[0]
print("top question 1 in dataset: \'{0}\'".format(top_question1))
print("top question 2 in dataset: \'{0}\'".format(top_question2))
print("-------- PERCENTAGE OF UNIQUE QUESTIONS --------")
print("Q1: {}%".format(qid1_summary['unique']/qid1_summary['count']*100))
print("Q2: {}%".format(qid2_summary['unique']/qid2_summary['count']*100))

It seems that by categorizing these questions we can gain useful knowledge at the question level prior to detecting whether they are duplicates or not. As printed in the previous snippet, the top questions for both 1 and 2 belong to specific categories and are very different from each other: the first one is essentially in a "language" category, while the second one is in a "social media" category.

In [None]:
#
# Target statistics for rows whose question id already appeared in an earlier row
# (duplicated() with its default settings flags every occurrence after the first):
#   1. rows with a repeated qid1
#   2. rows with a repeated qid2
#   3. rows where both qid1 and qid2 are repeated
repeated_qid1 = df_train.duplicated('qid1')
repeated_qid2 = df_train.duplicated('qid2')
for repeated_mask in (repeated_qid1, repeated_qid2, repeated_qid1 & repeated_qid2):
    print(df_train.loc[repeated_mask].is_duplicate.describe())

In [None]:
#
# Target statistics for rows that are NOT flagged by duplicated(), i.e. the rows
# where a question id is seen for the first time:
#   1. first occurrence of qid1
#   2. first occurrence of qid2
#   3. first occurrence of both qid1 and qid2
first_seen_qid1 = ~df_train.duplicated('qid1')
first_seen_qid2 = ~df_train.duplicated('qid2')
first_seen_both = first_seen_qid1 & first_seen_qid2
print(df_train.loc[first_seen_qid1].is_duplicate.describe())
print(df_train.loc[first_seen_qid2].is_duplicate.describe())
print(df_train.loc[first_seen_both].is_duplicate.describe())

In [None]:
#
# Number of tokens per question (without pre-processing).
import spacy
nlp = spacy.load('en')


def count_tokens(text):
    """Return how many tokens spaCy produces for `text` with parser, tagger and NER disabled."""
    return len(nlp(text, disable=['parser', 'tagger', 'ner']))


#
# Store the token count of question1 and question2 in their own length columns.
length_column_sources = {
    'length_question1': 'question1',
    'length_question2': 'question2',
}
for length_column, question_column in length_column_sources.items():
    df_train[length_column] = df_train[question_column].apply(count_tokens)

In [None]:
#
# Visualizing the distribution of variables in dataset
%matplotlib inline

import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(color_codes=True)

np.random.seed(sum(map(ord, "distributions")))

fig, ax = plt.subplots(1,2,figsize=(30,15))
# frequency density on y axis
sns.distplot(ax=ax[0], bins=50, a=df_train['length_question1'])
sns.distplot(ax=ax[1], bins=50, a=df_train['length_question2'])
print("question 1 length mean: {0} and median: {1}".format(df_train['length_question1'].mean(), df_train['length_question1'].median()))
print("question 2 length mean: {0} and median: {1}".format(df_train['length_question2'].mean(), df_train['length_question2'].median()))
print("maximum length for question 1: {0}, question: \n{2}\n\nmaximum length for question 2: {1}, question: \n{3}".format(df_train['length_question1'].max(), df_train['length_question2'].max(), df_train.loc[df_train['length_question1'] == 146].question1.item(), df_train.loc[df_train['length_question2'] == 271].question2.iloc[0]))

The univariate distributions of both question lengths are right-skewed: the mean is pulled towards the right while the median stays closer to the peak (both sets have the same median). The question lengths of the second set are distributed similarly to the question lengths of the first set. For question 2, more than one entry shares the same maximum length, and none of the questions with the highest lengths are marked as duplicates in the training dataset.

In [None]:
#
# Joint plot with one question length on each axis, to see how the two relate.
# NOTE(review): newer seaborn releases renamed `size` to `height` - confirm against the
# seaborn version installed in this environment before changing it.
length_columns = ['length_question1', 'length_question2']
sns.jointplot(x="length_question1", y="length_question2", data=df_train[length_columns], size=20, ratio=5);

Without removing stopwords or pre-processing the text, I want to see how the lengths of both questions relate to the target variable (is_duplicate).

In [None]:
#
# Scatter plot of both question lengths, colored by the is_duplicate target.
# No regression line is fitted (fit_reg=False).
scatter_columns = ['length_question1', 'length_question2', 'is_duplicate']
sns.lmplot(
    data=df_train[scatter_columns],
    x="length_question1",
    y="length_question2",
    hue='is_duplicate',
    fit_reg=False,
    legend=True,
    size=15,
)
#
# Put the legend in the lower-right corner of the plot.
plt.legend(loc='lower right')

Most duplicate questions appear where both questions have a similar length; on the other hand, a substantial difference in length can indicate different questions. It would be a good exercise to plot this difference.

In [None]:
#
# Absolute difference between the token lengths of question 1 and question 2.
length_gap = df_train['length_question1'] - df_train['length_question2']
df_train['length_difference_q12'] = length_gap.abs()
#
# Logistic regression plot of the target against the absolute length difference
# (is_duplicate is jittered vertically so the 0/1 points do not all overlap).
plt.figure(figsize=(20,10))
sns.regplot(x="length_difference_q12", y="is_duplicate", data=df_train, logistic=True, n_boot=500, y_jitter=.03)
#
# Correlation matrix of the length features and the target (displayed as the cell output).
correlation_columns = ['length_question1', 'length_question2', 'length_difference_q12', 'is_duplicate']
df_train[correlation_columns].corr()

I can see that duplicate questions tend to appear when the difference between their lengths is less than ~40, but non-duplicate questions also occur under this criterion, in a smaller proportion, so it is not a strict rule. This can also happen because most of the questions tend to have similar lengths; in the previous plots I can also see that only a few cases have substantial differences in length.
From previous results and final correlation, i conclude that:
* The Pearson correlation between the lengths of question 1 and question 2 is about 0.462468, which indicates a moderate positive relationship between the two variables.
* The Pearson correlation between the difference in lengths and is_duplicate is about -0.206474. The negative value indicates that as the difference increases, the target value decreases. This makes sense, as a small difference in length can indicate similar questions (1 for is_duplicate), while substantial differences in length tend to indicate different questions (0 for is_duplicate).

Now that I have a baseline showing that the question lengths on both sides are a good indicator of a duplicate question, I can focus on the rest of the features related to natural language processing. For the first task I will find relevant words and n-grams, and look at how they interact between questions and with the target value.

In [None]:
# Preview the first rows of the training dataframe (displayed as the cell output).
df_train.head()

In [None]:
from wordcloud import WordCloud
#
# Create one corpus string per question column by joining every question with a space.
question1_corpus = " ".join(df_train['question1'].tolist())
question2_corpus = " ".join(df_train['question2'].tolist())
#
# Generate a word cloud for each corpus. cloud_2 is built here as well, so it is
# ready to be plotted separately.
cloud_1 = WordCloud(width=1920, height=1080, background_color="white", mode="RGB").generate(question1_corpus)
cloud_2 = WordCloud(width=1920, height=1080, background_color="white", mode="RGB").generate(question2_corpus)
# Title font settings.
font = {'weight': 'bold', 'size': 28}
plt.figure(figsize=(20, 15))
# Title typo fixed ("WordCoud" -> "WordCloud").
plt.title("WordCloud for Question 1", loc="center", fontdict=font)
plt.imshow(cloud_1)
plt.axis("off")

In [None]:
# Word cloud for question 2 (uses the existing `cloud_2` and `font` objects).
plt.figure(figsize=(20, 15))
# Title typo fixed ("WordCoud" -> "WordCloud").
plt.title("WordCloud for Question 2", loc="center", fontdict=font)
plt.imshow(cloud_2)
plt.axis("off")

Both sets of questions seem to be similar in how often words are used, so I will plot both into one WordCloud.

In [None]:
# Merge both corpora into one string. Join them with a space: plain concatenation
# would glue the last word of the question 1 corpus to the first word of the
# question 2 corpus, producing one bogus token.
questions_corpus = " ".join([question1_corpus, question2_corpus])
# "Naive" tokens: the corpus is simply split on single spaces.
print("Total naive tokens in corpus {}".format(len(questions_corpus.split(" "))))
cloud = WordCloud(width=1920, height=1080, background_color="white", mode="RGB").generate(questions_corpus)
# Title font settings.
font = {'weight': 'bold', 'size': 28}
plt.figure(figsize=(20, 15))
# Title typo fixed ("WordCoud" -> "WordCloud").
plt.title("WordCloud for Question Corpus", loc="center", fontdict=font)
plt.imshow(cloud)
plt.axis("off")

I would also like to see how these terms interact with each other, so I will use NLTK to list the most frequent n-grams.

In [None]:
import re
# import contractions
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
#
# Clean the joined corpus before counting n-grams: strip apostrophes, then turn every
# remaining non-alphanumeric character (question marks included) into a space so that
# punctuation is not counted inside the n-grams.
# Contraction expansion is left disabled:
# questions_corpus = contractions.fix(questions_corpus)
apostrophes_removed = re.sub(r"'", '', questions_corpus)
questions_corpus = re.sub(r'[^A-Za-z0-9]', ' ', apostrophes_removed)
# Word-level tokens of the cleaned corpus.
token = nltk.word_tokenize(questions_corpus)
# Count every 3-, 4- and 5-word window over the token list.
# NOTE(review): the corpus is one long joined string, so some windows will span the end
# of one question and the start of the next.
trigram_counter, fourgram_counter, fivegram_counter = (
    Counter(ngrams(token, window)) for window in (3, 4, 5)
)

In [None]:
# Ten most common n-grams for each window size.
tgc_t10 = trigram_counter.most_common(10)
fgc_t10 = fourgram_counter.most_common(10)
ftgc_10 = fivegram_counter.most_common(10)
# Print each list under its own header, one "combination, frequency" line per n-gram.
ngram_reports = (
    ("------ TRIGRAM ------", tgc_t10),
    ("------ FOURGRAM ------", fgc_t10),
    ("------ FIVEGRAM ------", ftgc_10),
)
for header, top_ngrams in ngram_reports:
    print(header)
    for words, frequency in top_ngrams:
        print("Word Combination {0}, frequency: {1}".format(words, frequency))