In [1]:
%matplotlib inline
import pandas as pd
import os
from glob import glob
from subprocess import PIPE, Popen
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

# Quora Question Pairs

In [2]:
# Kagglers are challenged to deduplicate Quora questions in this NLP challenge.
# Historically, the Quora team deduplicates questions with a Random Forest model
# to offer an improved exprience to the user.

## Inspecting the data

In [3]:
DATA_DIR = '../data/'
for fl in sorted(glob(DATA_DIR + '*.csv')):
    p = Popen(['head', '{}'.format(fl)], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    a, b = p.communicate()
    print('#'*30 + ' Head for file: {}'.format(fl))
    for ll in a.decode('utf8').split('\n'):
        print(ll)

############################## Head for file: ../data/sample_submission.csv
test_id,is_duplicate
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1

############################## Head for file: ../data/test.csv
"test_id","question1","question2"
0,"How does the Surface Pro himself 4 compare with iPad Pro?","Why did Microsoft choose core m3 and not core i3 home Surface Pro 4?"
1,"Should I have a hair transplant at age 24? How much would it cost?","How much cost does hair transplant require?"
2,"What but is the best way to send money from China to the US?","What you send money to China?"
3,"Which food not emulsifiers?","What foods fibre?"
4,"How ""aberystwyth"" start reading?","How their can I start reading?"
5,"How are the two wheeler insurance from Bharti Axa insurance?","I admire I am considering of buying insurance from them"
6,"How can I reduce my belly fat through a diet?","How can I reduce my lower belly fat in one month?"
7,"By scrapping the 500 and 1000 rupee notes, how is RBI planning to figh

In [4]:
# Right off the top, we find numbers, product/country names, acronyms, 
# mathematical notation, mispellings, hiragana, and synonyms placed in parentheses.

## Evaluation

In [5]:
# The objective here is to minimize the log loss in the binary classifaction of target is_duplicate:
# https://www.kaggle.com/c/quora-question-pairs#evaluation

## Modeling Approach

In [6]:
# One simple approach to identifying duplicate questions would be to compare instances using bag-of-words
# and the Vector Space model hypothesizing similar documents contain similar term freq. distns.
# Using word embeddings, we can reduce dimensionality and learn more subtle relationships with neural nets

In [7]:
with open(DATA_DIR + 'train.csv', 'r') as infile:
    data = infile.readlines()
data = [x.strip().lower().split('","')[3:] for x in data]
data.pop(0)
data = [x.split() for instance in data for x in instance[:-1]]

In [8]:
sentences = [LabeledSentence(words=x, tags=['SENT_{}'.format(idx)]) for idx, x in enumerate(data)]

In [9]:
sentences

[LabeledSentence(words=['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india?'], tags=['SENT_0']),
 LabeledSentence(words=['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market?'], tags=['SENT_1']),
 LabeledSentence(words=['what', 'is', 'the', 'story', 'of', 'kohinoor', '(koh-i-noor)', 'diamond?'], tags=['SENT_2']),
 LabeledSentence(words=['what', 'would', 'happen', 'if', 'the', 'indian', 'government', 'stole', 'the', 'kohinoor', '(koh-i-noor)', 'diamond', 'back?'], tags=['SENT_3']),
 LabeledSentence(words=['how', 'can', 'i', 'increase', 'the', 'speed', 'of', 'my', 'internet', 'connection', 'while', 'using', 'a', 'vpn?'], tags=['SENT_4']),
 LabeledSentence(words=['how', 'can', 'internet', 'speed', 'be', 'increased', 'by', 'hacking', 'through', 'dns?'], tags=['SENT_5']),
 LabeledSentence(words=['why', 'am', 'i', 'mentally', 'very', 'lonely?', 'how', 'can', 'i', 'solve', 'it?'], tags=['SENT_6']),
 L

In [10]:
model = Doc2Vec(sentences, size=100, window=8, min_count=5, workers=32)

In [11]:
model.wv.most_similar(['what', 'are', 'the', 'questions', 'should', 'not', 'ask', 'on', 'quora?'])

[('quora', 0.5629540681838989),
 ('answers', 0.5180166959762573),
 ('answer', 0.5135594606399536),
 ('question', 0.5121930241584778),
 ('facebook?', 0.5018869638442993),
 ('instagram?', 0.49266260862350464),
 ('comment', 0.48520219326019287),
 ('facebook', 0.48392820358276367),
 ('life?', 0.4787965416908264),
 ('always', 0.478450208902359)]

In [12]:
sent_1 = ['what', 'are', 'the', 'questions', 'should', 'not', 'ask', 'on', 'quora?']
sent_2 = ['which', 'question', 'should', 'i', 'ask', 'on', 'quora?']
model.wv.n_similarity(sent_1, sent_2)

0.85865534922335307

In [13]:
# As a baseline model, we could simply set a threshold for similarity between question pairs.
# Taking this idea a step further, we could use question similarity as a feature
# for a nonlinear model which learns to classify duplicates.

## Model Assumptions
*  Preprocessing/Normalization of marginal impact
*  Duplicates can be identified by measuring semantic similarity

## Further Directions
*  Additional preprocessing: filter stopwords, remove punctuation, pattern matching.
*  Explore other embeddings (Word2Vec)
*  Deep learning model may perform well in this context.
https://www.kaggle.com/c/quora-question-pairs/discussion/34355