In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.stem.porter import PorterStemmer # word stemming using Porter stemmer algorithm
import re # regex module


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Run `ls` on the input directory and show its decoded output
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='../input/train.csv')
data.head(n=5)

In [None]:
# Preview the last five rows as well
data.tail(n=5)

In [None]:
# Look at a few more rows to make sure everything loaded correctly
data.head(n=10)

In [None]:
# Drop the first three columns (id, qid1, qid2) since they carry no information
id_columns = ['id', 'qid1', 'qid2']
data = data.drop(labels=id_columns, axis='columns')

In [None]:
# Looks good!
# Next, count the missing values in each column
data.isnull().sum(axis=0)

In [None]:
# Oops! Looks like two values are missing in the question2 column.
# Remove every row that contains a missing value.
# The result is rebound instead of using inplace=True, so the statement reads
# as "new frame from old frame" and no object is mutated behind the scenes.
print("Before dropping: ", data.shape)
data = data.dropna(axis=0, how='any')
print("After dropping: ", data.shape)

In [None]:
# Cool!
# Now it's time to do some text preprocessing
# We need to:
#            1. Get rid of all non alphanumeric characters, like: ?, *, ^_^,...
#            2. Exclude stopwords ??? (shall we?)
#            3. Transform each word to its root, like: coolest->cool, houses->house, etc...
#            4. TODO: get rid of all non representative words

def text_preprocessor(text, stemmed=False, stopwords=set()):
    """
    Normalize a piece of text into a space-separated string of tokens.

    Steps, in order:
    - converts the text to lower case
    - deletes every character that is neither a word character nor
      whitespace (punctuation goes; letters, digits and underscores stay)
    - splits on whitespace and discards tokens found in ``stopwords``
    - if ``stemmed`` is true, stems each remaining token with PorterStemmer

    Parameters
    ----------
    text : str
        Raw text to clean.
    stemmed : bool, optional
        Whether to stem each token. Defaults to False.
    stopwords : container of str, optional
        Tokens to drop; membership is tested with ``in``. The default is an
        empty set, which this function never mutates.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    text = text.lower()
    # Raw string: \w and \s must reach the regex engine untouched instead of
    # being parsed as (invalid) string escape sequences.
    text = re.sub(r'[^\w\s]+', '', text)
    tokens = [w for w in text.split() if w not in stopwords]
    if stemmed:
        porter = PorterStemmer()
        stems = []
        for t in tokens:
            try:
                stems.append(porter.stem(t))
            except IndexError:
                # Some nltk versions crash on certain words, e.g.
                # porter.stem('oed'). Keep the unstemmed token rather than
                # silently dropping a word from the text.
                stems.append(t)
        tokens = stems

    return " ".join(tokens)

In [None]:
# Sanity check: some nltk versions raise IndexError when stemming 'oed'.
# The error is caught and displayed so that the notebook still runs from
# top to bottom on those versions.
porter = PorterStemmer()
try:
    oed_stem = porter.stem('oed')
except IndexError as err:
    oed_stem = repr(err)
oed_stem

In [None]:
# But text_preprocessor guards the stemming step, so the same word goes
# through without crashing. stemmed=True is required here: with the default
# stemmed=False the stemmer is never called at all.
text_preprocessor('oed', stemmed=True)

In [None]:
# Looks good!

In [None]:
# Before diving any further, convert the data to NumPy arrays:
# the first two columns become X, the third column becomes y
X = data.iloc[:, :2].values
y = data.iloc[:, 2].values

# Make sure that we didn't break anything
print('X.shape: ', X.shape)
print('y.shape: ', y.shape)

In [None]:
# Time to play with the text_preprocessor function a little bit:
# show the first 10 entries of the first column before and after cleaning
questions = X[:10,0]
for raw_question in questions:
    print(raw_question)
    cleaned = text_preprocessor(raw_question)
    print(cleaned)
    print('----------------------------------------------------------------------------')

In [None]:
# Same 10 questions again, this time with stemming switched on
questions = X[:10,0]
for raw_question in questions:
    print(raw_question)
    cleaned = text_preprocessor(raw_question, stemmed=True)
    print(cleaned)
    print('----------------------------------------------------------------------------')


In [None]:
# One more round: the last 10 entries of the second column, stemmed
questions = X[-10:,1]
for raw_question in questions:
    print(raw_question)
    cleaned = text_preprocessor(raw_question, stemmed=True)
    print(cleaned)
    print('----------------------------------------------------------------------------')

In [None]:
# TODO: 
#      1. Consider to add stopwords, non-representative words
#      2. ????

In [None]:
# Replace both question columns with their cleaned, stemmed versions
for column in ('question1', 'question2'):
    data[column] = data[column].apply(text_preprocessor, args=(True,{}))

In [None]:
# Inspect the last ten rows after preprocessing
data.tail(n=10)

In [None]:
# Save data for later use
# import csv
# file_name = 'final_data.csv'
# data.to_csv(file_name, 
#             header=['question1', 'question2', 'is_duplicate'],
#             index=False, quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# Statistic similarity between two short docs
from sklearn.feature_extraction.text import CountVectorizer

def statistic_sim(doc1, doc2):
    """
    Cosine similarity between the bag-of-words count vectors of two texts.

    Both texts are first cleaned and stemmed with ``text_preprocessor``,
    then vectorized together with ``CountVectorizer`` so that they share
    one vocabulary.

    Parameters
    ----------
    doc1, doc2 : str
        The two raw texts to compare.

    Returns
    -------
    float
        dot(v1, v2) / (||v1|| * ||v2||), or 0.0 when either text yields no
        tokens and the ratio would be undefined.
    """
    doc1 = text_preprocessor(doc1, stemmed=True)
    doc2 = text_preprocessor(doc2, stemmed=True)
    try:
        bag = CountVectorizer().fit_transform([doc1, doc2])
    except ValueError:
        # CountVectorizer raises ValueError when it ends up with an empty
        # vocabulary, i.e. neither text produced a usable token.
        return 0.0
    v1, v2 = bag.toarray()
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # One text has no tokens: avoid dividing by zero and returning NaN.
        return 0.0
    return np.dot(v1, v2) / denominator
    
    
    
    

In [None]:
# Test statistic_sim on the first ten question pairs
pairs = X[:10]
duplicates = y[:10]

# zip walks the question pairs and their labels together; enumerate over the
# 2-tuple (pairs, duplicates) would only yield (0, pairs) and (1, duplicates).
for pair, dup in zip(pairs, duplicates):
    print(pair, " ", dup, " ", statistic_sim(pair[0], pair[1]))
