Quora Question Pairs

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
import random
import string
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Run `ls` on the input directory and echo its (UTF-8 decoded) output.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
#Load and Explore data 

# Read both CSV splits from the input directory.
train_set = pd.read_csv("../input/train.csv")
test_set = pd.read_csv("../input/test.csv")

# Report the (rows, columns) shape of the training split, then the test split.
for frame in (train_set, test_set):
    print(frame.shape)

# Preview the first few training rows.
print(train_set.head())

In [None]:
#Preprocess Text

# Cache for the NLTK stop-word set so the corpus file is read once, not once per question.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess(question, stops=None):
    """
    Turn one raw question into a list of lower-case words without stop words.

    :param question: Raw question text.  Any value that is not a ``str``
        (e.g. ``None`` or the float NaN that pandas uses for a missing CSV
        cell) is treated as an empty question instead of raising.
    :param stops: Optional collection of words to discard.  When omitted,
        the NLTK English stop-word list is used (loaded once and cached).
    :return: List of the remaining words, in their original order.
    """
    # re.sub only accepts strings; missing questions must not abort a full-corpus run.
    if not isinstance(question, str):
        return []

    if stops is None:
        stops = _english_stopwords()

    # Every character outside a-z / A-Z (digits, punctuation, accented letters,
    # ...) becomes a space, so no separate punctuation-stripping pass is needed.
    letters_only = re.sub("[^a-zA-Z]", " ", question)

    # Lower-case, split on whitespace, and drop the stop words.
    tokens = letters_only.lower().split()
    return [w for w in tokens if w not in stops]

In [None]:
#Convert questions to words

# Flat list of every preprocessed word, question after question.
words = []

# Only the first 40 rows are processed here; use train_set.shape[0] for the full set.
for x in range(40):
    # Positional columns 3 and 4 hold the two texts that are tokenised for each row.
    for col in (3, 4):
        words.extend(preprocess(train_set.iloc[x, col]))

In [None]:
#Create Lookup Tables

def create_lookup_tables(words):
    """
    Build the two vocabulary lookup tables for a corpus.

    Ids are assigned by descending frequency: the most common word gets 0.
    Words with equal counts keep the order in which they first appear.

    :param words: Input list of words
    :return: A tuple (vocab_to_int, int_to_vocab): the first dict maps each
             word to its integer id, the second maps each id back to its word.
    """
    # most_common() orders by count, highest first, with a stable tie-break.
    ranked = [token for token, _ in Counter(words).most_common()]

    int_to_vocab = dict(enumerate(ranked))
    vocab_to_int = {token: index for index, token in enumerate(ranked)}

    return vocab_to_int, int_to_vocab

# Build both lookup directions from the corpus, then encode the corpus as integer ids.
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = list(map(vocab_to_int.__getitem__, words))

In [None]:
#Subsampling

# Each occurrence of a word is dropped with probability
# 1 - sqrt(threshold / relative_frequency_of_that_word).
threshold = 1e-5
total_count = len(int_words)
word_counts = Counter(int_words)

# Relative frequency and drop probability for every distinct word id.
freqs = {}
p_drop = {}
for word, count in word_counts.items():
    freqs[word] = count/total_count
    p_drop[word] = 1 - np.sqrt(threshold/freqs[word])

# One uniform draw per occurrence, in corpus order; keep the occurrence when
# the draw falls below its keep probability (1 - p_drop).
train_words = []
for word in int_words:
    if random.random() < (1 - p_drop[word]):
        train_words.append(word)


In [None]:
# Receives a list of words, an index, and a window size, then returns a list of words in the window around the index

def get_target(words, idx, window_size=5):
    """
    Collect the distinct words surrounding position ``idx``.

    A radius is drawn uniformly from 1..window_size (inclusive) on every call;
    the words up to that many positions before and after ``idx`` are gathered,
    excluding the word at ``idx`` itself, and de-duplicated.

    :param words: List of words to take the window from
    :param idx: Position of the centre word
    :param window_size: Largest radius that may be drawn
    :return: List of the unique words found in the window (order not guaranteed)
    """
    radius = np.random.randint(1, window_size+1)

    # Clamp the left edge so a window near the start does not wrap around.
    left = max(idx - radius, 0)
    right = idx + radius + 1

    neighbours = words[left:idx] + words[idx+1:right]
    return list(set(neighbours))

In [None]:
#Get batches for the network

def get_batches(words, batch_size, window_size=5):
    """
    Generate (inputs, targets) training pairs, one batch of words at a time.

    Trailing words that do not fill a whole batch are discarded.  For every
    word in a batch, its context words are looked up with ``get_target``
    (restricted to that batch); the word is repeated once per context word so
    that both returned lists have the same length.

    :param words: List of words to batch
    :param batch_size: Number of words per batch
    :param window_size: Largest window radius passed on to ``get_target``
    :return: Generator yielding (x, y), where x holds the repeated input
             words and y the matching context words
    """
    # Trim to a whole number of batches.
    full_batches = len(words)//batch_size
    words = words[:full_batches*batch_size]

    for offset in range(0, len(words), batch_size):
        batch = words[offset:offset+batch_size]
        x, y = [], []
        for position, centre in enumerate(batch):
            context = get_target(batch, position, window_size)
            y.extend(context)
            # Pair the centre word with each of its context words.
            x.extend([centre]*len(context))
        yield x, y