# Quora question pairs: data preparation

## Import packages

In [8]:
from __future__ import print_function

import numpy as np
import csv, json
from zipfile import ZipFile
import os
from os.path import expanduser, exists

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
import pandas as pd

## Load Data Set

In [10]:
# load dataset
def load_datasets(load_dir = "../data/kaggle_competition/", prefix="clean_kaggle_", post_fix=""):
    """
    Function that loads the dataset
    """
    train_set = pd.read_csv(os.path.join(load_dir, "{0}train{1}.csv".format(prefix,post_fix)), keep_default_na=False)
    validation_set = pd.read_csv(os.path.join(load_dir, "{0}validation{1}.csv".format(prefix,post_fix)), keep_default_na=False)
    test_set = pd.read_csv(os.path.join(load_dir, "{0}test{1}.csv".format(prefix,post_fix)), keep_default_na=False)
    return train_set, validation_set, test_set

def xy_split(df, label_col="is_duplicate"):
    """
    Function that splits a data frame into X and y
    """
    return df.drop(label_col, axis=1), df[label_col]

train_set, validation_set, test_set = load_datasets()

## Initialize global variables

In [36]:
Q1_TRAINING_DATA_FILE = '../data/preprocessed/q1_train.npy'
Q2_TRAINING_DATA_FILE = '../data/preprocessed/q2_train.npy'
LABEL_TRAINING_DATA_FILE = '../data/preprocessed/label_train.npy'
Q1_validate_DATA_FILE = '../data/preprocessed/q1_validate.npy'
Q2_validate_DATA_FILE = '../data/preprocessed/q2_validate.npy'
LABEL_validate_DATA_FILE = '../data/preprocessed/label_validate.npy'
Q1_test_DATA_FILE = '../data/preprocessed/q1_test.npy'
Q2_test_DATA_FILE = '../data/preprocessed/q2_test.npy'

WORD_EMBEDDING_MATRIX_FILE = '../data/preprocessed/word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = '../data/preprocessed/nb_words.json'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300

## Download and extract questions pairs data

In [3]:
if not exists(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE):
    get_file(QUESTION_PAIRS_FILE, QUESTION_PAIRS_FILE_URL)

print("Processing", QUESTION_PAIRS_FILE)

question1 = []
question2 = []
is_duplicate = []
with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
        question1.append(row['text1'])
        question2.append(row['text2'])
        is_duplicate.append(row['duplicate'])

print('Question pairs: %d' % len(question1))

Downloading data from http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv
Processing quora_duplicate_questions.tsv
Question pairs: 404351


## Build tokenized word index

In [28]:

questions = train_set['clean_q1'] + train_set['clean_q2']
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_train = tokenizer.texts_to_sequences(train_set['clean_q1'])
question2_train = tokenizer.texts_to_sequences(train_set['clean_q2'])
question1_validate = tokenizer.texts_to_sequences(validation_set['clean_q1'])
question2_validate = tokenizer.texts_to_sequences(validation_set['clean_q2'])
question1_test = tokenizer.texts_to_sequences(test_set['clean_q1'])
question2_test = tokenizer.texts_to_sequences(test_set['clean_q2'])
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))


Words in index: 84110


In [30]:
question1_validate[:5]

[[4, 13, 5, 218, 475, 63, 1965],
 [2, 9, 15, 9, 7, 30, 32, 15336, 6795],
 [23, 123, 31, 5, 126, 192, 2128],
 [2, 36, 186, 24, 2989, 44624, 147, 272, 530],
 [2, 3, 1, 69, 48, 1000, 12, 195, 327]]

## Download and process GloVe embeddings

In [20]:
embeddings_index = {}
with open('/Users/jingyi/study/ml/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

Word embeddings: 2196016


## Prepare word embedding matrix

In [21]:
num_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((num_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 22751


## Prepare training data tensors

In [35]:
q1_train = pad_sequences(question1_train, maxlen=MAX_SEQUENCE_LENGTH)
q2_train = pad_sequences(question2_train, maxlen=MAX_SEQUENCE_LENGTH)
labels_train = np.array(train_set["is_duplicate"], dtype=int)

q1_validate = pad_sequences(question1_validate, maxlen=MAX_SEQUENCE_LENGTH)
q2_validate = pad_sequences(question2_validate, maxlen=MAX_SEQUENCE_LENGTH)
labels_validate = np.array(validation_set["is_duplicate"], dtype=int)

q1_test = pad_sequences(question1_test, maxlen=MAX_SEQUENCE_LENGTH)
q2_test = pad_sequences(question2_test, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of question1 train tensor:', q1_train.shape)
print('Shape of question2 train tensor:', q2_train.shape)
print('Shape of train label tensor:', labels_train.shape)

print('Shape of question1 valid tensor:', q1_validate.shape)
print('Shape of question2 valid tensor:', q2_validate.shape)
print('Shape of valid label tensor:', labels_validate.shape)

print('Shape of question1 test tensor:', q1_test.shape)
print('Shape of question2 test tensor:', q2_test.shape)

Shape of question1 train tensor: (323432, 25)
Shape of question2 train tensor: (323432, 25)
Shape of train label tensor: (323432,)
Shape of question1 valid tensor: (80858, 25)
Shape of question2 valid tensor: (80858, 25)
Shape of valid label tensor: (80858,)
Shape of question1 test tensor: (2345796, 25)
Shape of question2 test tensor: (2345796, 25)


## Persist training and configuration data to files

In [39]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), q1_train)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), q2_train)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels_train)
np.save(open(Q1_validate_DATA_FILE, 'wb'), q1_validate)
np.save(open(Q2_validate_DATA_FILE, 'wb'), q2_validate)
np.save(open(LABEL_validate_DATA_FILE, 'wb'), labels_validate)
np.save(open(Q1_test_DATA_FILE, 'wb'), q1_test)
np.save(open(Q2_test_DATA_FILE, 'wb'), q2_test)
np.save(open(WORD_EMBEDDING_MATRIX_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': num_words}, f)

In [41]:
q1_train.shape

(323432, 25)