In [None]:
##########################################
# word2vec_modelCreation.ipynb
# Purpose: Word2Vec Model for Kaggle's Quora Question Pairs Competition (March 2017 - May 2017)
# Author: Priscilla Li
##########################################

In [1]:
##########################################
# Load Required Python Libraries
##########################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import re
import nltk
# nltk.download('punkt')
import chardet
import itertools
import logging
from gensim.models import word2vec
##########################################

In [2]:
##########################################
# Loads in Quora Dataset
##########################################
#Training Dataset
data = pd.read_csv('train.csv')
##########################################

In [3]:
##########################################
# Tokenizing Functions used to create Word2Vec Model
##########################################
#Tokenizer for sentence splitting, loaded from the
#'tokenizers/punkt/english.pickle' resource via nltk.data.load
#NOTE(review): presumably requires the NLTK 'punkt' data to be installed locally - confirm
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

#Function to prep question1 and question2 for word2vec model
#Word2vec expects a list of lists as input (single sentences each as a list of words)
def question_to_wordlist(text, remove_stopwords = False):
    """Convert a piece of text into a list of lowercase alphabetic tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased and then split on whitespace.

    Args:
        text: String to tokenize.
        remove_stopwords: When true, tokens present in NLTK's English
            stopword list are dropped from the result.

    Returns:
        List of lowercase word strings (possibly empty).
    """
    #Digits, punctuation and non-ASCII characters all become separators
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    #Set membership keeps the stopword filter cheap per token
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

def question_to_sentences(text, tokenizer, remove_stopwords = False):
    """Split text into sentences and convert each one into a word list.

    Args:
        text: String to process; surrounding whitespace is stripped first.
        tokenizer: Object whose ``tokenize(str)`` method returns the
            sentence pieces of the text.
        remove_stopwords: Forwarded to ``question_to_wordlist``.

    Returns:
        List of word lists, one per non-empty sentence piece.
    """
    raw_sentences = tokenizer.tokenize(text.strip())
    #Empty pieces are skipped; each remaining one becomes a list of words
    return [
        question_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]
##########################################

In [4]:
##########################################
# Data Preprocessing prior to Word2Vec Model Creation
##########################################
#Prep data for word2vec
def _collect_sentences(questions, tokenizer):
    """Convert every non-null question in ``questions`` into word2vec sentences.

    Args:
        questions: Iterable of question values (e.g. a DataFrame column);
            null entries (NaN/None) are skipped.
        tokenizer: Sentence tokenizer passed through to question_to_sentences.

    Returns:
        List of word lists gathered from all questions, in order.
    """
    collected = []
    for question in tqdm(questions):
        #Skip missing values (NaN/None)
        if pd.isnull(question):
            continue
        try:
            collected += question_to_sentences(question, tokenizer)
        except Exception:
            #Fallback: detect the encoding with chardet, decode, and retry.
            #`encoding` is reset for every question so the message below can
            #never hit an unbound name or report a stale value from a
            #previous row.
            encoding = None
            try:
                encoding = chardet.detect(question)['encoding']
                collected += question_to_sentences(question.decode(encoding), tokenizer)
            except Exception:
                #Best effort: report the detected encoding (None if detection
                #itself failed) and move on to the next question.
                #`except Exception` (not a bare except) keeps Ctrl-C working.
                print(encoding)
    return collected

sentences = []

print("Parsing sentences from training set...")
#Converting question1 and question2 to sentences for word2vec model
for column in ('question1', 'question2'):
    sentences += _collect_sentences(data[column], tokenizer)
##########################################

  0%|          | 0/404290 [00:00<?, ?it/s]  0%|          | 179/404290 [00:00<03:48, 1770.57it/s]

Parsing sentences from training set...


100%|██████████| 404290/404290 [01:26<00:00, 4659.25it/s]%|          | 1439/404290 [00:00<02:25, 2778.23it/s]
100%|██████████| 404290/404290 [01:30<00:00, 4455.55it/s]


In [5]:
##########################################
# Training Word2Vec Model
##########################################
#Show timestamped INFO-level log messages while the word2vec model is being trained
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

#Hyperparameters for the word2vec model
num_features = 300    #Word vector dimensionality
min_word_count = 10   #Minimum word count
num_workers = 4       #Number of threads to run in parallel
context = 5           #Context window size
downsampling = 1e-3   #Downsample setting for frequent words

#Train the word2vec model on the parsed sentences
print("Training word2vec model...")
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)
##########################################

2017-05-16 00:02:24,966 : INFO : collecting all words and their counts
2017-05-16 00:02:24,968 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-16 00:02:25,021 : INFO : PROGRESS: at sentence #10000, processed 99111 words, keeping 10771 word types
2017-05-16 00:02:25,071 : INFO : PROGRESS: at sentence #20000, processed 198901 words, keeping 15372 word types
2017-05-16 00:02:25,105 : INFO : PROGRESS: at sentence #30000, processed 297847 words, keeping 18786 word types
2017-05-16 00:02:25,147 : INFO : PROGRESS: at sentence #40000, processed 396378 words, keeping 21570 word types


Training word2vec model...


2017-05-16 00:02:25,192 : INFO : PROGRESS: at sentence #50000, processed 495897 words, keeping 23996 word types
2017-05-16 00:02:25,236 : INFO : PROGRESS: at sentence #60000, processed 595251 words, keeping 26209 word types
2017-05-16 00:02:25,279 : INFO : PROGRESS: at sentence #70000, processed 693633 words, keeping 28128 word types
2017-05-16 00:02:25,322 : INFO : PROGRESS: at sentence #80000, processed 793440 words, keeping 29888 word types
2017-05-16 00:02:25,363 : INFO : PROGRESS: at sentence #90000, processed 892903 words, keeping 31430 word types
2017-05-16 00:02:25,403 : INFO : PROGRESS: at sentence #100000, processed 992033 words, keeping 32934 word types
2017-05-16 00:02:25,451 : INFO : PROGRESS: at sentence #110000, processed 1091774 words, keeping 34379 word types
2017-05-16 00:02:25,492 : INFO : PROGRESS: at sentence #120000, processed 1190774 words, keeping 35739 word types
2017-05-16 00:02:25,535 : INFO : PROGRESS: at sentence #130000, processed 1289539 words, keeping 36

In [6]:
##########################################
# Saving Trained Word2Vec Model
##########################################
#Save the model (Can still be trained on further)
#The file name is derived from the training hyperparameters so it always
#matches the settings the model was actually trained with
#(currently "300features_10minwords_5context_inprogress")
model_name = "%dfeatures_%dminwords_%dcontext_inprogress" % (
    num_features, min_word_count, context)
model.save(model_name)

#If not training the model any further call init_sims to make the model memory-efficient
# model.init_sims(replace = True)
# model_name = "300features_10minwords_5context"
# model.save(model_name)
##########################################

2017-05-16 00:05:22,470 : INFO : saving Word2Vec object under 300features_10minwords_5context_inprogress, separately None
2017-05-16 00:05:22,474 : INFO : not storing attribute syn0norm
2017-05-16 00:05:22,476 : INFO : not storing attribute cum_table
2017-05-16 00:05:24,354 : INFO : saved 300features_10minwords_5context_inprogress
