# Process the train and test data so that each question is on a single line, then use gensim to generate word embeddings

In [1]:
import pandas as pd
import numpy as np
import os
import sys

In [2]:
from tqdm import tqdm_notebook

In [3]:
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.models.word2vec import LineSentence

In [4]:
from multiprocessing import cpu_count
from nltk.tokenize import TreebankWordTokenizer 

In [5]:
# Dimensionality of the word vectors; passed as `size` to Word2Vec in generate_vectors.
EMBEDDING_DIM = 300

In [6]:
# Bound `tokenize` method of a single TreebankWordTokenizer instance; tokenize_text
# calls it on every (lower-cased) line.
tokenize = TreebankWordTokenizer().tokenize

In [7]:
"""
This platform check exists only because I was running some experiments on EC2 instances.
"""
# Resolve the directory that holds the dataset and all generated artefacts.
# The QQP_DATA_DIR environment variable, when set to a non-empty value, takes
# precedence so the notebook can run on other machines without editing this
# cell. Otherwise the original per-platform defaults are used: one path on
# Linux, another on every other platform.
data_dir = os.environ.get("QQP_DATA_DIR")
if not data_dir:
    if "linux" in sys.platform:
        data_dir = "/home/ubuntu/datasets/quora_question_pairs"
    else:
        data_dir = "/Users/roopal/workspace/kaggle/quora_question_pairs/data"

In [8]:
# data_file receives every question via to_csv; data_file_tokens receives the
# tokenized version of that file written by tokenize_text.
data_file, data_file_tokens = (
    os.path.join(data_dir, basename) for basename in ("qqp_text", "qqp_text_tokens")
)

In [13]:
# Input CSV files, both expected directly under data_dir.
train_file, test_file = (
    os.path.join(data_dir, basename) for basename in ("train.csv", "test.csv")
)

In [14]:
# Output locations used by generate_vectors: the native gensim model, the
# word2vec-format binary vectors, and the accompanying vocabulary file.
w2v_model, w2v_model_bin, w2v_vocab = (
    os.path.join(data_dir, basename)
    for basename in ("w2v.model", "w2v.model.bin", "w2v.model.vocab")
)

In [16]:
# Load the training pairs; index_col=False stops pandas from using the first
# column as the index.
df_train = pd.read_csv(train_file, index_col=False)

In [17]:
# Load the test pairs; index_col=False stops pandas from using the first
# column as the index.
df_test = pd.read_csv(test_file, index_col=False)

In [18]:
# Show which columns each frame provides. The parenthesised single-argument
# form prints the same text under Python 2 and is also valid Python 3.
print(df_train.columns)
print(df_test.columns)

Index([u'id', u'qid1', u'qid2', u'question1', u'question2', u'is_duplicate'], dtype='object')
Index([u'test_id', u'question1', u'question2'], dtype='object')


In [19]:
# Start the combined question corpus with the training set's question1 column.
df_questions = df_train['question1']

In [20]:
# Confirm that selecting a single column yields a pandas Series.
type(df_questions)

pandas.core.series.Series

In [21]:
# Sanity check: number of entries taken from the training set's question1 column.
len(df_questions)

404290

In [22]:
# Add the training set's question2 column to the corpus. pd.concat is used
# instead of Series.append (deprecated, then removed in pandas 2.0); it yields
# the same values and keeps the original index labels.
df_questions = pd.concat([df_questions, df_train["question2"]])

In [23]:
# Sanity check: corpus size after adding the training set's question2 column.
len(df_questions)

808580

In [24]:
# Add the test set's question1 column to the corpus. pd.concat is used instead
# of Series.append (deprecated, then removed in pandas 2.0); it yields the same
# values and keeps the original index labels.
df_questions = pd.concat([df_questions, df_test["question1"]])

In [25]:
# Sanity check: corpus size after adding the test set's question1 column.
len(df_questions)

3154376

In [26]:
# Add the test set's question2 column to the corpus. pd.concat is used instead
# of Series.append (deprecated, then removed in pandas 2.0); it yields the same
# values and keeps the original index labels.
df_questions = pd.concat([df_questions, df_test["question2"]])

In [27]:
# Sanity check: final corpus size after adding the test set's question2 column.
len(df_questions)

5500172

In [28]:
# Write every question to data_file, one CSV record per question, without the
# index. header=False is passed explicitly: older pandas omitted the header for
# Series.to_csv by default, while newer releases default to header=True and
# would otherwise add a spurious first line to the training corpus.
# NOTE(review): to_csv still applies CSV quoting to values containing commas or
# quote characters, so that quoting reaches the tokenizer — confirm this is
# acceptable for the embeddings.
df_questions.to_csv(data_file, index=False, header=False)

In [29]:
def tokenize_text(txt_file, txt_tokens_file, tokenizer=None, show_progress=True):
    """Lower-case and tokenize a text file line by line.

    Each line of ``txt_file`` is lower-cased, split into tokens, and written to
    ``txt_tokens_file`` as the tokens joined by single spaces, followed by a
    newline. One output line is produced per input line.

    The input is streamed twice (once to count lines for the progress bar,
    once to tokenize) instead of being loaded into memory in one go, and it is
    opened before the output so that a missing input does not leave behind a
    truncated output file.

    Args:
        txt_file: Path of the text file to read.
        txt_tokens_file: Path of the file to (over)write with tokenized lines.
        tokenizer: Callable mapping a string to a sequence of string tokens.
            Defaults to the module-level ``tokenize``.
        show_progress: When true (the default), display a ``tqdm_notebook``
            progress bar while processing.

    Raises:
        IOError/OSError: If either file cannot be opened.
    """
    if tokenizer is None:
        tokenizer = tokenize

    with open(txt_file, 'r') as inp:
        pbar = None
        if show_progress:
            # Count lines without keeping them, then rewind for the real pass.
            line_count = sum(1 for _ in inp)
            inp.seek(0)
            pbar = tqdm_notebook(total=line_count, desc="current_line", leave=False)
        try:
            with open(txt_tokens_file, 'w') as out:
                for text in inp:
                    out.write(' '.join(tokenizer(text.lower())) + "\n")
                    if pbar is not None:
                        pbar.update()
        finally:
            # Always release the progress bar, even if tokenizing fails midway.
            if pbar is not None:
                pbar.close()

In [30]:
# Tokenize the combined question file into data_file_tokens (one line of
# space-joined tokens per input line).
tokenize_text(data_file, data_file_tokens)



In [34]:
def generate_vectors(input_filename, output_filename_model, output_filename_model_w2v_bin, vocab_filename,
                     window=5, min_count=5, epochs=6, workers=None):
    """Train a Word2Vec model on a tokenized text file and save it to disk.

    The input is read with ``LineSentence``. The vector dimensionality comes
    from the module-level ``EMBEDDING_DIM``.

    Args:
        input_filename: Path of the tokenized corpus to train on.
        output_filename_model: Path for the model written by ``model.save``.
        output_filename_model_w2v_bin: Path for the binary word2vec-format
            vectors.
        vocab_filename: Path for the vocabulary file written next to the
            binary vectors.
        window: Context window passed to Word2Vec (default 5).
        min_count: Minimum word count passed to Word2Vec (default 5).
        epochs: Number of training iterations, passed as ``iter`` (default 6).
        workers: Number of worker threads; defaults to ``cpu_count()``.

    Returns:
        None. The results are the three files written to disk.
    """
    if workers is None:
        workers = cpu_count()

    sentences = LineSentence(input_filename)

    # Single-argument print(...) produces the same output under Python 2 and
    # is also valid Python 3.
    print("generating models")
    # `size` and `iter` are the keyword names this notebook's gensim version
    # accepts; gensim >= 4.0 renamed them to `vector_size` and `epochs`.
    model = Word2Vec(
        sentences, size=EMBEDDING_DIM, window=window, min_count=min_count, workers=workers, iter=epochs
    )

    print("saving model")
    model.save(output_filename_model)

    print("saving bin model and vocab")
    model.wv.save_word2vec_format(output_filename_model_w2v_bin, vocab_filename, binary=True)

In [35]:
# Train Word2Vec on the tokenized questions and write the model, the binary
# vectors and the vocabulary file to the paths defined earlier.
generate_vectors(data_file_tokens, w2v_model, w2v_model_bin, w2v_vocab)

generating models
saving model
saving bin model and vocab


In [36]:
# Completion marker; the parenthesised form prints identically under Python 2
# and is also valid Python 3.
print("Done...")

Done...


In [37]:
# Reload the model from the path generate_vectors saved it to, so the checks
# below run against what is actually on disk.
model = Word2Vec.load(w2v_model)

In [38]:
# Qualitative check of the embeddings: list the 20 nearest neighbours of
# "india" as (word, similarity) tuples. print(item) with a single argument
# prints the same tuple repr under Python 2 and is also valid Python 3.
most_similar = model.wv.most_similar(u"india", topn=20)
for item in most_similar:
    print(item)

(u'nepal', 0.5698479413986206)
(u'india.', 0.5509123802185059)
(u'indian', 0.543890655040741)
(u'singapore', 0.5358853340148926)
(u'nigeria', 0.5139864683151245)
(u'australia', 0.501850426197052)
(u'europe', 0.4983462393283844)
(u'usa', 0.4983311891555786)
(u'indonesia', 0.49664753675460815)
(u'malaysia', 0.49647265672683716)
(u'bangladesh', 0.49408218264579773)
(u'mumbai', 0.4914305806159973)
(u'america', 0.4870849549770355)
(u'karnataka', 0.48101332783699036)
(u'uk', 0.4760788679122925)
(u'canada', 0.47129982709884644)
(u'delhi', 0.4702357053756714)
(u'bangalore', 0.46803537011146545)
(u'gujarat', 0.4677974283695221)
(u'kolkata', 0.4662250876426697)


In [39]:
# Display the learned embedding vector for a single word.
model.wv[u'hello']

array([ 1.63505113,  1.74928784,  0.33551815,  1.20147967, -1.08627117,
        0.32134393,  0.32199851,  0.21930236,  0.63439894, -0.524297  ,
       -0.07424913,  0.14191453,  0.30413657,  0.34961057, -0.33987811,
        2.41526294, -0.01779459, -0.23598608,  0.21480007,  0.5800305 ,
       -1.12349379, -0.51853657, -0.01496914, -0.1390993 ,  0.27757537,
       -0.09089902, -1.24239457, -1.042804  , -0.30473396,  1.13916755,
        0.22293177,  1.06639969, -0.42549893,  0.55504042, -1.09364414,
       -1.14607882, -0.37396845,  0.64139342,  0.09047037,  1.27149355,
        1.12145448,  0.19973932, -0.52814686, -0.26487184, -0.77216774,
        0.4322072 , -0.01802699,  0.591591  , -0.73351568,  0.70908064,
       -0.07412325,  1.10275376, -0.61836624,  0.96817815, -0.16194807,
        0.34148327, -0.66180146, -1.30326378,  0.50620872,  0.48529968,
       -0.39764825, -0.2398877 , -0.18629874, -0.49037498,  0.93808776,
        0.58165681, -0.41860759, -1.64082241, -0.15130819,  0.59