In [None]:
import h5py
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import csv
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os.path
# English stop-word list from the NLTK corpus reader; sen2vec filters tokens
# against it only when keepStopWords is False.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand -- confirm for a fresh environment.
stopWords = stopwords.words('english')

# Options

In [None]:
# ---- File locations ---------------------------------------------------------
# GloVe vectors as distributed, and the word2vec-format copy derived from them.
gloveFileName = "glove.6B/glove.6B.50d.txt"
word2vecFileName = "glove.6B/glove.6B.50d.word2vec.txt"

# Question-pair data sets.
trainCSVPath = "train.csv"
testCSVPath = "test.csv"

# ---- Pre-processing switches ------------------------------------------------
# True  -> stop words stay in the sentence; False -> they are filtered out.
keepStopWords = True
# True  -> tokenize with nltk.word_tokenize; False -> split on spaces.
useNltkTokenizer = True

# Every character listed here is replaced by a single space before
# tokenization (one entry per character; "," appears twice, which is harmless).
specialCharacters = list(".,'/:;\"*?+[]()<>=,-^")

# ---- Sentence matrix shape --------------------------------------------------
# Number of word vectors every sentence is padded / truncated to.
# Author's open notes, kept verbatim: "PCA IS A SOLUTION", "MultiThreading".
finalLenWordVec = 55
# Word whose embedding fills the padding slots.
padding_word = "-"

# Load Model

In [None]:
# Convert the GloVe text file to word2vec format only once; later runs reuse
# the converted file found on disk.
if not os.path.isfile(word2vecFileName):
    # Single-argument print(...) prints the same text under Python 2 and 3.
    print("converting glove to word2vec model")
    glove2word2vec(gloveFileName, word2vecFileName)

# Load the (text-format) word2vec file into a gensim KeyedVectors object.
model = KeyedVectors.load_word2vec_format(word2vecFileName, binary=False)

# All words known to the embedding model, as a plain list.
# NOTE: membership tests against a list are linear in the vocabulary size;
# prefer `word in model` for per-token lookups.
modelVocab = list(model.wv.vocab)

# Load Data From train CSV

In [None]:
# Column-wise copies of the training CSV; index i of each list belongs to the
# same row. Values are kept as the raw strings read from the file.
question1Lang = []
question2Lang = []
isSimilar = []

with open(trainCSVPath) as trainCSV:
    for row in csv.DictReader(trainCSV, delimiter=','):
        question1Lang.append(row['question1'])
        question2Lang.append(row['question2'])
        isSimilar.append(row['is_duplicate'])

# Sanity check: the three counts must be equal. Formatted into one string so
# the line prints "N N N" identically under Python 2 and Python 3.
print("%d %d %d" % (len(question1Lang), len(question2Lang), len(isSimilar)))

# Helper Functions

In [None]:
def removeSpecialCharacters(sentence):
    """Return ``sentence`` with every configured special character blanked out.

    Each entry of the module-level ``specialCharacters`` list is replaced by a
    single space. "&" is left untouched (a replacement by "and" was
    considered by the author but is disabled).
    """
    cleaned = sentence
    for symbol in specialCharacters:
        cleaned = cleaned.replace(symbol, " ")
    return cleaned

# Embedding of padding_word; padVector prepends copies of it to sentences
# that are shorter than the requested length.
padding = model[padding_word]

def padVector(vector, finalLen, padValue=None):
    """Force a list of word vectors to exactly ``finalLen`` entries.

    Args:
        vector: list of word vectors (must be a ``list``; padding is done by
            list concatenation).
        finalLen: required number of entries in the result.
        padValue: element used for padding. Defaults to the module-level
            ``padding`` vector when omitted.

    Returns:
        * the last ``finalLen`` entries when ``vector`` is too long
          (an empty list for ``finalLen == 0``),
        * ``vector`` left-padded with ``padValue`` when it is too short,
        * ``vector`` itself when the length already matches.
    """
    currentLen = len(vector)

    if finalLen < currentLen:
        # Use an explicit start index: ``vector[-finalLen:]`` would return the
        # whole list for finalLen == 0 because -0 == 0.
        return vector[currentLen - finalLen:]

    if finalLen > currentLen:
        # Resolve the default lazily so the global is only needed when
        # padding actually happens.
        if padValue is None:
            padValue = padding
        return [padValue] * (finalLen - currentLen) + vector

    return vector
        

def sen2vec(sentence):
    """Convert a sentence into a fixed-length list of word vectors.

    Steps: blank out special characters, tokenize (NLTK or whitespace split,
    chosen by ``useNltkTokenizer``), lower-case, optionally drop stop words
    (when ``keepStopWords`` is False), look every token up in ``model`` and
    pad/truncate the result to ``finalLenWordVec`` entries.

    Args:
        sentence: raw sentence text.

    Returns:
        ``(True, vectors)`` on success, where ``vectors`` is a list of
        ``finalLenWordVec`` word vectors; ``(False, [])`` when tokenization
        fails or any token is missing from the embedding vocabulary.
    """
    sentence = removeSpecialCharacters(sentence)

    if useNltkTokenizer:
        try:
            words = word_tokenize(sentence)
        except Exception:
            # Best effort: any tokenizer failure marks the sentence as
            # unusable. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            return False, []
    else:
        # split() without an argument collapses runs of whitespace, so no
        # empty tokens are produced. The previous split(" ") + remove("")
        # dropped only the first empty token, which rejected any sentence
        # containing more than one run of consecutive spaces.
        words = sentence.split()

    words = [w.lower() for w in words]

    if not keepStopWords:
        words = [w for w in words if w not in stopWords]

    vector = []
    for word in words:
        # Ask the KeyedVectors object directly instead of scanning the
        # modelVocab list, which costs a full pass over the vocabulary for
        # every token.
        if word not in model:
            return False, []
        vector.append(model[word])

    return True, padVector(vector, finalLenWordVec)

# Main

In [None]:
## DATA IS NOT NORMALIZED

# Vectorize the first 10000 question pairs and store the pairs for which both
# questions could be fully embedded, together with their duplicate label.
question1Data = []
question2Data = []
label = []
saveFileName = 'train0to10000.h5py'

# Clamp to the number of rows actually loaded so a shorter CSV does not raise
# IndexError.
for index in range(min(10000, len(question1Lang))):
    question1 = question1Lang[index]
    question2 = question2Lang[index]

    q1success, q1vec = sen2vec(question1)
    q2success, q2vec = sen2vec(question2)

    # Keep a pair only when both sentences were converted successfully.
    if q1success and q2success:
        question1Data.append(q1vec)
        question2Data.append(q2vec)
        label.append(int(isSimilar[index]))

question1Data = np.array(question1Data)
question2Data = np.array(question2Data)
label = np.array(label)

# One dataset per question column plus the labels; row i of each dataset
# belongs to the same question pair.
with h5py.File(saveFileName, 'w') as pr:
    pr.create_dataset('question1', data=question1Data)
    pr.create_dataset('question2', data=question2Data)
    pr.create_dataset('label', data=label)