In [12]:
import sys
print(sys.path)
import re
import os
import csv
import nltk
import sklearn

import numpy as np
import pandas as pd
import tensorflow as tf

from gensim.models import KeyedVectors

from nltk.corpus import stopwords

EMBEDDING_FILE = '/Users/swapnil/work/Kaggle/data/googleNewsEmbeddings/GoogleNews-vectors-negative300.bin'
TRAIN_FILE = '/Users/swapnil/work/Kaggle/data/quoraPairs/train.csv'
EMBEDDING_DIM = 300


['', '/Users/swapnil/anaconda2/lib/python27.zip', '/Users/swapnil/anaconda2/lib/python2.7', '/Users/swapnil/anaconda2/lib/python2.7/plat-darwin', '/Users/swapnil/anaconda2/lib/python2.7/plat-mac', '/Users/swapnil/anaconda2/lib/python2.7/plat-mac/lib-scriptpackages', '/Users/swapnil/anaconda2/lib/python2.7/lib-tk', '/Users/swapnil/anaconda2/lib/python2.7/lib-old', '/Users/swapnil/anaconda2/lib/python2.7/lib-dynload', '/Users/swapnil/anaconda2/lib/python2.7/site-packages', '/Users/swapnil/anaconda2/lib/python2.7/site-packages/Sphinx-1.5.1-py2.7.egg', '/Users/swapnil/anaconda2/lib/python2.7/site-packages/aeosa', '/Users/swapnil/anaconda2/lib/python2.7/site-packages/IPython/extensions', '/Users/swapnil/.ipython']


In [13]:
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
                                            binary=True)

train = pd.read_csv(TRAIN_FILE)

In [14]:
print('Found %s word vectors of word2vec' % len(word2vec.vocab))
print(train['is_duplicate'].value_counts())

Found 3000000 word vectors of word2vec
0    255027
1    149263
Name: is_duplicate, dtype: int64


In [15]:
PRINT_STATUS_ITER = 50000
stops = set(stopwords.words("english"))
print('Test print')
buckets = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

def cleanText(text):
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", str(text))
    return text

class PairScore:
    def __init__(self,ind1,ind2,dist):
        self.ind1 = ind1
        self.ind2 = ind2
        self.dist = dist
    
    def __lt__(self,other):
        return self.dist > other.dist

class Pair:
    def __init__(self,vec1,vec2,ind1,ind2):
        self.vec1 = vec1
        self.vec2 = vec2
        self.ind1 = ind1
        self.ind2 = ind2

class DataInstance:
    def __init__(self,dataId,pairs,label):
        self.dataId = dataId
        self.pairs = pairs
        self.label = label
    
    
def generateClosePairs(embedding1,embedding2,smallQuestion,bigQuestion):
    listPairs = []
    # Creating all possible pairs.
    for i in range(0,embedding1.shape[0]):
        for j in range(0,embedding2.shape[0]):
            p = PairScore(i,j,sklearn.metrics.pairwise
                          .cosine_similarity(embedding1[i].reshape(1,-1),embedding2[j].reshape(1,-1))[0])
            listPairs.append(p)
    
    sortedPairs = sorted(listPairs)
    
    # Creating right pairs by considering each word only once.. 
    smallConsidered = [False for i in range(embedding1.shape[0])]
    bigConsidered = [False for i in range(embedding2.shape[0])]
    finalPairs = []
    for pair in sortedPairs:
        if smallConsidered[pair.ind1] == False and bigConsidered[pair.ind2] == False:
            finalPair = Pair(embedding1[pair.ind1],embedding2[pair.ind2],pair.ind1,pair.ind2)
            finalPairs.append(finalPair)
            smallConsidered[pair.ind1] = True
            bigConsidered[pair.ind2] = True
            
    for i,considered in enumerate(bigConsidered):
        if considered == False:
            finalPair = Pair(np.zeros(EMBEDDING_DIM),embedding2[i],-1,i)
            finalPairs.append(finalPair)
            
    return finalPairs
        
    #for finalPair in finalPairs:
     #   print(smallQuestion[finalPair.ind1] if finalPair.ind1!=-1 else "-1",",",bigQuestion[finalPair.ind2])
    
def processData(data,isTrain):
    processedList = []
    maxSeqLen = 0
    print('Start of processData')
    for ind,row in train.iterrows():
        question1 = cleanText(row['question1'])
        question2 = cleanText(row['question2'])
        label = row['is_duplicate']
        dataId = row['id']
    
        question1Splits = question1.lower().split()
        question2Splits = question2.lower().split()

        # Removing stopwords.
        shortQuestion1 = [w for w in question1Splits if not w in stops]
        shortQuestion2 = [w for w in question2Splits if not w in stops]

        # Deciding short question and long question.
        if len(shortQuestion1) < len(shortQuestion2):
            smallQuestion = shortQuestion1
            bigQuestion = shortQuestion2
        else:
            smallQuestion = shortQuestion2
            bigQuestion = shortQuestion1
        
        smallEmbedding = np.zeros((len(smallQuestion), EMBEDDING_DIM))
        bigEmbedding = np.zeros((len(bigQuestion), EMBEDDING_DIM))
    
        # Getting embedding vectors for each word.
        wordsBelowScore = np.zeros(len(buckets))
    
        for i in range(0,len(smallQuestion)):
            word = smallQuestion[i]
            if word in word2vec.vocab:
                smallEmbedding[i] = word2vec.word_vec(word)
    
        for i in range(0,len(bigQuestion)):
            word = bigQuestion[i]
            if word in word2vec.vocab:
                bigEmbedding[i] = word2vec.word_vec(word)
        
        pairs = generateClosePairs(smallEmbedding,bigEmbedding,smallQuestion,bigQuestion)
        processedList.append(DataInstance(dataId,pairs,label))
        if len(pairs) > maxSeqLen:
            maxSeqLen = len(pairs)
        if ind % PRINT_STATUS_ITER == 0:
            print('processData curr iter ',ind)
            
    return maxSeqLen,processedList

def createDataset(processedList,maxSeqLen):
    print('Start of createDataset')
    dataset = np.zeros((len(processedList),maxSeqLen,EMBEDDING_DIM))
    lenthSet = np.zeros(len(processedList))
    labels = np.zeros(len(processedList))
    dataIds = np.zeros(len(processedList))
    for i,processed in enumerate(processedList):
        pairs = processed.pairs
        j = 0;
        for pair in pairs:
            diffSqr = np.square(np.subtract(pair.vec1,pair.vec2))
            dataset[i,j,:] = diffSqr
            j = j + 1;
        while j < maxSeqLen:
            dataset[i,j,:] = -1 * np.ones(EMBEDDING_DIM)
            j = j + 1
        lenthSet[i] = len(pairs)
        labels[i] = processed.label
        dataIds[i] = processed.dataId
        if i % PRINT_STATUS_ITER == 0:
            print('createDataset curr iter ',i)
    return dataset,lenthSet,labels,dataIds

Test print


In [19]:
#trainMaxSeqLen,trainProcessedList = processData(train,True)    
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/train/trainProcessedList.npy',trainProcessedList)
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/train/trainMaxSeqLen.npy',trainMaxSeqLen)

Start of processData
('processData curr iter ', 0)
('processData curr iter ', 50000)
('processData curr iter ', 100000)
('processData curr iter ', 150000)
('processData curr iter ', 200000)
('processData curr iter ', 250000)
('processData curr iter ', 300000)
('processData curr iter ', 350000)
('processData curr iter ', 400000)


IOError: [Errno 2] No such file or directory: '/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/train/trainProcessedList.npy'

In [21]:
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainProcessedList.npy',trainProcessedList)
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainMaxSeqLen.npy',trainMaxSeqLen)

In [None]:
trainProcessedList = np.load('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainProcessedList.npy')
trainMaxSeqLen = np.load('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainMaxSeqLen.npy')

trainDataset,trainLengthSet,trainLabels,trainDataIds = createDataset(processedList,maxSeqLen)
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainDataset.npy',trainDataset)
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainLengthSet.npy',trainLengthSet)
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainLabels.npy',trainLabels)
np.save('/Users/swapnil/work/Kaggle/data/quoraPairs/processedData/trainDataIds.npy',trainDataIds)

Start of createDataset
('createDataset curr iter ', 0)
('createDataset curr iter ', 50000)


In [7]:
BATCH_SIZE = 100
graph = tf.Graph()
with graph.as_default():
    batchData = tf.placeholder(tf.float32, [None, maxSeqLen, EMBEDDING_DIM])
    batchSeqLength = tf.placeholder(tf.int32,[tf.shape(batchData)[0]])
    
    labels = tf.placeholder(tf.float32,[BATCH_SIZE])
    logisticWeights = tf.Variable(tf.truncated_normal(
      [EMBEDDING_DIM], stddev=0.1))
    
    def model(batchData,seqLength):
        rnnCell = tf.contrib.rnn.BasicLSTMCell(100)
        rnnOutputs, state = tf.nn.dynamic_rnn(rnnCell, batchData, dtype=tf.float32)
        lastIndex = tf.range(BATCH_SIZE)*tf.shape(rnnOutputs)[1] + (seqLength - 1)
        lastOutput = tf.gather(tf.reshape(rnnOutputs, [-1, state_size]), lastIndex)
        bias = tf.placeholder(tf.float32,[1])
        logits = tf.matmul(lastOutput,logisticWeights) + bias
        return logits
    
    logits = model(batchData,batchSeqLength)
    preds = tf.nn.softmax(logits)
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), labels)
    loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))   
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)


[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


NameError: name 'maxSeqLen' is not defined

In [58]:
def length(sequence):
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

def last_relevant(output, length):
    batch_size = tf.shape(output)[0]
    max_length = tf.shape(output)[1]
    out_size = int(output.get_shape()[2])
    index = tf.range(0, batch_size) * max_length + (length - 1)
    flat = tf.reshape(output, [-1, out_size])
    relevant = tf.gather(flat, index)
    return relevant
    #return output
    

graph = tf.Graph()
with graph.as_default():
    out = tf.constant([[[1,2,3,0,0],[1,2,0,0,0],[0,0,0,0,0]],[[1,2,3,0,0],[1,2,0,0,0],[1,2,3,4,5]]])
    outShape = tf.shape(out)
    lengthOuts = length(out)
    lastOuts = last_relevant(out,lengthOuts)
    #s = tf.shape(out)
    
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  lengthVal,outs,out,outShape = session.run([lengthOuts,lastOuts,out,outShape])
  print(lengthVal)
  print(outShape)
  print(out)
  print(outs)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
[3 3]
[2 3 5]
[[[1 2 3 0 0]
  [1 2 0 0 0]
  [1 2 3 4 5]]

 [[1 2 3 0 0]
  [1 2 0 0 0]
  [1 2 3 4 5]]]
[[1 2 3 4 5]
 [1 2 3 4 5]]


In [None]:
graph = tf.Graph()
with graph.as_default():
    a = tf.constant([[5,6,7],[7,8,9]])
    b = tf.Variable(2)
    b.assign(2)
    b = tf.add(a,b)
    c = a*b
    
    
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  aout,bout,cout = session.run([a,b,c])
  print(aout,bout,cout)

In [60]:
l1 = [1,2,3]
print(type(l1))
l2 = l1
l2.append(4)
print(l1)

<type 'list'>
[1, 2, 3, 4]
