In [1]:
import time
import csv
import collections
import string
import pandas as pd
import numpy as np
import nltk
from sklearn import preprocessing, model_selection  
import scipy
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
nltk.download('punkt')

Using TensorFlow backend.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
#load training data

def load_data():
    """Read ``train.csv`` and tokenize every question pair.

    Rows containing any missing value are dropped before processing.

    Returns
    -------
    tuple
        ``(pairs, labels)`` where ``pairs`` is a list of ``[tokens1, tokens2]``
        (lower-cased, alphanumeric-only tokens of ``question1`` and
        ``question2``) and ``labels`` is the ``is_duplicate`` column as a list.
    """
    print("loading training data...")
    frame = pd.read_csv("train.csv", low_memory=False).dropna()

    def clean(text):
        # Keep only alphanumeric tokens, lower-cased.
        return [tok.lower() for tok in nltk.word_tokenize(text) if tok.isalnum()]

    pairs = [
        [clean(first), clean(second)]
        for first, second in zip(frame["question1"], frame["question2"])
    ]

    print("Done!")
    return pairs, list(frame["is_duplicate"])

In [3]:
# Tokenize every training question pair and collect the matching labels.
x_tokens, y_train = load_data()
# Sanity check: exactly one label per question pair.
assert len(x_tokens)==len(y_train)

loading training data...
Done!


In [4]:
# Peek at the first tokenized pair and the total number of pairs.
print(x_tokens[0], len(x_tokens))

[['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india'], ['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market']] 404287


In [0]:
# GloVe Encoding

def loadGloveModel(gloveFile):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to look like ``word v1 v2 ... vN``.

    Parameters
    ----------
    gloveFile : str or path-like
        Path to the GloVe text file.

    Returns
    -------
    dict
        Maps each word (str) to its embedding as a 1-D float NumPy array.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # Open with an explicit UTF-8 encoding so loading does not depend on the
    # platform default, and use `with` so the handle is always closed.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [6]:
# Load the 300-dimensional GloVe vectors into a word -> vector dict.
encodings = loadGloveModel('glove.6B.300d.txt')

Loading Glove Model
Done. 400000  words loaded!


In [7]:
# Sanity check: each embedding should be a 1-D vector of shape (300,).
encodings['step'].shape

(300,)

In [0]:
def pair_encodings(data, encodings, labels=None, dim=300):
    """Encode tokenized question pairs as concatenated averaged word vectors.

    Each question is represented by the sum of the embeddings of its
    in-vocabulary tokens divided by the question's total token count
    (out-of-vocabulary tokens therefore count as zero vectors). The two
    question vectors are concatenated into one ``2 * dim`` feature row.

    Pairs in which either question has no in-vocabulary token cannot be
    encoded; they are printed together with a running count and dropped, and
    the corresponding entries are removed from ``labels`` in place.

    Parameters
    ----------
    data : sequence of [tokens1, tokens2]
        Tokenized question pairs.
    encodings : dict
        Maps a word to its 1-D embedding vector.
    labels : list, optional
        Labels aligned with ``data``; entries for dropped pairs are deleted
        in place. When omitted, the module-level ``y_train`` list is used if
        it exists, which keeps the original two-argument call working.
    dim : int, optional
        Expected embedding dimensionality (default 300).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_kept, 2 * dim)``.
    """
    def _mean_vector(tokens):
        # Returns None instead of dividing 0 by 0 when nothing can be embedded.
        found = [encodings[w] for w in tokens if w in encodings]
        if not found:
            return None
        return np.sum(found, axis=0) / len(tokens)

    if labels is None:
        labels = globals().get("y_train")

    X, dropped = [], []
    for i, pair in enumerate(data):
        v1, v2 = _mean_vector(pair[0]), _mean_vector(pair[1])
        if v1 is not None and v2 is not None and v1.shape == (dim,) and v2.shape == (dim,):
            X.append(np.concatenate((v1, v2)))
        else:
            dropped.append(i)
            print(pair, len(dropped))

    # Delete labels only after the loop and from the back: deleting while
    # iterating shifts the remaining indices, so later deletions would remove
    # the wrong labels and silently misalign features and targets.
    if labels is not None:
        for i in reversed(dropped):
            del labels[i]
    return np.array(X)

In [9]:
# Build the feature matrix from the tokenized pairs. Note that pair_encodings
# also removes entries from the global y_train for every pair it drops.
x_train = pair_encodings(x_tokens, encodings)
print(x_train.shape)
# Free the token lists; re-running this cell requires re-running load_data first.
del x_tokens

  after removing the cwd from sys.path.


[[], ['why', 'is', 'cornell', 'endowment', 'the', 'lowest', 'in', 'the', 'ivy', 'league']] 1
[[], ['why', 'should', 'one', 'not', 'work', 'at', 'google']] 2
[['what', 'is', 'the', 'most', 'visited', 'tourist', 'attraction', 'in', 'africa'], ['उसपर']] 3
[['how', 'could', 'i', 'solve', 'this'], []] 4
[[], ['what', 'is', 'the', 'gmail', 'tech', 'support', 'help', 'phone', 'number']] 5
[['is', 'there', 'anywhere', 'in', 'the', 'world', 'offering', 'pain', 'management', 'for', 'peripheral', 'neuropathy', 'as', 'opioids', 'haved', 'been', 'banned', 'in', 'us'], []] 6
[['is', 'there', 'any', 'chances', 'for', 'hailstones', 'tomorrow'], ['parisflatlist']] 7
[['since', 'childhood', 'why', 'are', 'we', 'taught', 'to', 'use', 'our', 'right', 'and', 'not', 'the', 'left', 'hand', 'for', 'writing', 'and', 'having', 'food'], ['aosdhiadsoihadso', 'dasodashdasoh']] 8
[[], ['who', 'are', 'moses', 'noah', 'and', 'exodus']] 9
[[], ['problem', 'of', 'solving', 'a', 'problem', 'is', 'not', 'a', 'problem', '

In [10]:
# Feature rows and labels must have the same length after dropping pairs.
print(x_train.shape, np.array(y_train).shape)

(404259, 600) (404259,)


In [0]:
# Hold out 2.5% of the pairs for validation. A fixed random_state makes the
# shuffled split (and therefore the reported validation numbers) reproducible.
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    x_train, np.array(y_train), test_size=0.025, shuffle=True, random_state=42
)

In [12]:
# Sizes of the training and validation splits.
print(x_train.shape, x_val.shape)

(394152, 600) (10107, 600)


In [13]:
# Feed-forward binary classifier over the 600-d pair features:
# three ReLU hidden layers (300 -> 128 -> 64), each followed by 50% dropout,
# and a single sigmoid output unit.
model = Sequential()
model.add(Dense(300, kernel_initializer='normal', input_dim=600, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, kernel_initializer='normal', activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# y_train is already a NumPy array after the train/validation split.
model.fit(x_train, y_train, epochs=60, batch_size=64, validation_data=(x_val, y_val))
score = model.evaluate(x_val, y_val, batch_size=128)
# Show the held-out result instead of silently discarding it.
print("validation [loss, accuracy]:", score)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 394152 samples, validate on 10107 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [0]:
# Release the training/validation arrays to free RAM before the test pass;
# the earlier cells must be re-run to get them back.
del x_train, x_val, y_train, y_val

In [15]:
#Writing test results to CSV

# Number of test rows scored per model.predict call; predicting row by row
# over the whole test set is dominated by per-call overhead.
PREDICT_BATCH_SIZE = 4096


def _encode_question(text, encodings):
    """Return the averaged word vector of one question, or None if it has no
    in-vocabulary token.

    Uses the same normalization as pair_encodings (sum of in-vocabulary
    vectors divided by the total number of alphanumeric tokens), so the model
    sees test features on the same scale as the features it was trained on.
    """
    tokens = [w.lower() for w in nltk.word_tokenize(text) if w.isalnum()]  # keep only alphanumeric words
    vectors = [encodings[w] for w in tokens if w in encodings]
    if not vectors:
        return None
    return np.sum(vectors, axis=0) / len(tokens)


def _write_predictions(writer, model, batch):
    """Score a batch of (row index, feature vector or None) entries and write
    one CSV row per entry, in order. Entries without features are written as 0."""
    valid = [vec for _, vec in batch if vec is not None]
    preds = iter(())
    if valid:
        scores = model.predict(np.stack(valid), batch_size=len(valid), verbose=0).ravel()
        preds = iter([0 if s <= 0.5 else 1 for s in scores])
    for row_id, vec in batch:
        writer.writerow([row_id, 0 if vec is None else next(preds)])


df_test = pd.read_csv("test.csv", low_memory = False)
count = 0  # number of rows that could not be encoded

# newline='' is what the csv module expects for files handed to csv.writer.
with open('test_results.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=",")
    writer.writerow(['test_id','is_duplicate'])
    batch = []
    for index, q1, q2 in zip(df_test.index, df_test["question1"], df_test["question2"]):
        vecs = []
        for question in (q1, q2):
            if pd.isnull(question):
                print(index, "has a null object!")
                vecs.append(None)
            else:
                vecs.append(_encode_question(question, encodings))

        if vecs[0] is not None and vecs[1] is not None:
            batch.append((index, np.concatenate(vecs)))
        else:
            # Un-encodable pair: default to "not duplicate" and report it.
            count += 1
            print(index, count)
            batch.append((index, None))

        if len(batch) >= PREDICT_BATCH_SIZE:
            _write_predictions(writer, model, batch)
            batch = []

    if batch:
        _write_predictions(writer, model, batch)

  
    
      

  if sys.path[0] == '':


47707 1
56929 2
89963 3
94646 4
111392 5
121810 6




144334 7
197732 8
294600 9
316017 10
320933 11
364725 12
379205 has a null object!
379205 13
395482 14
469325 15
498004 16
498781 17
507834 18
575003 19
631690 20
647844 21
671066 22
672455 23
681754 24
684886 25
714289 26
726779 27
730790 28
788917 29
792864 30
805015 31
807779 32
817520 has a null object!
817520 33
856433 34
884278 35
902143 36
913286 37
923320 38
943911 has a null object!
943911 39
948749 40
963864 41
976382 42
1021275 43
1046690 has a null object!
1046690 44
1049034 45
1070499 46
1114263 47
1114544 48
1221596 49
1250219 50
1266178 51
1270024 has a null object!
1270024 52
1312352 53
1316479 54
1341861 55
1355809 56
1404324 57
1407596 58
1432959 59
1436078 60
1461432 has a null object!
1461432 61
1567104 62
1619208 63
1635393 64
1697898 65
1722030 66
1728010 67
1735488 68
1768203 69
1774201 70
1827360 71
1836463 72
1894751 73
1922872 74
1959010 75
1967420 76
1977114 77
1979732 78
2004705 79
2009885 80
2037090 81
2049895 82
2051949 83
2114530 84
2115520 85
2134489 86


In [0]:
# Reload the predictions that were just written, to inspect them.
df = pd.read_csv("test_results.csv", low_memory = False)

In [17]:
# Show the last few written rows.
df.tail()

Unnamed: 0,test_id,is_duplicate
2345791,2345791,0
2345792,2345792,0
2345793,2345793,0
2345794,2345794,1
2345795,2345795,0


In [0]:
# Rows whose written prediction is 1 (is_duplicate == 1).
df1 = df[df["is_duplicate"]==1]

In [19]:
# Total number of prediction rows in the results file.
len(df)

2345796