In [1]:
import keras
from keras.models import load_model
import tensorflow as tf
from keras import backend as K

import sqmutils.data_utils as du

import os
import time 

import pandas as pd
import numpy as np

import csv

import json


%load_ext autoreload
%autoreload 2
%matplotlib inline 


Using TensorFlow backend.


# Configs

In [11]:
# --- Directories and basic hyper-parameters ---
model_dir = "models"
dataset_dir = "dataset"
emb_dim = 300

# Checkpoint of the trained model that this notebook loads for inference.
model_weights = os.path.join(model_dir, "best_val_f1_model.h5")

# Raw inputs. The Kaggle test set can be downloaded from:
# https://www.kaggle.com/c/quora-question-pairs/download/test.csv
test_dataset_path = "/home/elkhand/Downloads/test.csv"
embedding_path = "/home/elkhand/datasets/fasttext/wiki.en.vec"

# Files written by this notebook: the cleaned test set and the predictions.
cleaned_test_dataset_path = os.path.join(dataset_dir, "cleaned_test.csv")
test_probabilities_csv = os.path.join(dataset_dir, "test_probabilities.csv")

# The three leading positional arguments are passed as None; presumably they
# are the training path and split sizes, which inference does not need.
config = du.get_config(None, None, None, embedding_dimension=emb_dim)

# Metric functions, keyed by name, that are handed to load_model.
custom_objects = dict(f1=du.f1, recall=du.recall, precision=du.precision)

config
 {'train_dataset_path': None, 'test_size': None, 'val_size': None, 'max_seq_len': 32, 'embedding_dimension': 300, 'batch_size': 3096, 'nb_epochs': 100, 'recurrent_dropout': 0.3, 'dropout': 0.3, 'seed': 7, 'is_debug_on': False} 



# Reading test data

In [57]:
# Load the raw test set into a DataFrame and report how long the read took.
start = time.time()
dfTest = pd.read_csv(test_dataset_path, encoding='utf-8', sep=',')
end = time.time()
read_seconds = end - start
print("Total time passed", read_seconds)
print("Total test examples", dfTest.shape[0])

Total time passed 19.774346113204956
Total test examples 2345796


## Remove duplicates

In [58]:
# Clean the test set and persist it:
#   1. keep only rows whose test_id is a Python int,
#   2. drop rows that are exact duplicates,
#   3. blank out missing values,
#   4. write the cleaned frame to cleaned_test_dataset_path.
start = time.time()
valid_ids = [type(x) == int for x in dfTest.test_id]
dfTest = dfTest[valid_ids].drop_duplicates()
# One fillna('') pass blanks every missing value; a separate
# replace(np.nan, '') pass over the whole frame is not needed.
dfTest = dfTest.fillna('')
dfTest.to_csv(cleaned_test_dataset_path, sep=',', encoding='utf-8', index=False)
end = time.time()
print("Total time passed", (end - start))
print("Total test examples", len(dfTest))

Total time passed 5.645338296890259
Total test examples 2345796


In [33]:
# Preview the first ten rows of the test set.
dfTest.head(10)

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?
5,5,How are the two wheeler insurance from Bharti ...,I admire I am considering of buying insurance ...
6,6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?
7,7,"By scrapping the 500 and 1000 rupee notes, how...",How will the recent move to declare 500 and 10...
8,8,What are the how best books of all time?,What are some of the military history books of...
9,9,After 12th years old boy and I had sex with a ...,Can a 14 old guy date a 12 year old girl?


# Load embeddings

We will be using the 300-dimensional fastText word vectors trained on Wikipedia.

In [5]:
# Read the pre-trained word vectors into memory and time the load.
print("word vectors path", embedding_path)
start = time.time()
w2v = du.load_embedding(embedding_path)
end = time.time()
load_seconds = end - start
print("Total time passed: ", load_seconds)

word vectors path /home/elkhand/datasets/fasttext/wiki.en.vec
embedding size : 2519371
embedding dimension : (300,)
Total time passed:  329.0242750644684


# Load pre-trained model

In [7]:
# Restore the saved model; custom_objects supplies the custom metric
# functions (f1, recall, precision) referenced by the checkpoint.
model = load_model(
    model_weights,
    custom_objects=custom_objects,
)

# Predict Test dataset probabilities

In [60]:
def write_to_csv_with_test_id(csv_file, results, testId_list, decimals=1):
    """Append one ``test_id,probability`` row per prediction to an open file.

    Args:
        csv_file: Writable text file object; rows are emitted with ``write``.
        results: Sequence of per-example predictions. Only element ``[0]`` of
            each prediction is written.
        testId_list: Sequence of ids, positionally aligned with ``results``.
        decimals: Number of digits kept after the decimal point when rounding
            the written value. Defaults to 1, the value previously hard-coded.

    A summary line is printed for every call. If the two sequences differ in
    length an error banner is printed and only the positions present in both
    sequences are written. Empty input writes nothing.
    """
    id_count = len(testId_list)
    # Guard the first/last lookups so an empty batch does not raise IndexError.
    first_id = testId_list[0] if id_count else None
    last_id = testId_list[-1] if id_count else None
    print("testId_list", id_count, "start: ", first_id, "end: ", last_id, "len(results)", len(results))
    if id_count != len(results):
        print("\n ERROR!!!! \n")
    # zip pairs ids with predictions without a manual index counter.
    for test_id, prediction in zip(testId_list, results):
        csv_file.write(str(test_id) + "," + str(round(prediction[0], decimals)) + "\n")

# Predict duplicate probabilities for the whole test set in fixed-size batches
# and stream the rows into test_probabilities_csv as each batch completes.
start = time.time()
with open(test_probabilities_csv, "w") as csv_file:
    # Header row of the output file.
    line = "test_id,is_duplicate" + "\n"
    csv_file.write(line)

    step_size = 20000
    nanCount = 1  # not read anywhere in this cell
    total_rows = len(dfTest)

    # Walk the frame in consecutive slices of at most step_size rows; the last
    # slice is simply shorter when total_rows is not a multiple of step_size.
    for start_index in range(0, total_rows, step_size):
        to_index = start_index + step_size
        predict_start = time.time()

        batch = dfTest[start_index:to_index]
        test_ids = list(batch['test_id'])
        df_test_q1_emb, df_test_q2_emb = du.load_dataset(batch, w2v, config, isTestDataset=True)
        results = model.predict([df_test_q1_emb, df_test_q2_emb], verbose=0)

        predict_end = time.time()
        print("start_index",start_index,"to_index",to_index,"len(result)",len(results),"Pred time: ", (predict_end - predict_start))
        write_to_csv_with_test_id(csv_file, results, test_ids)

end = time.time()
print("Total time passed", (end - start))

start_index 0 to_index 20000 len(result) 20000 Pred time:  84.72997903823853
testId_list 20000 start:  0 end:  19999 len(results) 20000
start_index 20000 to_index 40000 len(result) 20000 Pred time:  85.90014982223511
testId_list 20000 start:  20000 end:  39999 len(results) 20000
start_index 40000 to_index 60000 len(result) 20000 Pred time:  87.45600414276123
testId_list 20000 start:  40000 end:  59999 len(results) 20000
start_index 60000 to_index 80000 len(result) 20000 Pred time:  87.99083805084229
testId_list 20000 start:  60000 end:  79999 len(results) 20000
start_index 80000 to_index 100000 len(result) 20000 Pred time:  87.6667160987854
testId_list 20000 start:  80000 end:  99999 len(results) 20000
start_index 100000 to_index 120000 len(result) 20000 Pred time:  88.06139135360718
testId_list 20000 start:  100000 end:  119999 len(results) 20000
start_index 120000 to_index 140000 len(result) 20000 Pred time:  87.56041717529297
testId_list 20000 start:  120000 end:  139999 len(results

start_index 1120000 to_index 1140000 len(result) 20000 Pred time:  88.09119892120361
testId_list 20000 start:  1120000 end:  1139999 len(results) 20000
start_index 1140000 to_index 1160000 len(result) 20000 Pred time:  88.16745710372925
testId_list 20000 start:  1140000 end:  1159999 len(results) 20000
start_index 1160000 to_index 1180000 len(result) 20000 Pred time:  87.6527841091156
testId_list 20000 start:  1160000 end:  1179999 len(results) 20000
start_index 1180000 to_index 1200000 len(result) 20000 Pred time:  88.18561935424805
testId_list 20000 start:  1180000 end:  1199999 len(results) 20000
start_index 1200000 to_index 1220000 len(result) 20000 Pred time:  87.95569944381714
testId_list 20000 start:  1200000 end:  1219999 len(results) 20000
start_index 1220000 to_index 1240000 len(result) 20000 Pred time:  88.17555832862854
testId_list 20000 start:  1220000 end:  1239999 len(results) 20000
start_index 1240000 to_index 1260000 len(result) 20000 Pred time:  88.13709998130798
test

start_index 2200000 to_index 2220000 len(result) 20000 Pred time:  87.82819104194641
testId_list 20000 start:  2200000 end:  2219999 len(results) 20000
start_index 2220000 to_index 2240000 len(result) 20000 Pred time:  88.02479314804077
testId_list 20000 start:  2220000 end:  2239999 len(results) 20000
start_index 2240000 to_index 2260000 len(result) 20000 Pred time:  88.24507546424866
testId_list 20000 start:  2240000 end:  2259999 len(results) 20000
start_index 2260000 to_index 2280000 len(result) 20000 Pred time:  87.6471254825592
testId_list 20000 start:  2260000 end:  2279999 len(results) 20000
start_index 2280000 to_index 2300000 len(result) 20000 Pred time:  87.9636058807373
testId_list 20000 start:  2280000 end:  2299999 len(results) 20000
start_index 2300000 to_index 2320000 len(result) 20000 Pred time:  88.35396552085876
testId_list 20000 start:  2300000 end:  2319999 len(results) 20000
start_index 2320000 to_index 2340000 len(result) 20000 Pred time:  88.43807339668274
testI