### Getting the Dataset and Prerequisite Files

In [0]:
!unzip 92a4172ee04e11e9.zip

Archive:  92a4172ee04e11e9.zip
   creating: DataSet/
  inflating: DataSet/Results.csv     
  inflating: DataSet/Train.csv       
  inflating: DataSet/readme.md       
  inflating: DataSet/Sample_Submission.csv  
  inflating: DataSet/Test.csv        


In [0]:
# Invoke the Kaggle CLI. As the captured output shows, this fails with an
# IOError because kaggle.json is not yet present in /root/.kaggle.
! kaggle

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 6, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python2.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python2.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 146, in authenticate
    self.config_file, self.config_dir))
IOError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [0]:
! cp kaggle.json /root/.kaggle/kaggle.json


In [0]:
! chmod 600 /root/.kaggle/kaggle.json

In [0]:
!kaggle datasets download -d watts2/glove6b50dtxt


Downloading glove6b50dtxt.zip to /content
 93% 63.0M/67.7M [00:01<00:00, 24.7MB/s]
100% 67.7M/67.7M [00:01<00:00, 43.9MB/s]


In [0]:
!unzip glove6b50dtxt.zip

Archive:  glove6b50dtxt.zip
  inflating: glove.6B.50d.txt        


### Importing the necessary Libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import *
from keras.models import Sequential,Model
import re


Using TensorFlow backend.


### Inspecting the Train and Test Files

In [0]:
# Load the training and test tables from the unpacked DataSet/ directory.
train = pd.read_csv("DataSet/Train.csv")
test = pd.read_csv("DataSet/Test.csv")

In [0]:
train.head()

Unnamed: 0,question,answer_text,distractor
0,Meals can be served,in rooms at 9:00 p. m.,"'outside the room at 3:00 p. m.', 'in the dini..."
1,It can be inferred from the passage that,The local government can deal with the problem...,"'If some tragedies occur again ', ' relevant d..."
2,The author called Tommy 's parents in order to,help them realize their influence on Tommy,"'blame Tommy for his failing grades', 'blame T..."
3,It can be inferred from the passage that,the writer is not very willing to use idioms,'idioms are the most important part in a langu...
4,How can we deal with snake wounds according to...,Stay calm and do n't move .,'Cut the wound and suck the poison out .'


In [0]:
test.head()

Unnamed: 0,question,answer_text
0,What 'S the main idea of the text ?,The lack of career -- based courses in US high...
1,"In the summer high season , Finland does nt se...",the sun is out at night
2,If you want to apply for Chinese Business Inte...,have to get confirmed at least twice
3,"That afternoon , the boy 's clothes were dry b...",nobody made room for him in the water .
4,Which of the following statements is NOT true ?,There are twelve countries in the World Wildli...


In [0]:
# Convert both tables to plain NumPy arrays for positional row access.
# np.asarray accepts a DataFrame as well as an ndarray, so this cell can be
# re-run safely (calling `.values` on an ndarray raises AttributeError).
train = np.asarray(train)
test = np.asarray(test)

In [0]:
len(train)

31499

### Building dictionaries keyed by question, with the answer and the distractors as values

In [0]:
# Map each raw question to its answer and to its list of distractor strings.
# The distractor cell looks like "'first', 'second', 'third'": split it on ", "
# and drop the first and last character (the quotes) of every piece.
# NOTE(review): a distractor that itself contains ", " is split in the middle,
# and rows that repeat a question overwrite the earlier row's entries.
answers = {}
distractors = {}
count = 0
for row in train:
  question_text, answer_text, distractor_cell = row[0], row[1], row[2]
  answers[question_text] = answer_text
  distractors[question_text] = [str(piece[1:-1]) for piece in distractor_cell.split(", ")]
  count += 1

In [0]:
count

31499

In [0]:
distractors["Meals can be served"]

['outside the room at 3:00 p. m.',
 'in the dining - room at 6:00 p. m.',
 'in the dining - room from 7:30 a. m. to 9:15 p. m.']

### Cleaning the Text

In [0]:
def clean_text(sentence):
  """Normalise a sentence to lowercase alphanumeric tokens.

  Lowercases the text, turns every run of characters outside a-z / 0-9 into a
  single space, and drops one-character tokens unless they are a digit.

  Args:
    sentence: the raw text to normalise.

  Returns:
    The surviving tokens joined by single spaces.
  """
  tokens = re.sub("[^a-z0-9]+"," " , sentence.lower()).split()

  kept = []
  for token in tokens:
    # Single letters (e.g. the "p" and "m" left over from "p. m.") are noise,
    # but single digits carry meaning and are kept.
    if len(token)>1 or re.match("[0-9]+",token) is not None:
      kept.append(token)

  return " ".join(kept)

In [0]:
# Normalise every question, answer and distractor with clean_text.
# NOTE(review): questions that differ only in case or punctuation collapse to
# the same cleaned key, so the later one overwrites the earlier one.
a={}
d={}
for raw_question in distractors:
  options = distractors[raw_question]
  # Clean in place, exactly as the per-index assignment did.
  options[:] = [clean_text(option) for option in options]
  cleaned_question = clean_text(raw_question)
  a[cleaned_question] = clean_text(answers[raw_question])
  d[cleaned_question] = options
answers=a
distractors=d


In [0]:
distractors["meals can be served"]

['outside the room at 3 00',
 'in the dining room at 6 00',
 'in the dining room from 7 30 to 9 15']

### Saving the Answers and Distractors to text file

In [0]:
# Persist the cleaned answers. The file holds the dict's Python string form
# (str(dict)), not JSON.
with open("answers.txt","w") as f:
  f.write(str(answers))

In [0]:
# Persist the cleaned distractors. The file holds the dict's Python string form
# (str(dict)), not JSON.
with open("distractors.txt","w") as f:
  f.write(str(distractors))

In [0]:
# Collect every distinct token that appears in a question, its answer or any
# of its distractors. Plain statements replace the list displays and the list
# comprehension that were being built only for their side effects.
vocab = set()
for key in answers.keys():
  vocab.update(key.split())
  vocab.update(answers[key].split())
  for sentence in distractors[key]:
    vocab.update(sentence.split())

In [0]:
print(len(vocab))

21459


In [0]:
# Flatten every question, answer and distractor into one token list (with
# repeats) so word frequencies can be counted. list.extend replaces the
# comprehensions that were used only for their append side effect.
total = []
for key in answers.keys():
  total.extend(key.split())
  total.extend(answers[key].split())
  for des in distractors[key]:
    total.extend(des.split())

print(len(total))

718584


In [0]:
import collections
# Tally how often each token occurs, then keep a plain-dict copy of the counts.
counter = collections.Counter()
counter.update(total)
freq_cnt = dict(counter)

In [0]:
# Keep only words seen more than `threshold` times, most frequent first.
threshold =10
by_frequency = sorted(freq_cnt.items(),key = lambda pair:pair[1],reverse = True)
sorted_freq_cnt = [(word,times) for word,times in by_frequency if times>threshold]
total_words = [word for word,_ in sorted_freq_cnt]


In [0]:
print(len(sorted_freq_cnt))

4723


### Adding start-of-sequence and end-of-sequence marker tokens

In [0]:
# Wrap every distractor in StartSeq/EndSeq markers so the decoder knows where
# a sequence begins and ends.
train_distractors = {
  key: ["StartSeq " + dist + " EndSeq" for dist in dist_list]
  for key, dist_list in distractors.items()
}

In [0]:
print(train_distractors["meals can be served"])

['StartSeq outside the room at 3 00 EndSeq', 'StartSeq in the dining room at 6 00 EndSeq', 'StartSeq in the dining room from 7 30 to 9 15 EndSeq']


In [0]:
len(total_words)

4723

In [0]:
# Number the kept words from 1 upwards and build the reverse lookup as well.
word_to_idx = {token: position for position,token in enumerate(total_words,start=1)}
idx_to_word = {position: token for token,position in word_to_idx.items()}

In [0]:
len(word_to_idx)

4723

In [0]:
# Give the sequence markers the next free indices instead of hard-coded ones,
# so the cell stays correct if the frequency-filtered vocabulary changes size.
# Assumes the existing indices are the contiguous range 1..len(word_to_idx).
# The membership check keeps the cell safe to re-run.
for marker in ("StartSeq", "EndSeq"):
  if marker not in word_to_idx:
    marker_idx = len(word_to_idx) + 1
    word_to_idx[marker] = marker_idx
    idx_to_word[marker_idx] = marker

vocab_size = len(word_to_idx)

In [0]:
# Word indices start at 1, so one extra slot is needed for index 0 (padding).
# Derived from the dictionary rather than incremented, so re-running this cell
# does not inflate the value.
vocab_size = len(word_to_idx) + 1


In [0]:
vocab_size

4726

In [0]:
# Longest marked-up distractor, counted in tokens (StartSeq/EndSeq included).
max_len = max(
  (len(dist.split()) for dist_list in train_distractors.values() for dist in dist_list),
  default=0,
)
print(max_len)

30


In [0]:
# Longest question, counted in tokens.
max_q = max((len(key.split()) for key in train_distractors), default=0)
print(max_q)


48


In [0]:
# Longest answer, counted in tokens.
max_a = max((len(text.split()) for text in answers.values()), default=0)
print(max_a)

101


### Generating Data

In [0]:
def data_generator(train_distractors,answers,word_to_idx,max_len,batch_size):
  """Yield training batches for next-word prediction over distractors, forever.

  Every distractor is expanded into one sample per target word: the encoded
  question, the encoded answer, the distractor prefix seen so far, and the
  one-hot encoded next word.

  Args:
    train_distractors: dict mapping a question to its list of distractor
      strings (already wrapped in StartSeq/EndSeq).
    answers: dict mapping the same question to its answer text.
    word_to_idx: dict mapping a word to its integer index; words missing from
      it are dropped from every sequence.
    max_len: padded length of the distractor prefix.
    batch_size: number of questions (not samples) gathered per yielded batch.

  Yields:
    [[questions, answers, prefixes], next_words], each a NumPy array with one
    row per sample.

  Note:
    Reads the notebook globals max_q, max_a and vocab_size.
  """
  X1,X2,X3,y = [],[],[],[]

  n=0
  while True:
    for key,dist_list in train_distractors.items():
      n+=1

      # Encode and pad the question and the answer once per question.
      seqq = [word_to_idx[wordQ] for wordQ in key.split() if wordQ in word_to_idx]
      question = pad_sequences([seqq],maxlen=max_q,value=0,padding='post')[0]

      seqa = [word_to_idx[wordA] for wordA in answers[key].split() if wordA in word_to_idx]
      answer = pad_sequences([seqa],maxlen=max_a,value=0,padding='post')[0]

      for dist in dist_list:
        seq = [word_to_idx[word] for word in dist.split() if word in word_to_idx]
        # One sample per position: the prefix seq[:i] predicts the word seq[i].
        for i in range(1,len(seq)):
          xi = pad_sequences([seq[0:i]],maxlen=max_len,value = 0,padding='post')[0]
          yi = to_categorical([seq[i]],num_classes = vocab_size)[0]

          X1.append(question)
          X2.append(answer)
          X3.append(xi)
          y.append(yi)

      # Flush only after ALL distractors of the question have been added, so a
      # question's samples are never split across two batches. `>=` (instead of
      # `==`) means the counter cannot overshoot and stall the generator, and
      # the X1 check avoids yielding an empty batch.
      if n>=batch_size and X1:
        yield[[np.array(X1),np.array(X2),np.array(X3)],np.array(y)]
        X1,X2,X3,y = [],[],[],[]
        n=0
  

### Loading the GloVe Embeddings

In [0]:
# Open the GloVe vectors file as UTF-8 text.
# NOTE(review): this handle is not closed anywhere in the visible cells.
f=open("./glove.6B.50d.txt",encoding="utf8")

In [0]:
# Parse GloVe into {word: vector}; each line is "<word> <v1> ... <vN>".
# The file is opened here, inside a `with`, so the handle is always closed and
# re-running the cell re-reads the file instead of looping over an exhausted
# handle (which would silently leave embedding_index empty).
embedding_index = {}
with open("./glove.6B.50d.txt",encoding="utf8") as glove_file:
  for line in glove_file:
    values=line.split()
    word = values[0]
    embedding_index[word]=np.array(values[1:],dtype='float')

In [0]:
embedding_index['apple']

array([ 0.52042 , -0.8314  ,  0.49961 ,  1.2893  ,  0.1151  ,  0.057521,
       -1.3753  , -0.97313 ,  0.18346 ,  0.47672 , -0.15112 ,  0.35532 ,
        0.25912 , -0.77857 ,  0.52181 ,  0.47695 , -1.4251  ,  0.858   ,
        0.59821 , -1.0903  ,  0.33574 , -0.60891 ,  0.41742 ,  0.21569 ,
       -0.07417 , -0.5822  , -0.4502  ,  0.17253 ,  0.16448 , -0.38413 ,
        2.3283  , -0.66682 , -0.58181 ,  0.74389 ,  0.095015, -0.47865 ,
       -0.84591 ,  0.38704 ,  0.23693 , -1.5523  ,  0.64802 , -0.16521 ,
       -1.4719  , -0.16224 ,  0.79857 ,  0.97391 ,  0.40027 , -0.21912 ,
       -0.30938 ,  0.26581 ])

In [0]:
def getEmbeddingMatrix():
  """Build the (vocab_size, 50) embedding matrix for the notebook vocabulary.

  Row idx holds the GloVe vector of the word whose index is idx. Words without
  a GloVe entry, and any row no word maps to, stay all-zero.

  Reads the notebook globals vocab_size, word_to_idx and embedding_index.

  Returns:
    A float NumPy array of shape (vocab_size, 50).
  """
  emb_dim=50
  matrix = np.zeros((vocab_size,emb_dim))
  for word in word_to_idx:
    vector = embedding_index.get(word)
    if vector is None:
      # No pretrained vector for this word: leave its row as zeros.
      continue
    matrix[word_to_idx[word]] = vector
  return matrix
      

In [0]:
# Build the embedding matrix once and display its shape (vocab_size x 50).
embedding_matrix = getEmbeddingMatrix()
embedding_matrix.shape

(4726, 50)

In [0]:
embedding_matrix[4724]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [0]:
print(max_len)

30


## Architecture of the Model

In [0]:
# Distractor-prefix branch: index sequence -> 50-d embedding -> dropout -> LSTM.
# mask_zero=True asks Keras to treat index 0 (the padding value) as masked.
input_dist = Input(shape = (max_len,))
input_dist1=  Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_dist)
input_dist2 = Dropout(0.3)(input_dist1)
input_dist3 = LSTM(256)(input_dist2)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# Question branch: same embedding -> dropout -> LSTM stack, over max_q tokens.
input_ques = Input(shape = (max_q,))
input_ques1=  Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_ques)
input_ques2 = Dropout(0.3)(input_ques1)
input_ques3 = LSTM(256)(input_ques2)

In [0]:
# Answer branch: same embedding -> dropout -> LSTM stack, over max_a tokens.
input_ans = Input(shape = (max_a,))
input_ans1=  Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_ans)
input_ans2 = Dropout(0.3)(input_ans1)
input_ans3 = LSTM(256)(input_ans2)

In [0]:
# Sum the three 256-d branch encodings element-wise, then predict the next
# word with a softmax over the whole vocabulary.
decoder1 = add([input_dist3,input_ques3,input_ans3])
decoder2 = Dense(512 ,activation = 'relu')(decoder1)
outputs = Dense(vocab_size,activation= 'softmax')(decoder2)

# Input order here must match the order the data generator yields:
# question, answer, distractor prefix.
model = Model(inputs = [input_ques,input_ans,input_dist],outputs = outputs)

In [0]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 48)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 101)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 50)       236300      input_1[0][0]                    
____________________________________________________________________________________________

In [0]:
# Load the GloVe matrix into every Embedding layer and freeze it. Selecting
# layers by type avoids depending on their positions (3, 4, 5) in model.layers,
# which would silently change if the architecture is edited.
for layer in model.layers:
  if isinstance(layer, Embedding):
    layer.set_weights([embedding_matrix])
    layer.trainable = False




In [0]:
# Compile after freezing the embeddings; targets are one-hot next-word vectors.
model.compile(loss='categorical_crossentropy',optimizer = 'adam')




In [0]:
# Resume from a previously saved checkpoint.
# NOTE(review): the training loop saves checkpoints under ./model_weights/,
# while this loads model_19.h5 from the working directory, so the file must
# already exist from an earlier run — confirm the intended path.
model.load_weights("model_19.h5")

In [0]:
# Training schedule: each generator batch covers `number_of_ques` questions, so
# one pass over the data takes len(train_distractors) // number_of_ques steps.
epochs=20
number_of_ques = 64
steps = len(train_distractors)//number_of_ques

In [0]:
!mkdir model_weights

mkdir: cannot create directory ‘model_weights’: File exists


In [0]:
# Train one epoch at a time with a fresh generator, saving the whole model
# after every epoch so any epoch can be restored later.
for i in range(epochs):
  batches = data_generator(train_distractors,answers,word_to_idx,max_len,number_of_ques)
  model.fit_generator(batches,steps_per_epoch = steps,epochs=1,verbose = 1)
  checkpoint_path = "./model_weights/model_"+str(i)+".h5"
  model.save(checkpoint_path)

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


### Pre-processing for Test Dataset

In [0]:
# Map each raw test question to its answer text.
# NOTE(review): rows that repeat a question overwrite the earlier entry.
answers_t = {}
count = 0
for row in test:
  answers_t[row[0]] = row[1]
  count += 1

In [0]:
# Normalise the test questions and answers the same way as the training data.
a={}
for raw_question,raw_answer in answers_t.items():
  a[clean_text(raw_question)] = clean_text(raw_answer)
answers_t=a


In [0]:
print(answers_t["what the main idea of the text"])

lack of water affects california crops


In [0]:
print(len(answers_t.keys()))

10353


In [0]:
# Load the results table that the prediction loop fills in.
# NOTE(review): the archive unpacked earlier contains DataSet/Results.csv and
# the final cell writes ./Results.csv, but this reads "Result.csv" from the
# working directory — confirm this is the intended input file.
result = pd.read_csv("Result.csv")

In [0]:
result.head()

Unnamed: 0,question,answer_text,distractor
0,What 'S the main idea of the text ?,The lack of career -- based courses in US high...,'the difference of tests in learning and moder...
1,"In the summer high season , Finland does nt se...",the sun is out at night,"'the sun is too unclear', 'there is no time to..."
2,If you want to apply for Chinese Business Inte...,have to get confirmed at least twice,"'have more printed books', 'take to the new yo..."
3,"That afternoon , the boy 's clothes were dry b...",nobody made room for him in the water .,"'the monkey could let him', 'he did want to ta..."
4,Which of the following statements is NOT true ?,There are twelve countries in the World Wildli...,'the chinese people do like the harm of the en...


In [0]:
results = result.values

In [0]:
results.shape

(13500, 3)

### Predicting the Distractors

In [0]:
print(results[9100:9200])

[['What is a resolution ?' 'Something you say .'
  "'you re', 'it nice', 'how'"]
 ['Which of the following is true according to the passage ?'
  '" Fast food fanatics " usually do not stock their fridges with fresh fruit .'
  "'foods foods are more than than regular', 'the foods drinks color are harmful', 'we can buy shoes instead clothes'"]
 ['What did the man do when he saw the writer ?'
  'He helped her without hesitation .'
  "'he did want to go his way', 'she was afraid of the little boy', 'bought the hand'"]
 ['" Do remember to put it into the letter box on your way to work " showed'
  "Mrs. Black doubted her husband 's memory"
  "'betty sister', 'mary sister', 'the man room'"]
 ['Why do many people like visiting museums ?'
  'Because visits to museums can help gain knowledge .' nan]
 ['Compared with the device designed by Larry Rome , this new device'
  'produces power without adding more loads to the walker' nan]
 ['Which way is NOT mentioned   in the passage ?'
  'Listen to an

In [0]:
def predict_distractors(X1,X2):
  """Generate three distractors for one encoded question/answer pair.

  Each distractor is decoded word by word, starting from "StartSeq". To make
  the three outputs differ, candidate j (0, 1, 2) takes the (j+1)-th most
  probable word for its first two steps and the most probable word afterwards.

  Args:
    X1: padded index sequence of the question, as fed to the model's question
      input.
    X2: padded index sequence of the answer, as fed to the model's answer
      input.

  Returns:
    A list of three strings with the StartSeq/EndSeq markers removed.

  Note:
    Reads the notebook globals model, word_to_idx, idx_to_word and max_len.
  """
  # The question and answer inputs are the same for every decoding step, so
  # wrap them into single-row batches once.
  XQ = np.array([X1])
  XA = np.array([X2])

  dists = []
  for j in range(3):
    in_text = "StartSeq"
    for i in range(max_len):
      sequence = [word_to_idx[w] for w in in_text.split() if w in word_to_idx]
      sequence = pad_sequences([sequence],maxlen=max_len,padding = 'post')[0]
      y_pred = model.predict([XQ,XA,np.array([sequence])])

      if i<=1:
        # argsort is ascending, so position -1-j holds the (j+1)-th best word.
        y_pred = np.argsort(y_pred[0])[-1-j]
      else:
        y_pred = y_pred.argmax()

      # Index 0 is the padding slot and has no entry in idx_to_word; treat it
      # as the end of the sequence instead of raising KeyError.
      word = idx_to_word.get(y_pred)
      if word is None:
        break
      in_text += (' ' + word)

      if word == 'EndSeq':
        break

    # Drop StartSeq, and drop the last token only when it really is EndSeq;
    # otherwise a sequence that hit max_len would lose its final real word.
    words = in_text.split()[1:]
    if words and words[-1] == 'EndSeq':
      words = words[:-1]
    dists.append(' '.join(words))
  return dists

In [0]:
L = []
# Fill the distractor column for rows 9100 onwards (earlier rows already hold
# predictions in the loaded results table).
for i in range(9100,results.shape[0]):
  row = results[i]

  # Encode the cleaned question and answer exactly as during training.
  q_tokens = clean_text(row[0]).split()
  a_tokens = clean_text(row[1]).split()
  question = pad_sequences([[word_to_idx[t] for t in q_tokens if t in word_to_idx]],maxlen=max_q,value=0,padding='post')[0]
  answer = pad_sequences([[word_to_idx[t] for t in a_tokens if t in word_to_idx]],maxlen=max_a,value=0,padding='post')[0]

  # Store the list in the dataset's format: str(list) without the brackets,
  # i.e. "'first', 'second', 'third'".
  row[2] = str(predict_distractors(question,answer))[1:-1]

  if (i+1)%100==0:
    print("questions done",i+1)

questions done 9200
questions done 9300
questions done 9400
questions done 9500
questions done 9600
questions done 9700
questions done 9800
questions done 9900
questions done 10000
questions done 10100
questions done 10200
questions done 10300
questions done 10400
questions done 10500
questions done 10600
questions done 10700
questions done 10800
questions done 10900
questions done 11000
questions done 11100
questions done 11200
questions done 11300
questions done 11400
questions done 11500
questions done 11600
questions done 11700
questions done 11800
questions done 11900
questions done 12000
questions done 12100
questions done 12200
questions done 12300
questions done 12400
questions done 12500
questions done 12600
questions done 12700
questions done 12800
questions done 12900
questions done 13000
questions done 13100
questions done 13200
questions done 13300
questions done 13400
questions done 13500


### Conversion of Final Results to Results.csv file

In [0]:
# Wrap the filled-in array in a DataFrame so it can be written out as CSV.
results = pd.DataFrame(results)

In [0]:
# Write the final submission with named columns and without the row index.
results.to_csv("./Results.csv",header=["question","answer_text","distractor"],index = None)