# Microsoft AI Challenge

### Importing Libs

In [1]:
import os
import re
import string
import sys
import urllib
import urllib.parse
import urllib.request

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from IPython.display import clear_output as clr
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer as WNL

In [2]:
# Setting working directory: project root, two levels above the notebook's cwd.
# The trailing slash is kept because later cells append relative paths to it.
cw_path = ''.join([os.getcwd(), '/../../'])
os.listdir(cw_path)

['.git',
 '.ipynb_checkpoints',
 'Datasets',
 'dev-evaluate-v2.0-in1.json',
 'Embeddings',
 'evaluate-v2.0.py',
 'Models',
 'Readings',
 'README.md']

## Preprocessing

In [160]:
class preprocessor:
    """Text normaliser and GloVe featuriser for (question, passage) rows.

    Attributes:
        GloveEmbeddings: dict mapping a word to its vector stored as a
            space-separated string; the key "zerovec" holds the padding /
            out-of-vocabulary vector.
        emb_dim: width of one embedding vector (updated by `loadEmbeddings`).
        max_query_words: number of token slots per question.
        max_passage_words: number of token slots per passage.
        lemmatizer: WordNet lemmatizer used by `lemmatize`.
    """

    def __init__(self, mqw, mpw):
        """Create an empty preprocessor.

        Args:
            mqw: maximum number of question tokens kept per row.
            mpw: maximum number of passage tokens kept per row.
        """
        self.GloveEmbeddings = {}
        self.emb_dim = 50
        self.max_query_words = mqw
        self.max_passage_words = mpw

        self.lemmatizer = WNL()

    def normalize_text(self, s):
        """Lower-case, strip punctuation and articles, collapse spaces, lemmatize."""
        lower_s = self.lower(s)
        rem_p_s = self.remove_punc(lower_s)
        rem_a_s = self.remove_articles(rem_p_s)
        space_s = self.white_space_fix(rem_a_s)
        lemma_s = self.lemmatize(space_s)
        return lemma_s

    def lemmatize(self, txt):
        """Lemmatize every whitespace-separated token of `txt` as a verb.

        The verb lemmatizer is deliberately applied twice, exactly as before,
        so the produced tokens stay identical to what existing checkpoints
        were trained on.
        """
        lemmatizer = self.lemmatizer
        return ' '.join(
            lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos='v'), pos='v')
            for word in txt.split()
        )

    def remove_articles(self, text):
        """Replace whitespace-delimited "a", "an", "and", "the" by one space.

        Only occurrences with whitespace on both sides match, so an article at
        the very start or end of `text` is left in place.
        """
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        return re.sub(r'\s+(a|an|and|the)(\s+)', ' ', text)

    def white_space_fix(self, text):
        """Collapse runs of whitespace into single spaces and trim the ends."""
        return ' '.join(text.split())

    def remove_punc(self, text):
        """Replace every ASCII punctuation character with a space."""
        translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
        return text.translate(translator)

    def lower(self, text):
        """Return `text` lower-cased."""
        return text.lower()

    def loadEmbeddings(self, embeddingfile):
        """Load a GloVe-style text file (`word v1 v2 ...` per line).

        Updates `GloveEmbeddings` and sets `emb_dim` from the last vector read.
        Blank lines are skipped; if the file holds no vectors, `emb_dim` keeps
        its current value. The "zerovec" entry is (re)created afterwards.

        Args:
            embeddingfile: path of the embeddings text file.
        """
        last_vec = None
        # `with` guarantees the handle is closed even if a line fails to parse.
        with open(embeddingfile, "r", encoding="utf-8", errors="ignore") as fe:
            for line in fe:
                tokens = line.strip().split()
                if not tokens:
                    continue
                last_vec = " ".join(tokens[1:])
                self.GloveEmbeddings[tokens[0]] = last_vec
        if last_vec is not None:
            self.emb_dim = len(last_vec.split(' '))
        # Add Zerovec, this will be useful to pad zeros, it is better to
        # experiment with padding any non-zero constant values also.
        self.GloveEmbeddings["zerovec"] = "0.0 " * (self.emb_dim - 1) + "0.0"

    def _zero_vector(self):
        """Return the padding vector as a float array of shape (emb_dim,)."""
        raw = self.GloveEmbeddings.get("zerovec")
        if raw is None:
            return np.zeros(self.emb_dim, dtype=float)
        return np.array(raw.split()).astype(float).reshape(self.emb_dim)

    def _lookup(self, word, zero_vec):
        """Return the embedding of `word`, or `zero_vec` if unknown or malformed."""
        try:
            vec = np.array(self.GloveEmbeddings[word].split()).astype(float)
        except (KeyError, ValueError):
            # KeyError: out-of-vocabulary word; ValueError: non-numeric entry.
            return zero_vec
        return vec if vec.shape == zero_vec.shape else zero_vec

    def _embed_and_pad(self, words, max_len, zero_vec):
        """Embed the first `max_len` words and pad with `zero_vec` to `max_len` rows."""
        padded = np.tile(zero_vec, (max_len, 1))
        for pos, word in enumerate(words[:max_len]):
            padded[pos] = self._lookup(word, zero_vec)
        return padded

    def mini_df_to_stacks(self, df):
        """Convert a frame of (id, question, passage, label) rows to arrays.

        Columns are read by position: 1 = question text, 2 = passage text,
        3 = label. If a column labelled 0 exists, rows where it equals 0 are
        removed first. Rows whose question or passage normalises to no tokens
        are skipped, so the outputs may contain fewer rows than `df`.

        Args:
            df: pandas DataFrame with at least four columns.

        Returns:
            Tuple `(que_stack, ans_stack, out_stack, ratio_stack)`:
            questions `(n, max_query_words, emb_dim)`, passages
            `(n, max_passage_words, emb_dim)`, one-hot labels `(n, 2)`
            (`[0, 1]` when the label equals 1, else `[1, 0]`) and the share of
            distinct question tokens also present in the passage `(n,)`.
        """
        if 0 in df.columns:
            df = df[df[0] != 0]

        zero_vec = self._zero_vector()
        rows = df.values  # materialised once instead of once per row

        que_stack = []
        ans_stack = []
        out_stack = []
        ratio_stack = []
        for row in rows:
            question = self.normalize_text(row[1]).split()
            answer = self.normalize_text(row[2]).split()
            if not question or not answer:
                # Nothing to embed; an empty question would also make the
                # overlap ratio divide by zero.
                continue

            unique_q = set(question)
            ratio = len(unique_q.intersection(answer)) / float(len(unique_q))

            que_stack.append(self._embed_and_pad(question, self.max_query_words, zero_vec))
            ans_stack.append(self._embed_and_pad(answer, self.max_passage_words, zero_vec))
            out_stack.append(np.array([0, 1]) if row[3] == 1 else np.array([1, 0]))
            ratio_stack.append(ratio)

        que_stack = np.array(que_stack)
        ans_stack = np.array(ans_stack)
        out_stack = np.array(out_stack)
        ratio_stack = np.array(ratio_stack)

        return (que_stack, ans_stack, out_stack, ratio_stack)

In [161]:
# Token budgets: questions are cut/padded to q_max words, passages to a_max
q_max = 40
a_max = 120

# Build the preprocessor, then fill its lookup table from the GloVe file
PRE = preprocessor(mqw=q_max, mpw=a_max)
embeddingFileName = cw_path + "Embeddings/glove.6B.100d.txt"
PRE.loadEmbeddings(embeddingfile=embeddingFileName)

In [162]:
### Sanity check: push one sample sentence through the normalisation pipeline ----
text = "Brilliant! I knew somebody was going to submit regex as the solution. I'm afraid I'm an amateur, though. Do you know of any good regex tutorials/references?"
print(PRE.normalize_text(s=text))

brilliant i know somebody be go to submit regex as solution i m afraid i m amateur though do you know of any good regex tutorials reference


## TensorBoard Visualization

In [7]:
# Start from an empty default graph, then open an event-file writer for TensorBoard
tf.reset_default_graph()
logs_path = os.path.join(cw_path, 'Models/Model_01/maic2')  # cw_path already ends with '/'
writer = tf.summary.FileWriter(logdir=logs_path)

## Modelling 

In [8]:
## Model hyper-parameters

## Question module: (time steps, embedding width)
question_shape = (q_max, PRE.emb_dim)

## Answer module: (time steps, embedding width)
answer_shape = (a_max, PRE.emb_dim)

## Episodic memory: shape tuple and number of attention passes
episode_shape = (a_max, 25)
n_episodes = 5

## All three recurrent modules use a hidden size equal to the embedding width
q_hidden_units = a_hidden_units = e_hidden_units = PRE.emb_dim

## Output layer width (two-way one-hot label)
output_shape = 2

In [9]:
class question_module():
    """Encodes a batch of embedded questions with a manually unrolled LSTM.

    The constructor adds its ops to the current default TensorFlow graph;
    the state after the last time step is returned by `encode()`.
    """
    def __init__(self,input_shape, q_hidden_size, question_ph, sess):
        """Build the question-encoder sub-graph.

        Args:
            input_shape: pair `(n_steps, emb_dim)`; the LSTM is unrolled for
                `n_steps` steps.
            q_hidden_size: number of units of the LSTM cell.
            question_ph: input tensor, sliced as `[:, i, :]` for step `i`
                (presumably `(batch, n_steps, emb_dim)` -- confirm with caller).
            sess: TensorFlow session set as default while the ops are built.
        """
        with tf.name_scope('Q_module') as scope:
            with sess.as_default():

                self.n_steps, self.emb_dim = input_shape

                self.q_hidden_size = q_hidden_size

                self.question = question_ph

                self.q_cell = tf.nn.rnn_cell.LSTMCell(q_hidden_size, initializer=tf.variance_scaling_initializer(), name = 'Question_Encoder_Cell')

                # Zero initial state, sized from the dynamic batch dimension of the input.
                # NOTE(review): for an LSTMCell this is presumably an LSTMStateTuple(c, h),
                # so `init_s` would hold c and `init_c` would hold h; the pair is re-packed
                # in the same order below, so the cell receives it unchanged -- confirm.
                self.init_s, self.init_c = self.q_cell.zero_state(tf.shape(self.question)[0], tf.float32)

                self.s = self.init_s
                self.c = self.init_c

                self.state = [self.s, self.c]

                # Manual unroll: the same cell is applied to every time step and only
                # the most recent output/state are kept.
                for i in range(self.n_steps):
                    self.output, self.state = self.q_cell(self.question[:,i,:], state = self.state)


                self.final_state = self.state

    def encode(self):
        """Return the LSTM state produced after the last time step."""
        return self.final_state

In [10]:
class answer_module():
    """Encodes a batch of embedded passages with a manually unrolled LSTM.

    Unlike the question encoder, every per-step output is kept and stacked,
    so `encode()` returns both the output sequence and the final state.
    """
    def __init__(self,input_shape, a_hidden_size, answer_ph, sess):
        """Build the answer-encoder sub-graph.

        Args:
            input_shape: pair `(n_steps, emb_dim)`; the LSTM is unrolled for
                `n_steps` steps.
            a_hidden_size: number of units of the LSTM cell.
            answer_ph: input tensor, sliced as `[:, i, :]` for step `i`
                (presumably `(batch, n_steps, emb_dim)` -- confirm with caller).
            sess: TensorFlow session set as default while the ops are built.
        """
        with tf.name_scope('A_module') as scope:
            with sess.as_default():

                self.n_steps, self.emb_dim = input_shape

                self.a_hidden_size = a_hidden_size

                self.answer = answer_ph

                self.a_cell = tf.nn.rnn_cell.LSTMCell(a_hidden_size, initializer=tf.variance_scaling_initializer(), name = 'Answer_Encoder_Cell')

                # Zero initial state, sized from the dynamic batch dimension of the input.
                self.init_s, self.init_c = self.a_cell.zero_state(tf.shape(self.answer)[0], tf.float32)

                self.s = self.init_s
                self.c = self.init_c

                self.state = [self.s, self.c]

                # Manual unroll; each step's output is collected for the attention stage.
                self.outputs = []
                for i in range(self.n_steps):
                    self.output, self.state = self.a_cell(self.answer[:,i,:], state = self.state)
                    self.outputs.append(self.output)

                self.final_state = self.state
                # Stack the per-step outputs along a new axis 1 (the time axis).
                self.outputs = tf.stack(self.outputs, axis = 1, name = 'Answers_Encoded_Stacked')


    def encode(self):
        """Return `(stacked per-step outputs, final LSTM state)`."""
        return self.outputs, self.final_state

In [11]:
class episoidic_mem_module():
    """Multi-pass attention over the encoded passage, summarised by an LSTM.

    Each of `depth_mem` passes computes an attention-weighted context over the
    passage sequence, conditioned on a query vector that starts as the
    question encoding and is replaced by the newest context after every pass.
    The contexts are concatenated and fed, one per step, to a final LSTM.
    """

    def __init__(self,input_shape, e_hidden_size, depth_mem, episode_ph, quec_ph, queh_ph, sess):
        """Build the episodic-memory sub-graph.

        Args:
            input_shape: pair unpacked into `n_steps, emb_dim`; both are only
                stored on the instance in the code visible here.
            e_hidden_size: number of units of the final LSTM cell.
            depth_mem: number of attention passes (episodes).
            episode_ph: passage sequence tensor attended over.
            quec_ph: first half of the question encoding; seeds the attention
                query and the final LSTM's initial state.
            queh_ph: second half of the question encoding; used only in the
                final LSTM's initial state.
            sess: TensorFlow session set as default while the ops are built.
        """
        with tf.name_scope('E_module') as scope:
            with sess.as_default():

                self.n_steps, self.emb_dim = input_shape

                self.e_hidden_size = e_hidden_size

                self.cepisode_ans = episode_ph
                self.question_enc_c = quec_ph
                self.question_enc_h = queh_ph

                self.que_enc = self.question_enc_c
                self.episode = self.cepisode_ans


                self.outputs = []
                for k in range(depth_mem):
                    sequence = self.episode
                    state_prev = self.que_enc
                    attension_k = self.get_attensions(state_prev, sequence, k)
                    # The context of this pass becomes the query of the next pass.
                    self.que_enc = attension_k[:,0,:]

                    self.outputs.append(attension_k)
                    
                # Join the per-pass contexts along axis 1 (one slot per episode).
                self.outputs = tf.concat(self.outputs, axis = 1, name = 'Episodes_Encoded_Stacked')
                print(self.outputs.shape)

                # Final LSTM over the episode contexts, started from the question encoding.
                self.fcell = tf.nn.rnn_cell.LSTMCell(e_hidden_size, initializer=tf.variance_scaling_initializer(), name = 'Epsiodic_Final_LSTM')
                self.fstate = [self.question_enc_c, self.question_enc_h]
                for k in range(depth_mem):
                    out, self.fstate = self.fcell(self.outputs[:,k,:], state = self.fstate)

                self.out = out
            
    def get_attensions(self, state_prev, sequence, k):
        """Return the attention-weighted context of `sequence` for pass `k`.

        Args:
            state_prev: query vector for this pass.
            sequence: tensor attended over; axis 1 is treated as the time axis.
            k: pass index, used to give the layers of each pass distinct names.

        Returns:
            Result of contracting the attention weights with `sequence` over
            axis 1 (presumably `(batch, 1, features)`, which matches the
            `[:, 0, :]` indexing done by the caller -- confirm).
        """
        with tf.name_scope('E_module_attensions') as scope:
            N = sequence.shape[1]
            # Repeat the query N times so it can be paired with every time step.
            s_new = tf.keras.backend.repeat(state_prev,N)
            a_new = sequence  # earlier variant: tf.stack(sequence, axis = 1)
            concat = tf.concat([s_new, a_new], axis = -1, name = 'Attenssion_Concat_' + str(k))
            # Scoring MLP: 100 -> 10 -> 1 units. Batch norm and dropout are built with
            # training=True, so they stay in training mode whenever the graph is run.
            dense1 = tf.layers.dense(concat, 100, activation=tf.nn.leaky_relu, name = 'Attenssion_Dense1_' + str(k))
            batch1 = tf.layers.batch_normalization(dense1, momentum=0.4, training=True)
            drop1 = tf.layers.dropout(batch1, rate=0.4, training=True)
            dense2 = tf.layers.dense(drop1, 10, activation=tf.nn.leaky_relu, name = 'Attenssion_Dense2_' + str(k))
            batch2 = tf.layers.batch_normalization(dense2, momentum=0.4, training=True)
            drop2 = tf.layers.dropout(batch2, rate=0.4, training=True)
            dense3 = tf.layers.dense(drop2, 1, activation=tf.nn.leaky_relu, name = 'Attenssion_Dense3_' + str(k))
            # Normalise the scores over axis 1 to obtain attention weights.
            alphas = tf.nn.softmax(dense3, axis = 1, name = 'Attenssion_Alphas_' + str(k))
            # Weighted sum of the sequence: contract weights and sequence over axis 1.
            context = tf.keras.layers.Dot(axes = 1)([alphas, a_new])
            return context
    
    
    def episodic_enc(self):
            """Return the state of the final LSTM after the last episode."""
            return self.fstate

In [78]:
class final_model:
    """Question/passage relevance classifier assembled from the three modules.

    The constructor builds the whole graph (placeholders, encoders, episodic
    memory, output heads, losses and optimisers) on the default graph. The
    statement order in `__init__` is kept as-is because automatically named
    variables (e.g. the batch-normalisation layers) depend on creation order,
    and saved checkpoints are restored by variable name.
    """

    def __init__(self,sess):
        """Build the graph.

        Args:
            sess: TensorFlow session used for all later `run` calls.
        """
        with sess.as_default():
            
            self.sess = sess
            
            with tf.name_scope('PHs') as scope:
                self.question_n = tf.placeholder(tf.float32, [None, question_shape[0], question_shape[1]], name = 'Que_PH')
                self.answer_n = tf.placeholder(tf.float32, [None, answer_shape[0], answer_shape[1]],name = 'Ans_PH')
                self.label = tf.placeholder(tf.float32, [None, output_shape], name = 'Lab_PH')
                self.sample_weights = tf.placeholder(tf.float32, [None,], name = 'sample_weights')
                self.ratio = tf.placeholder(tf.float32, [None,1], name = 'Ratios')
                
                # Inputs are batch-normalised. training=True is hard-coded, so the
                # layers stay in training mode whenever the graph is run.
                batch_que = tf.layers.batch_normalization(self.question_n, momentum=0.4, training=True)
                batch_ans = tf.layers.batch_normalization(self.answer_n, momentum=0.4, training=True)
                
                self.question = batch_que
                self.answer = batch_ans
                
                print(self.question.shape, self.answer.shape)
            
            with tf.name_scope('Modules') as scope:
                self.Q_module = question_module(question_shape, q_hidden_units,self.question, sess)
                self.A_module = answer_module(answer_shape, a_hidden_units,self.answer, sess)

                self.encoding_q_n = self.Q_module.encode()
                self.a_outputs_n, self.encoding_a_n = self.A_module.encode()
                
                print(self.encoding_q_n.c.shape, self.a_outputs_n.shape)
                
                batch_que_enc_c = tf.layers.batch_normalization(self.encoding_q_n.c, momentum=0.4, training=True)
                batch_que_enc_h = tf.layers.batch_normalization(self.encoding_q_n.h, momentum=0.4, training=True)
                batch_ans_out = tf.layers.batch_normalization(self.a_outputs_n, momentum=0.4, training=True)
                
                self.a_outputs = batch_ans_out

                self.E_module = episoidic_mem_module(episode_shape, e_hidden_units, n_episodes,self.a_outputs,batch_que_enc_c, batch_que_enc_h, sess)

                self.e_outputs_n = self.E_module.episodic_enc()
                
                batch_epi_enc_c = tf.layers.batch_normalization(self.e_outputs_n.c, momentum=0.4, training=True)
                batch_epi_enc_h = tf.layers.batch_normalization(self.e_outputs_n.h, momentum=0.4, training=True)
                
            
            with tf.name_scope('Final') as scope:
                # Two heads share `concat1`: class logits and a scalar `perf` output.
                prevec = tf.concat([batch_epi_enc_c, batch_que_enc_c], axis =1, name = 'QUE_EPI_concat')
                dense = tf.layers.dense(prevec, 16, activation=tf.nn.leaky_relu, name = 'Final_Dense')
                concat1 = tf.concat([dense, self.ratio], axis = 1, name = 'ratio_concat')
                dense1 = tf.layers.dense(concat1, 8, activation=tf.nn.leaky_relu, name = 'Final_Dense1')
                dense2 = tf.layers.dense(concat1, 8, activation=tf.nn.leaky_relu, name = 'Final_Dense2')
                self.logits = tf.layers.dense(dense1, output_shape, name = 'Final_Logits')
                self.perf = tf.layers.dense(dense2, 1, name = 'Final_Perf')

                self.out = tf.nn.softmax(self.logits, name  = 'Final_Out')
            
            with tf.name_scope('Training') as scope:
                self.loss_vec = tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf.stop_gradient(self.label), logits = self.logits, name = 'Final_Loss')
                # NOTE(review): `perf` comes from a 1-unit dense layer while the weights
                # placeholder is rank 1, so this product presumably broadcasts to
                # (batch, batch) before the sum -- confirm this is intended.
                self.c_perf = tf.reduce_sum(tf.multiply(self.perf, self.sample_weights))
                self.v_loss = tf.reduce_sum(tf.multiply(self.loss_vec, self.sample_weights))
                # `loss` trains the classifier against the weighted cross-entropy minus the
                # (gradient-stopped) perf term; `loss2` fits the perf head to that value.
                self.loss = (self.v_loss-tf.stop_gradient(self.c_perf))
                self.loss2 = tf.square(tf.stop_gradient(self.loss) - self.c_perf)
                
                self.opt = tf.train.AdamOptimizer(0.0002, name = 'Optimizer')
                self.opt2 = tf.train.AdamOptimizer(0.0002, name = 'Optimizer2')
                
                self.train_step = self.opt.minimize(self.loss)
                self.train_Step2 = self.opt2.minimize(self.loss2)
                
#                 self.grads = self.opt.compute_gradients(self.loss)
#                 self.grads = [(tf.clip_by_value(g, -1.0, 1.0), v) for g, v in self.grads]
#                 self.train_step = self.opt.apply_gradients(self.grads)
            
            with tf.name_scope('Init') as scope:
                self.variables = tf.get_collection(tf.GraphKeys.VARIABLES)
                self.init_vars = tf.variables_initializer(self.variables)
            
    def train(self, question, answer, label, ratio,sample_weights=None, epochs = 1, verbose = 1):
        """Run `epochs` classifier updates and one perf-head update on a batch.

        Args:
            question: question array fed to the question placeholder.
            answer: passage array fed to the answer placeholder.
            label: one-hot label array with one row per sample.
            ratio: token-overlap column fed to the ratio placeholder.
            sample_weights: optional per-sample weights; rescaled so they sum
                to the batch size. `None` means every sample has weight 1.
            epochs: number of `train_step` runs on this batch.
            verbose: when 1, print loss/accuracy and return the logits.

        Returns:
            The logits array when `verbose == 1`, otherwise `None`.
        """
        # The None default must be resolved before any arithmetic on the weights.
        if(sample_weights is None):
            sample_weights = np.ones((label.shape[0]))
        else:
            sample_weights = np.asarray(sample_weights, dtype=float)
            sum_sw = np.sum(sample_weights)
            if(sum_sw != 0):
                # Rescale to mean 1; all-zero weights are left untouched to avoid NaNs.
                sample_weights = sample_weights.shape[0]*sample_weights/sum_sw
        _feed_dict = {self.ratio : ratio, self.question_n: question, self.answer_n : answer,  self.label :label, self.sample_weights : sample_weights}
        for i in range(epochs):
            self.sess.run([self.train_step], feed_dict = _feed_dict)
        self.sess.run([self.train_Step2], feed_dict = _feed_dict)
        if(verbose == 1):
            loss, out, logits = self.sess.run([self.v_loss, self.out, self.logits], feed_dict = _feed_dict)
            # `epochs` rather than the loop variable, which is undefined when epochs == 0.
            print('Epochs :',epochs,', loss:', loss, ', accuracy:', self.accuracy(label, out, sample_weights))
            return logits

    def accuracy(self, y_true, y_pred, sample_weights):
        """Return "<weighted accuracy>::<plain accuracy>" as a string.

        Args:
            y_true: one-hot (or score) rows of the true classes.
            y_pred: score rows of the predicted classes.
            sample_weights: one weight per row; the weighted figure is the mean
                of `correct * weight`, i.e. a weighted accuracy when the
                weights average to 1.
        """
        y_tl = np.argmax(y_true, axis = 1)
        y_pr = np.argmax(y_pred, axis = 1)
        correct = (y_tl == y_pr)
        # Flatten the weights: an (N, 1) column would broadcast against the (N,)
        # vector into an (N, N) matrix and no longer weight samples individually.
        acc1 = np.mean(correct*np.asarray(sample_weights).reshape(-1))
        acc2 = np.mean(correct)
        acc = str(acc1)+'::'+str(acc2)
        return acc
    
#     def predict(self, question , answer):
#         _feed_dict = {self.question_n: question, self.answer_n : answer,  self.label :label}
#         out = self.sess.run([self.out], feed_dict = _feed_dict)[0]
#         return np.argmax(out, axis = 1)
    
    def score(self, question, answer, ratio):
        """Return the softmax probability of class 1 for every row.

        Args:
            question: question array fed to the question placeholder.
            answer: passage array fed to the answer placeholder.
            ratio: token-overlap column fed to the ratio placeholder.
        """
        _feed_dict = {self.question_n: question, self.answer_n : answer, self.ratio : ratio}
        out = self.sess.run(self.out, feed_dict = _feed_dict)
        scores = out[:,1]
        return scores
    
    
#     def make_partial_graph(self):
        
#         with tf.name_scope('partial_Final') as scope:
            
#             self.pr_lr = tf.placeholder(tf.float32, shape =[], name = 'pr_lr')
            
#             batch_ans_enc_c = tf.layers.batch_normalization(self.encoding_a_n.c, momentum=0.4, training=True)
#             batch_epi_enc_c = tf.layers.batch_normalization(self.e_outputs_n.c, momentum=0.4, training=True)
#             batch_que_enc_c = tf.layers.batch_normalization(self.encoding_q_n.c, momentum=0.4, training=True)
            
            
#             prevec = tf.concat([batch_epi_enc_c, batch_que_enc_c, batch_ans_enc_c], axis =1, name = 'pr_QUE_EPI_concat')
#             dense = tf.layers.dense(prevec, 32, activation=tf.nn.leaky_relu, name = 'pr_Final_Dense')
#             concat1 = tf.concat([dense, self.ratio], axis = 1, name = 'pr_ratio_concat')
#             dense1 = tf.layers.dense(concat1, 16, activation=tf.nn.leaky_relu, name = 'pr_Final_Dense1')
#             self.pr_logits = tf.layers.dense(dense1, output_shape, name = 'pr_Final_Logits')
#             self.pr_out = tf.nn.softmax(self.pr_logits, name  = 'pr_Final_Out')
        
#         with tf.name_scope('partial_Training') as scope:
            
# #             self.pr_loss_vec = tf.nn.softmax_cross_entropy_with_logits_v2(labels = tf.stop_gradient(self.label),
# #                                                                        logits = self.pr_logits, name = 'partial_Final_Loss')
            
            
            
#             self.pr_loss_vec = tf.losses.hinge_loss(labels = tf.stop_gradient(self.label),
#                                                                        logits = self.pr_logits, reduction = tf.losses.Reduction.NONE)
            
#             self.pr_v_loss = tf.reduce_sum(tf.multiply(tf.reduce_sum(self.pr_loss_vec, axis  = 1), self.sample_weights))

#             self.pr_opt = tf.train.AdamOptimizer(learning_rate = self.pr_lr, name = 'partial_Optimizer')
                
# #             self.pr_grads = self.pr_opt.compute_gradients(self.pr_v_loss)
# #             self.pr_grads = [(tf.clip_by_value(g, -1.0, 1.0), v) for g, v in self.pr_grads]
# #             self.pr_train_step = self.opt.apply_gradients(self.pr_grads)
            
#             self.pr_train_step = self.pr_opt.minimize(self.pr_v_loss)
            
#         with tf.name_scope('partial_Init') as scope:
#             self.pr_vars = tf.get_collection(tf.GraphKeys.VARIABLES)
#             self.pr_variables = [v for v in self.pr_vars if v not in self.variables]
#             self.pr_init_vars = tf.variables_initializer(self.pr_variables)
    
    
#     def pr_train(self, question, answer, label, ratio,sample_weights=None, epochs = 1, verbose = 1, lr = 0.01):
#         sum_sw = np.sum(sample_weights)
#         N = sample_weights.shape[0]
#         sample_weights = N*sample_weights/sum_sw
#         if(sample_weights is None):
#             sample_weights = np.ones((label.shape[0]))
#         _feed_dict = {self.ratio : ratio, self.question_n: question, self.answer_n : answer,
#                       self.label :label, self.sample_weights : sample_weights, self.pr_lr : lr}
        
#         for i in range(epochs):
#             self.sess.run([self.pr_train_step], feed_dict = _feed_dict)
            
#         if(verbose == 1):
#             loss, out, logits = self.sess.run([self.pr_v_loss, self.pr_out, self.pr_logits], feed_dict = _feed_dict)
#             print('Epochs :',i+1,', loss:', loss, ', accuracy:', self.accuracy(label, out, sample_weights))
#             return logits

In [79]:
# Discard any previously built default graph before the model is constructed below.
tf.reset_default_graph()

In [80]:
# Session handed to the model; the model keeps it and uses it for every run call.
sess = tf.Session()

In [81]:
# Build the full graph; the constructors print a few intermediate tensor shapes.
model = final_model(sess)

(?, 40, 100) (?, 120, 100)
(?, 100) (?, 120, 100)
(?, 5, 100)


In [82]:
# model.make_partial_graph()

In [83]:
# Restore the trained weights from the most recent checkpoint in save_path.
save_path = './saves2'
saver = tf.train.Saver()
latest_ckpt = tf.train.latest_checkpoint(save_path)
if latest_ckpt is None:
    # latest_checkpoint returns None when the directory holds no checkpoint;
    # fail with an actionable message instead of an opaque error from restore().
    raise FileNotFoundError(
        "No checkpoint found in '" + save_path + "'. Train and save a model first, "
        "or initialise fresh weights with model.sess.run(model.init_vars)."
    )
saver.restore(model.sess, latest_ckpt)
# model.sess.run(model.init_vars)

INFO:tensorflow:Restoring parameters from ./saves2/episoid_mem_model


In [84]:
# model.make_partial_graph()

# model.sess.run(model.pr_init_vars)

# saver.save(model.sess, save_path+'/episoid_mem_model')

In [85]:
# writer.add_graph(sess.graph)

In [86]:
# avg_loss = tf.summary.scalar(name='Avg_Loss_act', tensor=model.v_loss)
# loss1 = tf.summary.scalar(name='Avg_Loss_1', tensor=model.loss)
# loss2 = tf.summary.scalar(name='Avg_Loss_2', tensor=model.loss2)

# pr_avg_loss = tf.summary.scalar(name='Pr_Avg_Loss_act', tensor=model.pr_v_loss)
# # loss1 = tf.summary.scalar(name='Avg_Loss_1', tensor=model.loss)
# # loss2 = tf.summary.scalar(name='Avg_Loss_2', tensor=model.loss2)

In [87]:
# csize = 1000
# dfs = pd.read_csv(data_dir + 'data.tsv', sep='\t', chunksize=csize, header = None)
# count = 0

In [88]:
# for df in dfs:
#     try: 
#         df = df[df[0] != 0]
#     except:
#         pass
#     if(count*csize < 250*1000):
#         print(count*csize)
#         count +=1
#         if(count%10 == 0):
#             clr()
#         continue
        
#     question, answer, label, ratio = PRE.mini_df_to_stacks(df)
#     r_max = np.max(ratio)
#     r_min = np.min(ratio)
#     ratio_sw = (ratio-r_min)/(r_max-r_min)
    
#     ones = np.argmax(label, axis = 1)
#     sample_weights_ratio1 = ones*ratio_sw
    
#     sample_weights_ratio2 = (1-ones)*(1-ratio_sw)
#     sample_weights = (sample_weights_ratio1 + sample_weights_ratio2)*(0.8*ones + 0.1)
   
    
    
#     ratio = ratio.reshape(-1,1)
#     if(count%10 == 0):
#         y_label = np.argmax(label, axis = 1)
#         idx_1 = np.where(y_label == 1)[0]
#         idx_0 = np.where(y_label == 0)[0]
#         sam_1 = np.random.choice(idx_1)
#         sam_0 = np.random.choice(idx_0)

#         dat_1 = df.iloc[sam_1]
#         dat_0 = df.iloc[sam_0]
        
#         clr()
#         print(count)
#         logits = model.pr_train(question, answer, label, ratio, sample_weights, epochs = 1, verbose = 1, lr = 0.01)
#         print(dat_1[0],':',dat_1[1],':',dat_1[2],':',dat_1[3], ': score :', logits[sam_1])
#         print(dat_0[0],':',dat_0[1],':',dat_0[2],':',dat_0[3], ': score :', logits[sam_0])
#         saver.save(model.sess, save_path+'/episoid_mem_model')
#         if(count%50==0):
#             txtrt = str(count) + "," + str(csize)
#             with open('count.txt', 'w') as f:
#                 f.write(txtrt)
#                 f.close()
#     else:
#         print(count)
#         model.pr_train(question, answer, label, ratio, sample_weights, epochs = 1, verbose = 0, lr = 0.01)
        

#     s0,s1,s2,s3 = model.sess.run([pr_avg_loss, avg_loss, loss1, loss2], feed_dict = {model.ratio : ratio,model.question_n: question,
#                                                       model.answer_n : answer,
#                                                       model.label :label,
#                                                       model.sample_weights : sample_weights})
#     writer.add_summary(s0, count)
#     writer.add_summary(s1, count)
#     writer.add_summary(s2, count)
#     writer.add_summary(s3, count)
        
#     count += 1

In [334]:
def _wikipedia_result_links(question, max_pages):
    """Return up to `max_pages` article URLs from a Wikipedia full-text search."""
    # quote_plus encodes spaces as '+' and also escapes '?', '&', '#', non-ASCII...
    search_url = "https://en.wikipedia.org/w/index.php?search=" + urllib.parse.quote_plus(question)
    with urllib.request.urlopen(search_url) as response:
        soup = BeautifulSoup(response.read(), "html.parser")

    links = []
    for heading in soup.find_all("div", class_="mw-search-result-heading")[:max_pages]:
        anchor = heading.find('a')
        if anchor is not None and anchor.get('href'):
            links.append('https://en.wikipedia.org' + anchor['href'])
    return links


def _page_text_lines(url):
    """Download `url` and return its visible text as a list of non-empty lines."""
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response.read(), "html.parser")

    # kill all script and style elements
    for tag in soup(["script", "style"]):
        tag.extract()

    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    return [chunk for chunk in chunks if chunk]


def find_answer(Question, max_pages=3, min_confidence=0.7):
    """Answer `Question` with the best-scoring line of the top Wikipedia hits.

    The text lines of the first `max_pages` search results are scored against
    the question with the global `model` (after featurisation by the global
    `PRE`), and the highest-scoring line is returned if its score exceeds
    `min_confidence`.

    Args:
        Question: natural-language question.
        max_pages: number of search results whose text is scored.
        min_confidence: minimum class-1 probability required for an answer.

    Returns:
        "<line>, with confidence : <score>" for the best line, or
        "Sorry Not Found" when there is no candidate or no line is confident
        enough.

    Raises:
        urllib.error.URLError: if Wikipedia cannot be reached.
    """
    not_found = "Sorry Not Found"

    if not PRE.normalize_text(Question).split():
        return not_found

    sentences = []
    for link in _wikipedia_result_links(Question, max_pages):
        sentences.extend(_page_text_lines(link))

    # The preprocessor skips rows that normalise to no tokens; filtering them
    # here keeps `candidates` index-aligned with the returned score vector.
    candidates = [s for s in sentences if PRE.normalize_text(s).split()]
    if not candidates:
        return not_found

    # Column layout expected by mini_df_to_stacks: id, question, passage, label.
    df = pd.DataFrame({0: 999, 1: Question, 2: candidates, 3: 0})

    question, answer, label, ratio = PRE.mini_df_to_stacks(df)

    scores = model.score(question, answer, ratio.reshape(-1,1))

    index = int(np.argmax(scores))

    if(scores[index] > min_confidence):
        return candidates[index] + ", with confidence : " + str(scores[index])
    return not_found

In [343]:
# Wikipedia search: example question passed to find_answer in the next cell
Question = "Who is the father of the Computer"

In [344]:
# Returns the best-scoring line with its confidence, or "Sorry Not Found".
find_answer(Question)

'Charles Babbage, an English mechanical engineer and polymath, originated the concept of a programmable computer. Considered the "father of the computer",[17] he conceptualized and invented the first mechanical computer in the early 19th century. After working on his revolutionary difference engine, designed to aid in navigational calculations, in 1833 he realized that a much more general design, an Analytical Engine, was possible. The input of programs and data was to be provided to the machine via punched cards, a method being used at the time to direct mechanical looms such as the Jacquard loom. For output, the machine would have a printer, a curve plotter and a bell. The machine would also be able to punch numbers onto cards to be read in later. The Engine incorporated an arithmetic logic unit, control flow in the form of conditional branching and loops, and integrated memory, making it the first design for a general-purpose computer that could be described in modern terms as Turin