# **Aim of this Challenge:** 

Create intelligent question and answer systems that can reliably predict context without relying on complicated and opaque rating guidelines.

# The Business Problem:


The goal is to create a more human-like question-answering system that answers a question with an intuitive understanding of what is being asked. Such a system can attract users by addressing their questions in a more human-like way, which can in turn increase user participation in question-answering forums and enable more human-like conversational chatbots.


# Exploring dataset

In [None]:
# importing the required libraries 

import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition files, then report the dimensions of each frame.
sample_submission_dataset = pd.read_csv('/kaggle/input/google-quest-challenge/sample_submission.csv')
test_dataset = pd.read_csv('/kaggle/input/google-quest-challenge/test.csv')
train_dataset = pd.read_csv('/kaggle/input/google-quest-challenge/train.csv')

for label, frame in (
    ("Train shape:", train_dataset),
    ("Test shape:", test_dataset),
    ("Sample submission shape:", sample_submission_dataset),
):
    print(label, frame.shape)

### Observations:
* In the train dataset we have 41 columns and 6079 rows (instances/training points).
* In the test dataset we have only 11 columns and 476 rows (instances/test points).
* In the submission dataset we have 31 columns and 476 rows.

In [None]:
# Check for train data samples
train_dataset.head(2)

# Splitting the data into train and validation sets

In [None]:
# Names of the target columns; every other column of the training frame is
# treated as an input feature.
y_columns = [
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Targets first, then the feature frame with the target columns removed.
y = train_dataset[y_columns]
X = train_dataset.drop(columns=y_columns)

In [None]:
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The seed is fixed so that the
# split, and every validation score computed from it, is the same on each run.
X_train_dataset, X_valid_dataset, y_train_dataset, y_valid_dataset = train_test_split(
    X, y, test_size=0.10, random_state=42
)

In [None]:
X_train_dataset.shape, X_valid_dataset.shape, y_train_dataset.shape, y_valid_dataset.shape

In [None]:
X_train_dataset

#  **Preprocessing Text Feature**

In [None]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    Every rule is a regular expression that only matches when the contraction
    is followed by a single space, and accepts either the ASCII apostrophe or
    the typographic one. Rules run in the order listed, so the specific forms
    are handled before the generic suffix rules.
    """
    rules = (
        (r"(W|w)on(\'|\’)t ", "will not "),
        (r"(C|c)an(\'|\’)t ", "can not "),
        (r"(Y|y)(\'|\’)all ", "you all "),
        (r"(Y|y)a(\'|\’)ll ", "you all "),
        (r"(I|i)(\'|\’)m ", "i am "),
        (r"(A|a)isn(\'|\’)t ", "is not "),
        (r"n(\'|\’)t ", " not "),
        (r"(\'|\’)re ", " are "),
        (r"(\'|\’)d ", " would "),
        (r"(\'|\’)ll ", " will "),
        (r"(\'|\’)t ", " not "),
        (r"(\'|\’)ve ", " have "),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase


def clean_text(x):
    """Return the string form of ``x`` with punctuation spaced out or removed.

    ``/``, ``-`` and ``'`` each become one space, ``&`` is surrounded by a
    space on both sides, and the remaining ASCII punctuation together with
    the curly quotes is deleted.
    """
    # Build one translation table. Entries added later override earlier ones,
    # so the "replace with a space" rule wins for the characters that also
    # appear in the deletion set.
    table = {ord(ch): '' for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'}
    table.update({ord(ch): ' ' for ch in "/-'"})
    table.update({ord(ch): f' {ch} ' for ch in '&'})
    return str(x).translate(table)

def clean_numbers(x):
    """Replace every run of two or more digits by a length-bucket placeholder.

    A run of 2, 3 or 4 digits becomes ``12``, ``123`` or ``1234``; a run of
    five or more digits becomes ``12345``. Single digits are left untouched.

    The replacement is done in a single pass. Applying one substitution per
    length in sequence does not work here because the placeholders are
    themselves digits: a later, shorter pattern would match inside a
    placeholder written by an earlier one (e.g. ``2019`` -> ``1234`` ->
    ``1212``).

    Args:
        x: Input string.

    Returns:
        The string with digit runs replaced.
    """
    placeholder = '12345'
    # Slicing caps the placeholder at five characters for longer runs.
    return re.sub(r'[0-9]{2,}', lambda m: placeholder[:len(m.group())], x)

In [None]:
# English stop-word list, adapted from https://gist.github.com/sebleier/554280
# The negations 'no', 'nor' and 'not' are deliberately absent from this list,
# so a filter based on it keeps them in the text.
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [None]:
# Combining all the above statements
from tqdm import tqdm
def preprocess_text(text_data):
    """Clean every text in ``text_data`` and return the cleaned strings.

    Each text goes through these steps, every one working on the output of
    the previous one:

    1. literal ``\\r``, ``\\n`` and ``\\"`` escape sequences become spaces,
    2. contractions are expanded (``decontracted``),
    3. punctuation is spaced out or removed (``clean_text``),
    4. digit runs are replaced by placeholders (``clean_numbers``),
    5. anything that is not an ASCII letter or digit becomes a space,
    6. stop words are dropped and the text is lower-cased and stripped.

    Args:
        text_data: Iterable of strings.

    Returns:
        A list with one cleaned string per input element.
    """
    # Membership tests run once per word, so look them up in a set.
    stopword_set = set(stopwords)
    preprocessed_text = []
    # tqdm is for printing the status bar
    for sentance in tqdm(text_data):
        # The escape sequences are handled first: clean_text deletes
        # backslashes, which would otherwise leave a stray 'r' or 'n' glued
        # to the neighbouring word.
        sent = sentance.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        # Chain the helpers so that no step throws away the previous result.
        sent = decontracted(sent)
        sent = clean_text(sent)
        sent = clean_numbers(sent)
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e.lower() not in stopword_set)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [None]:
# Attach cleaned versions of the title, body and answer columns to the
# training frame first and to the validation frame second.
column_pairs = (
    ('question_title', 'preprocessed_question_title'),
    ('question_body', 'preprocessed_question_body'),
    ('answer', 'preprocessed_answer'),
)
for frame in (X_train_dataset, X_valid_dataset):
    for raw_column, clean_column in column_pairs:
        frame[clean_column] = preprocess_text(frame[raw_column].values)

In [None]:
# Attach cleaned versions of the title, body and answer columns to the test frame.
for raw_column, clean_column in (
    ('question_title', 'preprocessed_question_title'),
    ('question_body', 'preprocessed_question_body'),
    ('answer', 'preprocessed_answer'),
):
    test_dataset[clean_column] = preprocess_text(test_dataset[raw_column].values)

### question_title text after preprocessing

In [None]:
# Text before preprocessing
X_train_dataset['question_title'].values[0]

In [None]:
# Text after preprocessing
X_train_dataset['preprocessed_question_title'].values[0]

### question_body after preprocessing

In [None]:
# Text before preprocessing
X_train_dataset['question_body'].values[0]

In [None]:
# Text after preprocessing
X_train_dataset['preprocessed_question_body'].values[0]

### Answer after preprocessing

In [None]:
# Text before preprocessing
X_train_dataset['answer'].values[0]

In [None]:
# Text after preprocessing
X_train_dataset['preprocessed_answer'].values[0]

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import tensorflow as tf
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil

np.set_printoptions(suppress=True)

In [None]:
# Load the BERT module from the local hub directory as a Keras layer whose
# weights are trainable. The model-building cell further down assigns the same
# two names again after clearing the Keras session.
hub_url_bert = "../input/bert-hub/bert_en_uncased_L-12_H-768_A-12"
bert_layer = hub.KerasLayer(hub_url_bert, trainable=True)


In [None]:
#vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
#do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()


tokenizer = tokenization.FullTokenizer('../input/bert-hub/bert_en_uncased_L-12_H-768_A-12/assets/vocab.txt', True)

print("Vocab size:", len(tokenizer.vocab))

In [None]:
X_train_dataset.shape, X_valid_dataset.shape, test_dataset.shape

In [None]:
X_train_dataset.columns

# Transforming input features for bert model

### Functions to get `Input Ids` , `Input mask`, `Input segment` for bert

In [None]:
def extract_masks(tokens, max_seq_length):
    """Build the padding mask for ``tokens``.

    Returns a list of length ``max_seq_length`` holding 1 for every token
    position and 0 for every padding position. Raises ``IndexError`` when
    there are more tokens than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if max_seq_length - n_tokens < 0:
        raise IndexError("Token length more than max seq length!")
    return [1 if position < n_tokens else 0 for position in range(max_seq_length)]



def extract_segments(tokens, max_seq_length):
    """Build the segment ids for ``tokens``.

    Every token up to and including the second ``[SEP]`` gets segment 0 and
    every token after it gets segment 1. The list is padded with 0 up to
    ``max_seq_length``. Raises ``IndexError`` when there are more tokens than
    ``max_seq_length``.
    """
    overflow = len(tokens) - max_seq_length
    if overflow > 0:
        raise IndexError("Token length more than max seq length!")

    separators_seen = 0
    segment_ids = []
    for tok in tokens:
        # The id is decided before the current token is counted, so the
        # second separator itself still belongs to segment 0.
        segment_ids.append(0 if separators_seen < 2 else 1)
        if tok == "[SEP]":
            separators_seen += 1

    segment_ids.extend([0] * -overflow)
    return segment_ids



def extract_ids(tokens, tokenizer, max_seq_length):
    """Convert ``tokens`` to vocabulary ids, padded with 0 to ``max_seq_length``.

    Args:
        tokens: List of token strings.
        tokenizer: Object providing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ids: the token ids followed by zeros.

    Raises:
        IndexError: If there are more ids than ``max_seq_length``. This
            matches ``extract_masks`` and ``extract_segments``; without the
            check an over-long, unpadded list would be returned silently.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
    return input_ids

### In the `_trim_input` function below:

* If the title, question body and answer together (plus 4 special tokens) have more than 512 tokens, the input is trimmed down to exactly 512. Each field has a token budget (30 for the title, 239 each for the question and the answer). Any budget that a short field does not use is handed to the other fields, and each field keeps its first tokens up to its budget.

> **Ex.** suppose the title has 10 tokens, the question 100 tokens and the answer 700 tokens. The title and the question are kept whole, and the answer is cut to its first 512 - 4 - 10 - 100 = 398 tokens. The remaining [700 - 398 = 302] tokens at the end of the answer are dropped.

* The logic makes sense because in large texts the beginning part usually describes what the text is all about (the end part usually holds the conclusion, which this trimming drops). The beginning is also closely related to the target features that we need to predict.

In [None]:
def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):
    """Tokenize the three texts and truncate them to fit ``max_sequence_length``.

    The texts are tokenized with the module-level ``tokenizer``. When the
    three token lists plus 4 extra positions exceed ``max_sequence_length``,
    each list is cut to its first tokens according to a per-field budget.
    Budget that a short field does not need is passed on to the other fields.

    Args:
        title: Question title text.
        question: Question body text.
        answer: Answer text.
        max_sequence_length: Total length the final sequence must not exceed.
        t_max_len: Token budget for the title.
        q_max_len: Token budget for the question body.
        a_max_len: Token budget for the answer.

    Returns:
        Tuple ``(t, q, a)`` of token lists.

    Raises:
        ValueError: If truncation was needed and the new lengths plus 4 do not
            add up to exactly ``max_sequence_length``. With the default
            budgets (30 + 239 + 239 + 4 = 512) this holds for a
            ``max_sequence_length`` of 512.
    """

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    # The 4 extra positions presumably stand for the [CLS] token and the three
    # [SEP] tokens added by _convert_to_bert_inputs.
    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            # Short title: keep it whole and split its unused budget between
            # the answer (rounded down) and the question (rounded up).
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            # Short answer: keep it whole, the question gets the spare budget.
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            # Short question: keep it whole, the answer gets the spare budget.
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            # Both are too long: cut each one to its own budget.
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        # Sanity check: the truncated parts must fill the sequence exactly.
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        # Truncation keeps the first tokens of every field.
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

### In the below `_convert_to_bert_inputs` function

* Concatenate the three text features into one single feature and convert the input to BERT-compatible inputs

In [None]:
def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Join the three token lists and derive the ids, masks and segments.

    The joined sequence is ``[CLS] title [SEP] question [SEP] answer [SEP]``.
    Returns the list ``[input_ids, input_masks, input_segments]``.
    """
    tokens = ["[CLS]"]
    for part in (title, question, answer):
        tokens = tokens + part + ["[SEP]"]

    return [
        extract_ids(tokens, tokenizer, max_sequence_length),
        extract_masks(tokens, max_sequence_length),
        extract_segments(tokens, max_sequence_length),
    ]

In [None]:
# Encode every training row as BERT inputs (ids, masks, segments)

max_sequence_length = 512
input_ids, input_masks, input_segments = [], [], []
for t, q, a in tqdm(zip(X_train_dataset['question_title'],
                        X_train_dataset['question_body'],
                        X_train_dataset['answer'])):
    # Trim the raw texts to the sequence budget, then encode the token lists.
    t, q, a = _trim_input(t, q, a, max_sequence_length)
    ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

# One int32 array per BERT input, in the order ids, masks, segments.
X_train_bert = [
    np.asarray(values, dtype=np.int32)
    for values in (input_ids, input_masks, input_segments)
]

In [None]:
# Encode every validation row as BERT inputs (ids, masks, segments)

max_sequence_length = 512
input_ids, input_masks, input_segments = [], [], []
for t, q, a in tqdm(zip(X_valid_dataset['question_title'],
                        X_valid_dataset['question_body'],
                        X_valid_dataset['answer'])):
    # Trim the raw texts to the sequence budget, then encode the token lists.
    t, q, a = _trim_input(t, q, a, max_sequence_length)
    ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

# One int32 array per BERT input, in the order ids, masks, segments.
X_valid_bert = [
    np.asarray(values, dtype=np.int32)
    for values in (input_ids, input_masks, input_segments)
]




In [None]:
# Encode every test row as BERT inputs (ids, masks, segments)

max_sequence_length = 512
input_ids, input_masks, input_segments = [], [], []
for t, q, a in tqdm(zip(test_dataset['question_title'],
                        test_dataset['question_body'],
                        test_dataset['answer'])):
    # Trim the raw texts to the sequence budget, then encode the token lists.
    t, q, a = _trim_input(t, q, a, max_sequence_length)
    ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

# One int32 array per BERT input, in the order ids, masks, segments.
X_test_bert = [
    np.asarray(values, dtype=np.int32)
    for values in (input_ids, input_masks, input_segments)
]

In [None]:
len(X_train_bert), X_train_bert[0].shape, X_train_bert[1].shape, X_train_bert[2].shape

# Fine-tuning bert model

In [None]:
from scipy.stats import spearmanr

class SpearmanCallback(tf.keras.callbacks.Callback):
    """Print the mean column-wise Spearman correlation after every epoch.

    Args:
        validation_data: Pair ``(x_val, y_val)``. ``x_val`` is passed to
            ``model.predict``; ``y_val`` is indexed as a 2-D array whose
            columns are compared with the matching prediction columns.
    """

    def __init__(self, validation_data):
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the mean Spearman rho.

        Args:
            epoch: Index of the epoch that just ended (unused).
            logs: Metrics dictionary supplied by Keras (unused). The default
                is ``None`` rather than a shared mutable ``{}``.

        Returns:
            The mean of the per-column Spearman correlations.
        """
        print("y_val :", self.y_val.shape)
        y_pred_val = self.model.predict(self.x_val)
        print("y_pred_val :",y_pred_val.shape )

        correlations = []
        for ind in range(y_pred_val.shape[1]):
            # A tiny amount of Gaussian noise is added to the predictions
            # before ranking -- presumably so that a constant prediction
            # column does not produce an undefined correlation (TODO confirm).
            jitter = np.random.normal(0, 1e-7, y_pred_val.shape[0])
            result = spearmanr(self.y_val[:, ind], y_pred_val[:, ind] + jitter)
            correlations.append(result.correlation)
        rho_val = np.mean(correlations)

        print(rho_val)
        print('\nval_spearman-corr: %s' % (str(round(rho_val, 6))), end=100*' '+'\n')
        return rho_val

In [None]:
# Reset the global Keras state before building the graph.
tf.keras.backend.clear_session()

# Fixed input length of the three BERT inputs.
max_seq_length = 512

# Three int32 inputs, each of length max_seq_length: word ids, mask, segment ids.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")

input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")

segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

# Load the BERT module from the local hub directory with trainable weights.
hub_url_bert = "../input/bert-hub/bert_en_uncased_L-12_H-768_A-12"
bert_layer = hub.KerasLayer(hub_url_bert, trainable=True)

# NOTE(review): assumes the hub module returns (pooled_output, sequence_output)
# in this order -- confirm against the module's documentation.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# Wrap the layer in a model that exposes only the sequence output.
bert_model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=sequence_output)

In [None]:

# Fresh int32 inputs of length 512 for the full model. Note that these
# assignments rebind input_word_ids, input_masks and input_segments.
input_word_ids = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_word_ids')
input_masks = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_masks')
input_segments = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_segments')


# Run the inputs through the BERT sub-model to get its sequence output.
sequence_output = bert_model([input_word_ids, input_masks, input_segments])

# Head: average pooling, dropout with rate 0.2, then a 30-unit sigmoid layer
# (30 matches the number of names in y_columns).
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

model = tf.keras.Model(
    inputs=[input_word_ids, input_masks, input_segments], outputs=out
)
    
model.summary()

In [None]:
tf.keras.utils.plot_model(model)

In [None]:
custom_callback = SpearmanCallback(
        validation_data=(X_valid_bert, np.array(y_valid_dataset))
)


In [None]:
# Convert the training targets to a NumPy array before fitting.
y_train_dataset = np.asarray(y_train_dataset)

# Binary cross-entropy loss with the Adam optimizer at learning rate 3e-5.
model.compile(loss='binary_crossentropy',
              optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)) # 3e-5

# Train for 3 epochs with batch size 4. The validation pair is given to Keras
# here, and custom_callback runs at the end of every epoch.
history = model.fit(X_train_bert, y_train_dataset, epochs=3, 
          validation_data=(X_valid_bert, np.array(y_valid_dataset)),
              batch_size=4, callbacks=[custom_callback])

In [None]:
submission = model.predict(X_test_bert)

In [None]:
submission.shape

In [None]:
sample_submission_dataset[y_columns] = submission

In [None]:
sample_submission_dataset.to_csv("submission.csv", index = False)