# **Aim of this Challenge:** 

Create intelligent question and answer systems that can reliably predict context without relying on complicated and opaque rating guidelines.

# The Business Problem:


The goal is to build a more human-like question answering system: one that answers a question with an intuitive understanding of what is being asked. Such a system can address users' questions in a more natural way, which can attract users, increase participation in question-answering forums, and enable more human-like conversational chatbots.


# Exploring dataset

In [None]:
# importing the required libraries 

import pandas as pd
import  numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the three competition files from the Kaggle input directory.
INPUT_DIR = '/kaggle/input/google-quest-challenge'

train_dataset = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_dataset = pd.read_csv(f'{INPUT_DIR}/test.csv')
sample_submission_dataset = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

# Report (rows, columns) for every file that was just loaded.
for split_name, frame in (
    ("Train", train_dataset),
    ("Test", test_dataset),
    ("Sample submission", sample_submission_dataset),
):
    print(f"{split_name} shape:", frame.shape)

### Observations:
* The train dataset has 41 columns and 6079 rows (instances/training points).
* The test dataset has only 11 columns and 476 rows (instances/test points).
* The sample submission dataset has 31 columns and 476 rows.

In [None]:
# Check for train data samples
train_dataset.head(2)

In [None]:
# getting basic info from training data
train_dataset.info()

> **Observations:** There are 10 feature columns, all of dtype `object`, and 30 label columns of dtype `float64`; none of the columns contain null values.

### Features:
 1   question_title                         
 2   question_body                           
 3   question_user_name                      
 4   question_user_page                     
 5   answer                                 
 6   answer_user_name                      
 7   answer_user_page                        
 8   url                                     
 9   category                                
 10  host      

In [None]:
# Describing the train data
train_dataset.describe()

### **Observations:** 
* Of the 41 columns above, 10 are features, 30 are class labels, and one column, `qa_id`, is the unique ID of each instance.
* **21 class labels** describe the **question**; these are the labels whose names start with "question_...".
* **9 class labels** describe the **answer**; these are the labels whose names start with "answer_...".

* In total we have **30 class labels**.

In [None]:
# Let's see the list of column names

list(train_dataset.columns[1:])

In [None]:
train_dataset.head()

## Checking density of words & characters present in the `question_title` feature

In [None]:
import seaborn as sns


def word_count(sentense):
    """Return the number of whitespace-separated words in ``sentense``.

    ``str.split()`` without an argument splits on any run of whitespace
    (spaces, tabs, line breaks) and ignores leading/trailing whitespace.
    Compared with ``split(" ")`` this means repeated spaces no longer create
    empty "words", words separated by line breaks are counted individually,
    and an empty or blank string counts as 0 words instead of 1.
    """
    return len(sentense.split())


# Side-by-side density plots: characters (left) and words (right) per title.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))

# Per-row lengths of the raw text, measured in characters and in words.
question_title_lengths_train = train_dataset['question_title'].apply(len)
question_title_lengths_test = test_dataset['question_title'].apply(len)
question_title_lengths_train_words = train_dataset['question_title'].apply(word_count)
question_title_lengths_test_words = test_dataset['question_title'].apply(word_count)

# One entry per subplot: (axis, train series, test series, x label, y label, title).
panels = (
    (ax[0], question_title_lengths_train, question_title_lengths_test,
     "No. of characters", "Density of character",
     "Density of characters in 'question_title' feature\n"),
    (ax[1], question_title_lengths_train_words, question_title_lengths_test_words,
     "No. of Words", "Density of Words",
     "Density of Words in 'question_title' feature\n"),
)

for axis, train_lengths, test_lengths, x_label, y_label, title in panels:
    # Train is drawn first (red) and test on top of it (blue).
    for lengths, split_name, colour in ((train_lengths, "Train", "red"),
                                        (test_lengths, "Test", "blue")):
        sns.histplot(lengths, label=split_name, kde=True, stat="density",
                     linewidth=0, color=colour, ax=axis)
    axis.set_xlabel(x_label, size=12)
    axis.set_ylabel(y_label, size=12)
    axis.set_title(title, size=15)
    axis.legend()

plt.show()


### Observation: 
* Train and test have the same distribution of character and word counts.
* Most titles contain 5-10 words in both train and test.
* Most titles are 40-60 characters long in both train and test.

## Checking density of words & characters present in the `question_body` feature

In [None]:
import seaborn as sns


def word_count(sentense):
    """Return the number of whitespace-separated words in ``sentense``.

    ``str.split()`` without an argument splits on any run of whitespace
    (spaces, tabs, line breaks) and ignores leading/trailing whitespace.
    Compared with ``split(" ")`` this means repeated spaces no longer create
    empty "words", words separated by line breaks are counted individually,
    and an empty or blank string counts as 0 words instead of 1.
    """
    return len(sentense.split())


# Side-by-side density plots: characters (left) and words (right) per question body.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))

# Per-row lengths of the raw text, measured in characters and in words.
question_body_lengths_train = train_dataset['question_body'].apply(len)
question_body_lengths_test = test_dataset['question_body'].apply(len)
question_body_lengths_train_words = train_dataset['question_body'].apply(word_count)
question_body_lengths_test_words = test_dataset['question_body'].apply(word_count)

# One entry per subplot: (axis, train series, test series, x label, y label, title).
panels = (
    (ax[0], question_body_lengths_train, question_body_lengths_test,
     "No. of characters", "Density of character",
     "Density of characters in 'question_body' feature\n"),
    (ax[1], question_body_lengths_train_words, question_body_lengths_test_words,
     "No. of Words", "Density of Words",
     "Density of Words in 'question_body' feature\n"),
)

for axis, train_lengths, test_lengths, x_label, y_label, title in panels:
    # Train is drawn first (red) and test on top of it (blue).
    for lengths, split_name, colour in ((train_lengths, "Train", "red"),
                                        (test_lengths, "Test", "blue")):
        sns.histplot(lengths, label=split_name, kde=True, stat="density",
                     linewidth=0, color=colour, ax=axis)
    axis.set_xlabel(x_label, size=12)
    axis.set_ylabel(y_label, size=12)
    axis.set_title(title, size=15)
    axis.legend()

plt.show()


### Observation:
* We can observe that the distributions of both word and character counts are strongly right-skewed.
* Most question bodies are shorter than 2500 characters.
* Most question bodies contain fewer than 1000 words.

## Similarly we will check for `answer` feature

In [None]:
import seaborn as sns


def word_count(sentense):
    """Return the number of whitespace-separated words in ``sentense``.

    ``str.split()`` without an argument splits on any run of whitespace
    (spaces, tabs, line breaks) and ignores leading/trailing whitespace.
    Compared with ``split(" ")`` this means repeated spaces no longer create
    empty "words", words separated by line breaks are counted individually,
    and an empty or blank string counts as 0 words instead of 1.
    """
    return len(sentense.split())


# Side-by-side density plots: characters (left) and words (right) per answer.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))

# Per-row lengths of the raw text, measured in characters and in words.
answer_lengths_train = train_dataset['answer'].apply(len)
answer_lengths_test = test_dataset['answer'].apply(len)
answer_lengths_train_words = train_dataset['answer'].apply(word_count)
answer_lengths_test_words = test_dataset['answer'].apply(word_count)

# One entry per subplot: (axis, train series, test series, x label, y label, title).
panels = (
    (ax[0], answer_lengths_train, answer_lengths_test,
     "No. of characters", "Density of character",
     "Density of characters in 'answer' feature\n"),
    (ax[1], answer_lengths_train_words, answer_lengths_test_words,
     "No. of Words", "Density of Words",
     "Density of Words in 'answer' feature\n"),
)

for axis, train_lengths, test_lengths, x_label, y_label, title in panels:
    # Train is drawn first (red) and test on top of it (blue).
    for lengths, split_name, colour in ((train_lengths, "Train", "red"),
                                        (test_lengths, "Test", "blue")):
        sns.histplot(lengths, label=split_name, kde=True, stat="density",
                     linewidth=0, color=colour, ax=axis)
    axis.set_xlabel(x_label, size=12)
    axis.set_ylabel(y_label, size=12)
    axis.set_title(title, size=15)
    axis.legend()

plt.show()


### Observation:
* As with question_body, the answer length distribution is also right-skewed.
* There may be some extreme outliers whose word/character counts are very high in both the question_body and answer features.

## Analyzing `question_body` and `answer` features sequence length

In [None]:
# Percentiles of the question_body word counts at three resolutions:
# every 10th percentile, every percentile from 90 up, and the 99.x tail.
percentile_groups = (
    range(0, 101, 10),
    range(90, 101),
    [99.1, 99.2, 99.3, 99.4, 99.5, 99.6, 99.7, 99.8, 99.9, 100],
)
for group_index, group in enumerate(percentile_groups):
    if group_index > 0:
        print()  # blank line between groups
    for pct in group:
        value = np.percentile(question_body_lengths_train_words, pct)
        print(f'{pct}th percentile of question_body input sequence {value}')

## **Observation:** 99.9% of the question bodies contain fewer than **3220** words

In [None]:
# Percentiles of the answer word counts at three resolutions:
# every 10th percentile, every percentile from 90 up, and the 99.x tail.
percentile_groups = (
    range(0, 101, 10),
    range(90, 101),
    [99.1, 99.2, 99.3, 99.4, 99.5, 99.6, 99.7, 99.8, 99.9, 100],
)
for group_index, group in enumerate(percentile_groups):
    if group_index > 0:
        print()  # blank line between groups
    for pct in group:
        value = np.percentile(answer_lengths_train_words, pct)
        print(f'{pct}th percentile of answer input sequence {value}')

## **Observation:** 99.9% of the answers contain fewer than **2200** words

# Analyzing `category` Feature

In [None]:
train_dataset['category'].unique()

In [None]:
train_category_feature_count = train_dataset['category'].value_counts()
test_category_feature_count = test_dataset['category'].value_counts()

print("Train category:\n",train_category_feature_count)
print()
print("Test category:\n",test_category_feature_count)

In [None]:
# Bar charts of the per-category row counts, train on the left and test on the right.
figure, ax = plt.subplots(1, 2, figsize=(12, 6))

category_panels = zip(
    ax,
    (train_category_feature_count, test_category_feature_count),
    ('Train', 'Test'),
)
for axis, counts, split_name in category_panels:
    counts.plot(kind='bar', ax=axis)
    axis.set_title(split_name)
    axis.set_xlabel("unique category", size=12)
    axis.set_ylabel("count", size=12)

plt.show()

In [None]:
# Sample stack over flow question and answer
train_dataset[train_dataset['category'] == 'STACKOVERFLOW'].values[11]

In [None]:
# sample science question and answer 
train_dataset[train_dataset['category'] == 'SCIENCE'].values[11]

In [None]:
# sample life art and culture question and answer
train_dataset[train_dataset['category'] == 'LIFE_ARTS'].values[11]

In [None]:
# sample culture question and answer
train_dataset[train_dataset['category'] == 'CULTURE'].values[11]

### Observation:
* Five unique categories are present in the category feature.
* **Technology** and **Stackoverflow** have the highest counts, and the two are related topics.
* **Life_arts** has the lowest count of all categories.
* The category distributions of train and test are the same.
* **Life_arts & culture** follow general english syntax & structure.
* **Science** utilizes latex with expressions prepended and appended with symbol: $
* **Technology & stackoverflow** have code snippets & logs.

# Word cloud

In [None]:
from wordcloud import WordCloud


def plot_wordcloud(text, ax, title=None):
    """Render a word cloud of ``text`` onto the matplotlib axis ``ax``.

    The cloud is generated on a white 1200x1000 canvas. When ``title`` is
    given it is set as the axis title (font size 15); the axis frame and
    ticks are always hidden.
    """
    cloud_settings = dict(
        max_font_size=None,
        background_color='white',
        width=1200,
        height=1000,
    )
    rendered = WordCloud(**cloud_settings).generate(text)
    ax.imshow(rendered)
    if title is not None:
        ax.set_title(title, size=15)
    ax.axis("off")

In [None]:
# 2x3 grid of word clouds: one row per split (train, test) and one column
# per text feature (title, body, answer).
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

cloud_columns = (
    ('question_title', 'Question title'),
    ('question_body', 'Question body'),
    ('answer', 'Answer'),
)
cloud_splits = (('Train', train_dataset), ('Test', test_dataset))

for row, (split_name, frame) in enumerate(cloud_splits):
    for col, (column, caption) in enumerate(cloud_columns):
        # Concatenate every row of the column into one document for the cloud.
        text = ' '.join(frame[column].values)
        plot_wordcloud(text, axes[row][col], f'{split_name} {caption}')

plt.tight_layout()
fig.show()

### Observation:
* We can observe that some of the words match between the train and test sets.
Reference: https://www.kaggle.com/corochann/google-quest-first-data-introduction?scriptVersionId=23910525&cellId=34

# Analyzing labels 

In [None]:
for label in train_dataset.columns[11:]:
    print(f"{label:.20}: no. of unique label values: {len(train_dataset[label].unique())}")

### Observation:
* The output labels are regression (real) values, but their distribution is not continuous.
* Except for the `answer_satisfaction` label, every label takes only a few distinct values: some labels have 9 unique values and some have 5.
* Using this insight, we can apply post-processing to the predictions to get a better score.

In [None]:
for label in train_dataset.columns[11:]:
    sns.histplot(train_dataset[label], label=label, kde=False)
    plt.show()

### Observation:
* **Label values are imbalanced**: for some labels almost all instances share a single value, e.g. **question_type_spelling** and **question_not_really_a_question**, so the distributions of the labels are very dissimilar.

### correlation between target variables

In [None]:
fig, ax = plt.subplots(figsize=(20, 20))

# The target labels are the *columns* from position 11 onwards (the same
# selection the label loops in this notebook use).  ``train_dataset[11:]``
# slices *rows* instead: it drops the first 11 samples and keeps every
# column, including the ID and the text features.
label_columns = train_dataset.columns[11:]
sns.heatmap(train_dataset[label_columns].corr(), linewidths=1, ax=ax, annot_kws={"fontsize":40})
plt.show()

### Observations:
From the above correlation heatmap we can observe that `answer_helpful`, `answer_level_of_information`, `answer_plausible`, `answer_relevance` and `answer_satisfaction` are correlated with one another.

## Analyzing `host` feature

In [None]:
print(f"Total unique host present in the dataset {len(train_dataset['host'].unique())}")

In [None]:
train_host_feature_count = train_dataset['host'].value_counts()


figure, ax = plt.subplots( figsize=(20, 5))

train_host_feature_count.plot(kind='bar', ax=ax)

ax.set_title('Train dataset - count of Q&A collected from each website', size=20)
ax.set_xlabel( "Host" , size = 12 )
ax.set_ylabel( "Count" , size = 12 )

plt.show()

In [None]:
test_host_feature_count = test_dataset['host'].value_counts()
figure, ax = plt.subplots( figsize=(20, 5))
test_host_feature_count.plot(kind='bar', ax=ax)
ax.set_title('Test dataset - count of Q&A collected from each website', size=20)
ax.set_xlabel( "Host" , size = 12 )
ax.set_ylabel( "Count" , size = 12 )
plt.show()


### Observation:
* All questions and answers in the dataset are extracted from **63 websites**.
* Most of the questions and answers come from **stackoverflow.com**, which matches the `category` feature analysis, where most of the categories fall under **technology and stackoverflow**.

# Splitting the data into train and validation

In [None]:
# Names of the 30 target columns: 21 question-level labels followed by
# 9 answer-level labels.
y_columns = [
    # question labels
    'question_asker_intent_understanding',
    'question_body_critical',
    'question_conversational',
    'question_expect_short_answer',
    'question_fact_seeking',
    'question_has_commonly_accepted_answer',
    'question_interestingness_others',
    'question_interestingness_self',
    'question_multi_intent',
    'question_not_really_a_question',
    'question_opinion_seeking',
    'question_type_choice',
    'question_type_compare',
    'question_type_consequence',
    'question_type_definition',
    'question_type_entity',
    'question_type_instructions',
    'question_type_procedure',
    'question_type_reason_explanation',
    'question_type_spelling',
    'question_well_written',
    # answer labels
    'answer_helpful',
    'answer_level_of_information',
    'answer_plausible',
    'answer_relevance',
    'answer_satisfaction',
    'answer_type_instructions',
    'answer_type_procedure',
    'answer_type_reason_explanation',
    'answer_well_written',
]

# Targets and features: X keeps every column that is not a target.
y = train_dataset[y_columns]
X = train_dataset.drop(columns=y_columns)

In [None]:
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split


# Fix the seed so the 90/10 split is identical on every run.  The cached
# web-scraping feature files loaded further down in this notebook are written
# without an index, in the row order of one particular split, so a split that
# changes between runs cannot line up with them.
# NOTE(review): CSVs scraped before this seed was introduced come from an
# unknown split and need to be regenerated once to match - confirm.
X_train_dataset, X_valid_dataset, y_train_dataset, y_valid_dataset = train_test_split(
    X, y, test_size=0.10, random_state=42)

In [None]:
X_train_dataset.shape, X_valid_dataset.shape, y_train_dataset.shape, y_valid_dataset.shape

In [None]:
X_train_dataset

#  **Preprocessing Text Feature**

In [None]:
# https://stackoverflow.com/a/47091490/4084039
import re

# Ordered (pattern, replacement) rules: irregular contractions first, then
# the generic suffix rules.  Both the ASCII (') and the typographic (’)
# apostrophe are accepted.  Every pattern ends in ``\b`` instead of a literal
# space so that contractions followed by punctuation or by the end of the
# string ("I can't.") are expanded too.
_CONTRACTION_RULES = [
    (re.compile(r"\b[Ww]on['’]t\b"), "will not"),
    (re.compile(r"\b[Cc]an['’]t\b"), "can not"),
    (re.compile(r"\b[Yy]['’]all\b"), "you all"),
    (re.compile(r"\b[Yy]a['’]ll\b"), "you all"),
    (re.compile(r"\b[Ii]['’]m\b"), "i am"),
    # "ain't" must be handled before the generic n't rule, which would
    # otherwise turn it into "ai not".
    (re.compile(r"\b[Aa]in['’]t\b"), "is not"),
    (re.compile(r"n['’]t\b"), " not"),
    (re.compile(r"['’]re\b"), " are"),
    (re.compile(r"['’]d\b"), " would"),
    (re.compile(r"['’]ll\b"), " will"),
    (re.compile(r"['’]t\b"), " not"),
    (re.compile(r"['’]ve\b"), " have"),
]


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Args:
        phrase: text that may contain contractions such as "won't", "I'm"
            or "they're", written with either ' or ’.

    Returns:
        The text with every recognised contraction replaced by its expanded
        form (e.g. "can't" -> "can not", "they're" -> "they are").  Text
        without contractions is returned unchanged.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase


def clean_text(x):
    """Normalise punctuation in ``x`` (converted to ``str`` first).

    * ``/``, ``-`` and ``'`` become a single space,
    * ``&`` is padded with a space on each side,
    * every other listed punctuation mark (including the typographic quotes
      “ ” ’) is removed.
    """
    # Characters mapped to None are deleted by str.translate.
    table = dict.fromkeys('?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’')
    # These three are turned into spaces rather than deleted.
    table.update(dict.fromkeys("/-'", ' '))
    table['&'] = ' & '
    return str(x).translate(str.maketrans(table))

def clean_numbers(x):
    """Replace every run of two or more digits by a length-bucket placeholder.

    Runs of 2, 3 and 4 digits become ``12``, ``123`` and ``1234``; runs of 5
    or more digits become ``12345``.  Single digits are left untouched.

    The substitution is done in one pass.  Applying one ``re.sub`` per length
    in sequence re-matches the digit placeholders produced by the earlier
    steps, which turns a 4-digit number into ``1212`` and a 5+-digit number
    into ``12125``.
    """
    return re.sub(r'[0-9]{2,}', lambda match: '12345'[:len(match.group())], x)

In [None]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [None]:
# Combining all the above preprocessing steps
from tqdm import tqdm
def preprocess_text(text_data):
    """Clean every text in ``text_data`` and return the results as a list.

    Each text goes through, in order: contraction expansion, removal of
    literal escape sequences (backslash followed by r, n or a double quote),
    punctuation clean-up, digit-run normalisation, removal of any remaining
    non-alphanumeric characters, stop-word removal, and lower-casing.

    Args:
        text_data: iterable of strings.

    Returns:
        list[str]: one cleaned string per input, in the same order.
    """
    # Set membership is O(1); the module-level stop-word list is O(n) per test.
    stopword_set = set(stopwords)
    preprocessed_text = []
    # tqdm is for printing the status bar
    for sentance in tqdm(text_data):
        # Every step works on the output of the previous one.  Feeding the
        # raw ``sentance`` into each step would discard the earlier results.
        sent = decontracted(sentance)
        # Literal escape sequences are removed before clean_text strips the
        # backslash, otherwise the leftover 'r'/'n' would be glued to a word.
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        sent = clean_text(sent)
        sent = clean_numbers(sent)
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e.lower() not in stopword_set)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [None]:
# Add a cleaned copy of each text column to the train and validation frames.
for frame in (X_train_dataset, X_valid_dataset):
    for column in ('question_title', 'question_body', 'answer'):
        frame[f'preprocessed_{column}'] = preprocess_text(frame[column].values)

In [None]:
# Add a cleaned copy of each text column to the test frame.
for column in ('question_title', 'question_body', 'answer'):
    test_dataset[f'preprocessed_{column}'] = preprocess_text(test_dataset[column].values)

### question_title text after preprocessing

In [None]:
# Text before preprocessing
X_train_dataset['question_title'].values[0]

In [None]:
# Text after preprocessing
X_train_dataset['preprocessed_question_title'].values[0]

### question_body after preprocessing

In [None]:
# Text before preprocessing
X_train_dataset['question_body'].values[0]

In [None]:
# Text after preprocessing
X_train_dataset['preprocessed_question_body'].values[0]

### Answer after preprocessing

In [None]:
# Text before preprocessing
X_train_dataset['answer'].values[0]

In [None]:
# Text after preprocessing
X_train_dataset['preprocessed_answer'].values[0]

# **Feature engineering:**

## Text count based features:

1. Number of characters in the **question_title**
2. Number of characters in the **question_body**
3. Number of characters in the **answer**
4. Number of words in the **question_title**
5. Number of words in the **question_body**
6. Number of words in the **answer**
7. Number of unique words in the **question_title**
8. Number of unique words in the **question_body**
9. Number of unique words in the **answer**


In [None]:
def word_count(sentense):
    """Return the number of whitespace-separated words in ``sentense``.

    ``str.split()`` without an argument splits on any run of whitespace
    (spaces, tabs, line breaks) and ignores leading/trailing whitespace.
    Compared with ``split(" ")`` this means repeated spaces no longer create
    empty "words", words separated by line breaks are counted individually,
    and an empty or blank string counts as 0 words instead of 1.
    """
    return len(sentense.split())

def unique_word_count(sentense):
    """Return the number of distinct whitespace-separated words in ``sentense``.

    Words are compared exactly (case-sensitive).  ``str.split()`` without an
    argument splits on any run of whitespace, so the empty string produced by
    repeated spaces is not counted as a word, words separated by line breaks
    are told apart, and an empty or blank string yields 0.
    """
    return len(set(sentense.split()))


In [None]:
import warnings
warnings.filterwarnings('ignore')

# Count-based meta features, computed on the raw (unprocessed) text of each
# split.  For every statistic below a "<column>_<suffix>" column is added:
#   num_chars        - number of characters in the text
#   num_words        - number of words in the text
#   num_unique_words - number of unique words in the text
count_features = (
    ("num_chars", len),
    ("num_words", word_count),
    ("num_unique_words", unique_word_count),
)
text_columns = ("question_title", "question_body", "answer")

for suffix, counter in count_features:
    # Train first, then validation, then test.
    for frame in (X_train_dataset, X_valid_dataset, test_dataset):
        for column in text_columns:
            frame[f"{column}_{suffix}"] = frame[column].apply(counter)

## TF-IDF based features:

* Word Level N-Gram TF-IDF of **question_title**
* Word Level N-Gram TF-IDF of **question_body**
* Word Level N-Gram TF-IDF of **answer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

vectorizer = TfidfVectorizer(min_df=2)
tsvd = TruncatedSVD(n_components = 128, n_iter=5)


qt_tfidf = vectorizer.fit_transform(X_train_dataset['preprocessed_question_title'].values)
tfidf_question_title_train = tsvd.fit_transform(qt_tfidf)


qb_tfidf = vectorizer.fit_transform(X_train_dataset['preprocessed_question_body'].values)
tfidf_question_body_train = tsvd.fit_transform(qb_tfidf)


ans_tfidf = vectorizer.fit_transform(X_train_dataset['preprocessed_answer'].values)
tfidf_answer_train = tsvd.fit_transform(ans_tfidf)

In [None]:
qt_tfidf = vectorizer.fit_transform(X_valid_dataset['preprocessed_question_title'].values)
tfidf_question_title_valid = tsvd.fit_transform(qt_tfidf)


qb_tfidf = vectorizer.fit_transform(X_valid_dataset['preprocessed_question_body'].values)
tfidf_question_body_valid = tsvd.fit_transform(qb_tfidf)


ans_tfidf = vectorizer.fit_transform(X_valid_dataset['preprocessed_answer'].values)
tfidf_answer_valid = tsvd.fit_transform(ans_tfidf)

In [None]:
qt_tfidf = vectorizer.fit_transform(test_dataset['preprocessed_question_title'].values)
tfidf_question_title_test = tsvd.fit_transform(qt_tfidf)


qb_tfidf = vectorizer.fit_transform(test_dataset['preprocessed_question_body'].values)
tfidf_question_body_test = tsvd.fit_transform(qb_tfidf)


ans_tfidf = vectorizer.fit_transform(test_dataset['preprocessed_answer'].values)
tfidf_answer_test = tsvd.fit_transform(ans_tfidf)

In [None]:
X_train_dataset["tfidf_question_title"] = list(tfidf_question_title_train)
X_train_dataset["tfidf_question_body"] = list(tfidf_question_body_train)
X_train_dataset["tfidf_answer"] = list(tfidf_answer_train)

In [None]:
X_valid_dataset["tfidf_question_title"] = list(tfidf_question_title_valid)
X_valid_dataset["tfidf_question_body"] = list(tfidf_question_body_valid)
X_valid_dataset["tfidf_answer"] = list(tfidf_answer_valid)

In [None]:
test_dataset["tfidf_question_title"] = list(tfidf_question_title_test)
test_dataset["tfidf_question_body"] = list(tfidf_question_body_test)
test_dataset["tfidf_answer"] = list(tfidf_answer_test)

## Features using web scraping 


## `answer_user_page` features:


In [None]:
!pip install bs4

In [None]:

from bs4 import BeautifulSoup
from urllib import request


def get_user_rating(url, timeout=10):
    """Scrape reputation and badge counts from a user profile page.

    Args:
        url: address of the user page to download.
        timeout: seconds to wait for the server before giving up, so one
            unresponsive host cannot stall a long scraping loop.

    Returns:
        ``[reputation, gold, silver, bronze]`` as integers.  A badge count
        that is missing from the page or cannot be parsed is reported as 0.
        If the page cannot be downloaded or the reputation cannot be read,
        ``[0, 0, 0, 0]`` is returned (best effort, never raises for ordinary
        errors).
    """
    def _to_int(tag):
        # Counts are rendered with thousands separators, e.g. "12,345".
        return int(tag.text.strip().replace(',', ''))

    try:
        # The context manager closes the HTTP response even if parsing fails.
        with request.urlopen(url, timeout=timeout) as response:
            page = response.read()
        src = BeautifulSoup(page, 'html.parser')

        reputation = _to_int(src.find_all("div", class_ = 'fs-body3 fc-dark')[0])

        # The first three matching elements are read as gold, silver, bronze.
        badge_tags = src.find_all('div', class_='fs-title fw-bold fc-black-800')
        badges = []
        for position in range(3):
            try:
                badges.append(_to_int(badge_tags[position]))
            except (IndexError, ValueError):
                badges.append(0)

        output = [reputation, *badges]
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long scraping run.
        output = [0]*4

    return output
'''
data = []
for url in tqdm(X_train_dataset['answer_user_page']):
    #print(url)
    data.append(get_user_rating(url))
    columns = ['reputation', 'gold', 'silver', 'bronze']  
scraped = pd.DataFrame(data, columns=columns)
scraped.to_csv(f'train_web_scrap_features.csv', index=False)

data = []
for url in tqdm(X_valid_dataset['answer_user_page']):
    #print(url)
    data.append(get_user_rating(url))
    columns = ['reputation', 'gold', 'silver', 'bronze']  
scraped = pd.DataFrame(data, columns=columns)
scraped.to_csv(f'valid_web_scrap_features.csv', index=False)
'''

train_web_scraping_feature = pd.read_csv('../input/feature-engineering/train_web_scrap_features.csv')
valid_web_scraping_feature = pd.read_csv('../input/feature-engineering/valid_web_scrap_features.csv')

In [None]:
train_web_scraping_feature

In [None]:
valid_web_scraping_feature

### References for feature engineering:
* https://www.kaggle.com/c/google-quest-challenge/discussion/130041 - meta features.
* https://www.kaggle.com/codename007/start-from-here-quest-complete-eda-fe?scriptVersionId=25618132&cellId=65 - tfidf, count based features.
* https://towardsdatascience.com/hands-on-transformers-kaggle-google-quest-q-a-labeling-affd3dad7bcb - web scraping features

> **Note:** We experimented with a bidirectional LSTM as the base model and achieved a Spearman score of 0.2712; we then tried the Universal Sentence Encoder and achieved a best Spearman score of 0.37123. Now let's experiment with transformer-based models.

In [None]:
from prettytable import PrettyTable


# Summary of the earlier experiments.  The first column lists both Bi-LSTM
# and USE (Universal Sentence Encoder) runs, so it is headed "Model".
myTable = PrettyTable(["Model", "Features", "Spearman score"])

experiment_results = [
    ("Bi-LSTM", "Three basic features", "0.27561"),
    ("Bi-LSTM", "Three basic features + 18 FE Features(meta, TF-IDF, Web scraping)", "0.00253"),
    ("Bi-LSTM", "Three basic features + 13 FE Features(meta, Web scraping)", "0.01255"),
    ("Bi-LSTM", "Three basic features + 9 FE Features(meta features)", "0.28656"),
    ("Bi-LSTM", "Three basic features + 13 FE features with 100 dim embeddings(meta, Web scraping)", "-0.0041"),
    ("USE", "Three basic features", "0.33029"),
    ("USE", "Three basic features + 9 Meta Features", "0.37133"),
    ("USE", "Three basic features + L2 distance feature + 9 meta Features", "0.36575"),
    ("USE", "Three basic features + cosine distance + 9 Meta features", "0.37153"),
    ("USE", "Three basic features + L2 distance +cosine distance + 9 Meta features", "0.37061"),
]
for result_row in experiment_results:
    myTable.add_row(list(result_row))


print(myTable)

In [None]:
!pip install bert-for-tf2

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import tensorflow as tf
from bert import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil

np.set_printoptions(suppress=True)

In [None]:
hub_url_bert = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
bert_layer = hub.KerasLayer(hub_url_bert, trainable=False)


In [None]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()


tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

print("Vocab size:", len(tokenizer.vocab))

In [None]:
X_train_dataset.shape, X_valid_dataset.shape, test_dataset.shape

In [None]:
X_train_dataset.columns

# Transforming input features for bert model

### Functions to get `Input Ids` , `Input mask`, `Input segment` for bert

In [None]:
def extract_masks(tokens, max_seq_length):
    """Build the attention mask for a token sequence.

    Returns a list of length ``max_seq_length`` holding 1 for every real
    token followed by 0 for every padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_real = len(tokens)
    n_pad = max_seq_length - n_real
    if n_pad < 0:
        raise IndexError("Token length more than max seq length!")

    mask = [1] * n_real
    mask.extend([0] * n_pad)
    return mask



def extract_segments(tokens, max_seq_length):
    """Build the segment (token-type) ids for a token sequence.

    Every token up to and including the second "[SEP]" gets segment 0;
    every token after the second "[SEP]" gets segment 1. Padding positions
    are filled with 0 so the result has length ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    seps_seen = 0
    for tok in tokens:
        # The separator itself still belongs to the segment it closes,
        # so the id is recorded before the counter is advanced.
        segment_ids.append(1 if seps_seen >= 2 else 0)
        if tok == "[SEP]":
            seps_seen += 1

    segment_ids.extend([0] * (max_seq_length - n_tokens))
    return segment_ids



def extract_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, right-padded to ``max_seq_length``.

    Args:
        tokens: list of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)`` that
            returns a list of ids.
        max_seq_length: length of the returned list.

    Returns:
        The token ids followed by 0s (presumably the [PAD] id -- confirm
        against the vocabulary) up to ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``. This
            matches ``extract_masks`` / ``extract_segments``; without the
            check an over-long input silently produced an over-long,
            unpadded id list.
    """
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids + [0] * (max_seq_length - len(token_ids))

### In the below `_trim_input` function:

* If the combined input (title + question + answer + 4 special tokens) has more than 512 tokens, 
it is trimmed down to exactly 512. By default the budget is 30 tokens for the title, 239 for the 
question and 239 for the answer; any budget that a shorter field does not use is handed over to the other fields.

> **Ex.** suppose the title has 10 tokens, the question 200 tokens and the answer 700 tokens. 
The 20 unused title tokens are split between question and answer (both budgets become 249), and the 
49 tokens the question does not need go to the answer, so the answer keeps its first 298 tokens: 
10 + 200 + 298 + 4 = 512. The remaining [700 - 298 = 402] tokens at the end of the answer are dropped.

* Note that the function keeps only the *beginning* of each field. Keeping both the beginning and the end 
of a long text is a common alternative, because the beginning usually describes what the text is all about 
and the end usually holds the conclusion (both closely related to the target features we need to predict), 
but that is not what this implementation does.

In [None]:
def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=30, q_max_len=239, a_max_len=239):
    """Tokenize title, question and answer and trim them to fit one sequence.

    Uses the module-level ``tokenizer``. When the three token lists plus 4
    extra positions (one "[CLS]" and three "[SEP]" in the assembled BERT
    sequence) fit into ``max_sequence_length`` they are returned unchanged.
    Otherwise each list is cut to a budget, keeping only its leading tokens:

    * unused title budget is split between answer (floor) and question (ceil);
    * unused answer budget is given to the question, or else unused question
      budget is given to the answer.

    Returns:
        Tuple ``(title_tokens, question_tokens, answer_tokens)``.

    Raises:
        ValueError: if the computed budgets plus 4 do not add up to exactly
            ``max_sequence_length``.
    """
    title_tokens = tokenizer.tokenize(title)
    question_tokens = tokenizer.tokenize(question)
    answer_tokens = tokenizer.tokenize(answer)

    n_title = len(title_tokens)
    n_question = len(question_tokens)
    n_answer = len(answer_tokens)

    # Nothing to trim: everything fits together with the 4 special tokens.
    if n_title + n_question + n_answer + 4 <= max_sequence_length:
        return title_tokens, question_tokens, answer_tokens

    # Title: keep it whole if it is under budget and donate the remainder.
    if n_title < t_max_len:
        spare = t_max_len - n_title
        keep_title = n_title
        a_max_len += floor(spare / 2)
        q_max_len += ceil(spare / 2)
    else:
        keep_title = t_max_len

    # Answer / question: whichever is under budget donates to the other.
    if n_answer < a_max_len:
        keep_answer = n_answer
        keep_question = q_max_len + (a_max_len - n_answer)
    elif n_question < q_max_len:
        keep_answer = a_max_len + (q_max_len - n_question)
        keep_question = n_question
    else:
        keep_answer, keep_question = a_max_len, q_max_len

    total = keep_title + keep_answer + keep_question + 4
    if total != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d" 
                         % (max_sequence_length, total))

    return (title_tokens[:keep_title],
            question_tokens[:keep_question],
            answer_tokens[:keep_answer])

### In the below `_convert_to_bert_inputs` function

* Concatenate the three text features into one single sequence and convert it to BERT-compatible inputs (ids, masks and segments)

In [None]:
def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    """Assemble one BERT sequence from three token lists and encode it.

    The sequence is "[CLS]" + title + "[SEP]" + question + "[SEP]" +
    answer + "[SEP]".

    Returns:
        ``[input_ids, input_masks, input_segments]`` as produced by
        ``extract_ids``, ``extract_masks`` and ``extract_segments``.
    """
    sequence = ["[CLS]"]
    for part in (title, question, answer):
        sequence.extend(part)
        sequence.append("[SEP]")

    return [
        extract_ids(sequence, tokenizer, max_sequence_length),
        extract_masks(sequence, max_sequence_length),
        extract_segments(sequence, max_sequence_length),
    ]

In [None]:
# Transforming bert training dataset to bert compatible input

input_ids, input_masks, input_segments = [], [], []
max_sequence_length = 512

# Iterate the three text columns directly instead of DataFrame.iterrows()
# (which builds a Series per row), and pass the row count to tqdm so the
# progress bar can show percentage and ETA.
train_text_columns = zip(X_train_dataset["question_title"],
                         X_train_dataset["question_body"],
                         X_train_dataset["answer"])
for raw_title, raw_question, raw_answer in tqdm(train_text_columns,
                                                total=len(X_train_dataset)):
    # Tokenize and trim the three fields so they fit in one sequence.
    t, q, a = _trim_input(raw_title, raw_question, raw_answer, max_sequence_length)

    ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

# Model input: [ids, masks, segments], one int32 array per input.
X_train_bert =  [np.asarray(input_ids, dtype=np.int32), 
                np.asarray(input_masks, dtype=np.int32), 
                np.asarray(input_segments, dtype=np.int32)]

In [None]:
# Transforming bert validation dataset to bert compatible input

input_ids, input_masks, input_segments = [], [], []
max_sequence_length = 512

# Iterate the three text columns directly instead of DataFrame.iterrows()
# (which builds a Series per row), and pass the row count to tqdm so the
# progress bar can show percentage and ETA.
valid_text_columns = zip(X_valid_dataset["question_title"],
                         X_valid_dataset["question_body"],
                         X_valid_dataset["answer"])
for raw_title, raw_question, raw_answer in tqdm(valid_text_columns,
                                                total=len(X_valid_dataset)):
    # Tokenize and trim the three fields so they fit in one sequence.
    t, q, a = _trim_input(raw_title, raw_question, raw_answer, max_sequence_length)

    ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

# Model input: [ids, masks, segments], one int32 array per input.
X_valid_bert =  [np.asarray(input_ids, dtype=np.int32), 
                np.asarray(input_masks, dtype=np.int32), 
                np.asarray(input_segments, dtype=np.int32)]




In [None]:
# Transforming bert test dataset to bert compatible input

input_ids, input_masks, input_segments = [], [], []
max_sequence_length = 512

# Iterate the three text columns directly instead of DataFrame.iterrows()
# (which builds a Series per row), and pass the row count to tqdm so the
# progress bar can show percentage and ETA.
test_text_columns = zip(test_dataset["question_title"],
                        test_dataset["question_body"],
                        test_dataset["answer"])
for raw_title, raw_question, raw_answer in tqdm(test_text_columns,
                                                total=len(test_dataset)):
    # Tokenize and trim the three fields so they fit in one sequence.
    t, q, a = _trim_input(raw_title, raw_question, raw_answer, max_sequence_length)

    ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
    input_ids.append(ids)
    input_masks.append(masks)
    input_segments.append(segments)

# Model input: [ids, masks, segments], one int32 array per input.
X_test_bert =  [np.asarray(input_ids, dtype=np.int32), 
                np.asarray(input_masks, dtype=np.int32), 
                np.asarray(input_segments, dtype=np.int32)]

In [None]:
len(X_train_bert), X_train_bert[0].shape, X_train_bert[1].shape, X_train_bert[2].shape

# Fine-tuning bert model

In [None]:
from scipy.stats import spearmanr

class SpearmanCallback(tf.keras.callbacks.Callback):
    """Print the mean column-wise Spearman correlation on held-out data.

    Args:
        validation_data: pair ``(x_val, y_val)``; ``x_val`` is passed to
            ``model.predict`` and ``y_val`` is indexed as a 2-D array with
            one column per target.
    """

    def __init__(self, validation_data):
        # Let the Keras base class initialise its own attributes first.
        super().__init__()
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the averaged score.

        ``logs`` is accepted for Keras compatibility and is not read or
        modified (default changed from a shared mutable ``{}`` to ``None``).
        Returns the mean correlation.
        """
        y_pred_val = self.model.predict(self.x_val)
        n_rows, n_targets = y_pred_val.shape

        # Tiny Gaussian noise is added to each prediction column before
        # ranking -- presumably so a constant column does not yield an
        # undefined correlation.
        per_target_rho = [
            spearmanr(
                self.y_val[:, ind],
                y_pred_val[:, ind] + np.random.normal(0, 1e-7, n_rows),
            ).correlation
            for ind in range(n_targets)
        ]
        rho_val = np.mean(per_target_rho)

        print(rho_val)
        print('\nval_spearman-corr: %s' % (str(round(rho_val, 6))), end=100*' '+'\n')
        return rho_val

In [None]:
# Reset Keras' global state before building the BERT encoder model.
tf.keras.backend.clear_session()

max_seq_length = 512

# Three int32 inputs of length max_seq_length: token ids, padding mask and
# segment ids, in the order the hub layer is called with below.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")

input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")

segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

# Trainable copy of the encoder (hub version /1), so its weights are updated
# during fitting.
# NOTE(review): the tokenizer was built from hub version /3 of this module --
# confirm both versions use the same vocabulary.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)
# Only sequence_output is used below; pooled_output is left unused.
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# Wrap the hub layer as a Keras model that maps the three inputs to the
# sequence output (presumably one vector per token -- TODO confirm shape).
bert_model = tf.keras.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=sequence_output)


In [None]:

# Baseline fine-tuning model: bert_model -> average pooling -> dropout ->
# 30 sigmoid outputs.
input_word_ids = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_word_ids')
input_masks = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_masks')
input_segments = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_segments')


sequence_output = bert_model([input_word_ids, input_masks, input_segments])

# NOTE(review): the pooling layer is called without a mask argument, so
# padded positions are presumably included in the average -- confirm whether
# masked pooling was intended.
x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
# One sigmoid unit per target (30 outputs).
out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

model = tf.keras.Model(
    inputs=[input_word_ids, input_masks, input_segments], outputs=out
)
    
model.summary()

In [None]:
tf.keras.utils.plot_model(model)

In [None]:
np.array(y_valid_dataset)

In [None]:
# Callback that prints the mean Spearman correlation on the validation split
# at the end of every epoch.
custom_callback = SpearmanCallback(
        validation_data=(X_valid_bert, np.array(y_valid_dataset))
)


In [None]:
# Targets as a plain ndarray for Keras.
y_train_dataset = np.asarray(y_train_dataset)

# Binary cross-entropy on the sigmoid outputs, Adam with learning rate 3e-5.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),  # 3e-5
    loss='binary_crossentropy',
)

# Fine-tune for 10 epochs in batches of 4; validation loss is tracked by
# Keras and the Spearman score is printed by the custom callback.
history = model.fit(
    x=X_train_bert,
    y=y_train_dataset,
    batch_size=4,
    epochs=10,
    validation_data=(X_valid_bert, np.array(y_valid_dataset)),
    callbacks=[custom_callback],
)

In [None]:
# Plot every series recorded in history.history (one point per epoch).
pd.DataFrame(history.history).plot()
plt.title("Fine tuning Bert model")

# Tweaking bert model

* Increase the dropout rate to reduce overfitting.
* Add regularization to the layers to reduce overfitting.
* Add one or more dense layers before the final dense layer.

In [None]:
# Building the model
#
# Tweaked head on top of bert_model: stronger dropout, two L1-regularised
# hidden layers and one plain hidden layer before the 30 sigmoid outputs.
#
# NOTE(review): `bert_model` is the same trainable object used by the previous
# experiment, so this run presumably starts from already fine-tuned encoder
# weights -- rebuild `bert_model` first if a like-for-like comparison with the
# baseline is intended.


input_word_ids = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_word_ids')
input_masks = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_masks')
input_segments = tf.keras.layers.Input(
    (512,), dtype=tf.int32, name='input_segments')


sequence_output = bert_model([input_word_ids, input_masks, input_segments])

x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.5)(x) # increased from 0.2 to 0.5

# The hidden layers need a non-linearity: Dense defaults to a linear
# activation, and a stack of linear layers is still a single linear map, so
# without it the extra layers add no modelling capacity.
x = tf.keras.layers.Dense(128, activation="relu",
                          kernel_regularizer=tf.keras.regularizers.l1())(x)
x = tf.keras.layers.Dense(64, activation="relu",
                          kernel_regularizer=tf.keras.regularizers.l1())(x)

x = tf.keras.layers.Dense(32, activation="relu")(x)

# One sigmoid unit per target (30 outputs).
out = tf.keras.layers.Dense(30, activation="sigmoid", name="dense_output")(x)

model = tf.keras.Model(
    inputs=[input_word_ids, input_masks, input_segments], outputs=out
)
    
model.summary()

In [None]:
tf.keras.utils.plot_model(model)

In [None]:
# Fresh callback instance for the tweaked model; prints the mean Spearman
# correlation on the validation split at the end of every epoch.
custom_callback = SpearmanCallback(
        validation_data=(X_valid_bert, np.array(y_valid_dataset))
)


In [None]:
# Same loss and optimizer settings as the baseline run.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),  # 3e-5
    loss='binary_crossentropy',
)

# Train the tweaked model for 30 epochs in batches of 4.
history = model.fit(
    x=X_train_bert,
    y=y_train_dataset,
    batch_size=4,
    epochs=30,
    validation_data=(X_valid_bert, np.array(y_valid_dataset)),
    callbacks=[custom_callback],
)

In [None]:
# Plot every series recorded in history.history (one point per epoch).
pd.DataFrame(history.history).plot()
plt.title("Tweaking Bert model")

### Reference: 
* https://www.kaggle.com/abhinand05/bert-for-humans-tutorial-baseline-version-2