In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# SECURITY: the W&B API key must never be committed to the notebook. Provide it
# through the WANDB_API_KEY environment variable (for example via the Kaggle
# "Secrets" add-on). Any key that was previously hardcoded here must be treated
# as leaked and revoked in the W&B account settings.
import wandb

if not os.environ.get("WANDB_API_KEY"):
    # No credentials available: record the run locally instead of prompting for
    # a login or failing. An explicitly configured WANDB_MODE is respected.
    os.environ.setdefault("WANDB_MODE", "offline")

# Start the run; the config records which BERT output is used as features.
wandb.init(project="google-quest-q-a-labelling", config={'BERT_features': 'last_hidden_CLS'})

In [None]:
# Logs a placeholder metric named 'test' with value 0 to the active W&B run
# (presumably a smoke test that logging works — safe to remove once confirmed).
wandb.log({'test': 0})

In [None]:
# Competition CSV with input columns plus target columns (read into `train_data`).
TRAIN_PATH = "/kaggle/input/google-quest-challenge/train.csv"

# Competition CSV with input columns only (read into `test_data`).
TEST_PATH = "/kaggle/input/google-quest-challenge/test.csv"

# Template whose column names/order the final submission is checked against.
SAMPLE_SUBMISSION_PATH = "/kaggle/input/google-quest-challenge/sample_submission.csv"

# Output file, written relative to the current working directory.
SUBMISSION_PATH = "submission.csv"

# NOTE(review): MODEL_TYPE and BERT_BASE_UNCASED_TOKENIZER_LOCATION are not
# referenced anywhere in the visible notebook; both model and tokenizer are
# loaded from BERT_BASE_UNCASED_LOCATION.
MODEL_TYPE = 'bert-base-uncased'
BERT_BASE_UNCASED_LOCATION = "/kaggle/input/bert-base-uncased"
BERT_BASE_UNCASED_TOKENIZER_LOCATION = "/kaggle/input/bert-base-uncased/vocab.txt"

In [None]:
import pandas as pd

# Read the test, train and sample-submission CSVs (in that order) into DataFrames.
test_data, train_data, sample_submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (TEST_PATH, TRAIN_PATH, SAMPLE_SUBMISSION_PATH)
)

In [None]:
# Inputs are every column of the test set; targets are the columns that exist
# only in the training set.
input_columns = list(test_data.columns)
prediction_columns = [name for name in train_data.columns if name not in test_data.columns]
print(prediction_columns)
print(input_columns)

In [None]:
# Split the training frame into raw inputs (X) and target columns (y).
X = train_data[input_columns]
y = train_data[prediction_columns]

In [None]:
# TODO: DELETE!
# X = X.iloc[:101,:]
# y = y.iloc[:101,:]


In [None]:
# Stack the train and test inputs (train rows first); the original row labels
# of each frame are kept, so labels repeat across the two parts.
X_full = pd.concat([frame[input_columns] for frame in (train_data, test_data)])

In [None]:
#  BERT features
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT weights and tokenizer from the local Kaggle dataset
# directory (no download is attempted from this path).
model = BertModel.from_pretrained(BERT_BASE_UNCASED_LOCATION)
# Move the model to the GPU. NOTE(review): this requires a CUDA device; there
# is no CPU fallback here.
model.to('cuda')
tokenizer = BertTokenizer.from_pretrained(BERT_BASE_UNCASED_LOCATION)

# Text columns that are embedded with BERT, in the order their feature blocks
# are concatenated.
BERT_columns = ['question_title', 'question_body', 'answer']
# Used by BERT_features_e2e to slice each text to its first 512 *characters*
# before tokenization.
BERT_max_sentence_size = 512
# Width of one per-column embedding block; used to slice the concatenated
# feature frame back into per-column blocks before PCA.
BERT_embedding_size = 768

def extract_BERT_last_hidden_CLS(outputs):
    """Return the first token vector of the first sequence as a NumPy array.

    ``outputs[0]`` is taken as the model's first output tensor; element
    ``[0][0]`` of it (first batch item, first position) is detached from the
    autograd graph, moved to host memory and converted to ``numpy``.
    """
    first_output = outputs[0]
    first_sequence = first_output[0]
    cls_vector = first_sequence[0]
    return cls_vector.cpu().detach().numpy()

def BERT_tokens(string):
    """Tokenize ``string`` for BERT and place the tensors on the model's device.

    Args:
        string: Text to tokenize.

    Returns:
        The tokenizer's encoding (PyTorch tensors), with every tensor moved to
        the device the model currently lives on.
    """
    # Truncate at the token level: callers only cut the text to
    # BERT_max_sentence_size *characters*, which can still produce more than
    # that many tokens once the special tokens are added, and that would
    # overflow the model's position embeddings.
    tokens = tokenizer(
        string,
        return_tensors="pt",
        truncation=True,
        max_length=BERT_max_sentence_size,
    )
    # Follow the model instead of hardcoding 'cuda' so inputs and weights can
    # never end up on different devices.
    device = model.device
    for k in tokens.keys():
        tokens[k] = tokens[k].to(device)
    return tokens


def get_BERT_features(string):
    """Embed ``string`` with BERT and return its first-token vector.

    Args:
        string: Text to embed.

    Returns:
        NumPy array produced by ``extract_BERT_last_hidden_CLS`` from the
        model outputs.
    """
    import torch

    inputs = BERT_tokens(string)
    # Feature extraction only: disable autograd so no computation graph is
    # built (and kept in GPU memory) for each forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    return extract_BERT_last_hidden_CLS(outputs)

def BERT_features_e2e(X, trained_pca=None):
    """Embed the BERT text columns of ``X`` and reduce each block with PCA.

    Args:
        X: DataFrame containing every column listed in ``BERT_columns``.
        trained_pca: Optional already-fitted PCA. When ``None``, a new
            70-component PCA is fitted on the first column's embeddings.

    Returns:
        ``(reduced_BERT_features, trained_pca)`` where the first item is a
        DataFrame with one 70-column block per entry of ``BERT_columns``
        (rows numbered 0..n-1 in the order of ``X``) and the second is the
        PCA that was used.
    """
    from tqdm import tqdm
    from sklearn.decomposition import PCA

    _BERT_features = []
    for col in BERT_columns:
        # Iterate over the values positionally. Looking rows up by label
        # (X[col][idx]) fails or picks the wrong rows whenever X does not
        # carry a default 0..n-1 index (e.g. after filtering or shuffling).
        texts = X[col].tolist()
        features = [
            # Cheap character-level cut before tokenization.
            get_BERT_features(text[:BERT_max_sentence_size])
            for text in tqdm(texts)
        ]
        _BERT_features.append(pd.DataFrame(features))
    BERT_features = pd.concat(_BERT_features, axis=1)

    _reduced_BERT_features = []

    for i, col in enumerate(BERT_columns):
        # Slice this column's embedding block back out of the wide frame.
        pca_input_features = BERT_features.iloc[:, i*BERT_embedding_size:(i+1)*BERT_embedding_size]
        if trained_pca is None:
            # NOTE(review): this only triggers for the first column; the PCA
            # fitted here is then reused unchanged for the remaining columns.
            # Confirm a single shared PCA (rather than one per column) is
            # intended before changing it — the return value is a single PCA.
            trained_pca = PCA(n_components=70).fit(pca_input_features)
        features = trained_pca.transform(pca_input_features)
        _reduced_BERT_features.append(pd.DataFrame(features))

    reduced_BERT_features = pd.concat(_reduced_BERT_features, axis=1)
    return reduced_BERT_features, trained_pca

# reduced_BERT_features = BERT_features_e2e(X)

In [None]:
# non-BERT features

def non_BERT_features_e2e(X, X_full):
    """Build the hand-crafted (non-BERT) features for ``X``.

    A one-hot encoding of ``host``/``category`` (fitted on ``X_full``) is
    computed and sanity-checked, but only the last three columns of the
    combined frame — the character lengths of ``question_title``,
    ``question_body`` and ``answer`` — are returned.
    """
    from sklearn import preprocessing

    categorical_columns = ['host', 'category']

    # Fit on the full (train + test) data so every category is known.
    encoder = preprocessing.OneHotEncoder(drop='first')
    encoder.fit(X_full[categorical_columns])
    one_hot = encoder.transform(X[categorical_columns]).toarray()

    # drop='first' removes one column per encoded feature, hence the "- 2".
    expected_width = len(X_full['host'].unique()) + len(X_full['category'].unique()) - 2
    assert one_hot.shape[1] == expected_width
    assert one_hot.shape[0] == len(X['host'])

    text_lengths = [
        X[text_column].map(len)
        for text_column in ('question_title', 'question_body', 'answer')
    ]
    combined = pd.concat([pd.DataFrame(one_hot), *text_lengths], axis=1)

    # Keep only the three length columns; the one-hot block is discarded.
    return combined.iloc[:, -3:]

# non_embedding_non_BERT_features = non_BERT_features_e2e(X, X_full)

In [None]:
def features_e2e(X, X_full, pca=None):
    """Return ``(features, pca)`` for ``X``.

    The feature frame is the non-BERT features followed by the PCA-reduced
    BERT features, joined column-wise; ``pca`` is whatever
    ``BERT_features_e2e`` returned (the one passed in, or a newly fitted one).
    """
    bert_part, pca = BERT_features_e2e(X, pca)
    handcrafted_part = non_BERT_features_e2e(X, X_full)
    combined = pd.concat([handcrafted_part, bert_part], axis=1)
    return combined, pca

In [None]:
# train test split

# Turn the raw training inputs into the numeric feature matrix and keep the
# fitted PCA for the submission step.
# NOTE: this rebinds X from raw inputs to features, so the cell is not
# idempotent — re-create X from train_data before running it a second time.
X, pca = features_e2e(X, X_full)

assert X.shape[0] == y.shape[0]

from sklearn.model_selection import train_test_split

# Fixed seed so the hold-out split (and therefore the validation score logged
# to W&B) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

assert len(X) == len(X_train) + len(X_test)

In [None]:
# model

from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# Predict the held-out split; wrapped in a DataFrame so it can be indexed
# positionally during evaluation.
y_pred = pd.DataFrame(linear_regression.predict(X_test))

assert len(y_pred) == len(y_test)

In [None]:
# Best-effort diagnostics: a ValueError from the W&B plotting helper must not
# abort the notebook, but it should be visible rather than silently dropped.
try:
    wandb.sklearn.plot_regressor(linear_regression, X_train, X_test, y_train, y_test, 'LinearRegression')
except ValueError as plot_error:
    print("Skipping W&B regressor plots: {}".format(plot_error))
#help(wandb.sklearn.plot_regressor)

In [None]:
# evaluation
from matplotlib import pyplot as plt
import math
from scipy import stats

def evaluate(y_test, y_pred):
    """Score predictions with the mean column-wise Spearman correlation.

    Args:
        y_test: DataFrame of true target values.
        y_pred: DataFrame of predictions with the same shape; columns are
            matched by position, not by name.

    Returns:
        ``(score, correlations)``: ``correlations`` holds one Spearman
        coefficient per column (NaN where it is undefined, e.g. a constant
        column); ``score`` is the mean of the non-NaN coefficients, or NaN
        when no coefficient is defined.

    Raises:
        AssertionError: If the two frames differ in shape.
    """
    assert y_test.shape == y_pred.shape
    correlations = [
        stats.spearmanr(y_test.iloc[:, col_index], y_pred.iloc[:, col_index]).correlation
        for col_index in range(y_test.shape[1])
    ]
    # Undefined correlations are excluded from the mean. Guard the empty case
    # so an all-NaN result yields NaN instead of a ZeroDivisionError.
    defined = [corr for corr in correlations if not math.isnan(corr)]
    score = sum(defined) / len(defined) if defined else float('nan')
    return score, correlations


# Score the hold-out predictions, show the result and record it in W&B.
validation_score, correlations = evaluate(y_test, y_pred)
print(f"score = {validation_score}")
wandb.log(dict(validation_score=validation_score))


def visualise_correlations(correlations):
    """Print the worst/best predicted target columns and plot all correlations.

    Args:
        correlations: One correlation per entry of ``prediction_columns``,
            in the same order; NaN entries are ignored for the ranking.
    """
    # Rank column *positions* by correlation. Ranking positions (instead of
    # looking values up with list.index) keeps tied values mapped to distinct
    # columns, and NaNs are dropped first because they make sorting unreliable.
    ranked_positions = sorted(
        (position for position, corr in enumerate(correlations) if not math.isnan(corr)),
        key=lambda position: correlations[position],
    )
    worst_columns = [prediction_columns[position] for position in ranked_positions[:3]]
    top_columns = [prediction_columns[position] for position in ranked_positions[-3:]]
    print("worst predicted columns are: " + ', '.join(worst_columns))
    print("top predicted columns are:  " + ',  '.join(top_columns))
    plt.plot(correlations, marker='o')
    # NOTE(review): negative correlations fall outside this y-range.
    plt.ylim(0, 1)
    plt.title("Correlation score for each column", c="w")
    plt.show()
    
# Report and plot the per-column correlations computed by evaluate() above.
visualise_correlations(correlations)

## Submission

In [None]:
#model.eval()

In [None]:
# Build the feature matrix for the competition test set, reusing the PCA that
# was fitted on the training data; the PCA returned by the call is ignored.
test_X, _ = features_e2e(test_data[input_columns], X_full, pca)

In [None]:
# predict
def _bound_prediction(value):
    """Round to 3 decimals, then replace anything not below 1 with 0.999 and
    anything not above 0 with 0.001."""
    value = round(value, 3)
    value = value if value < 1 else 0.999
    value = value if value > 0 else 0.001
    return value


# Predict the test set and label the outputs with the submission's target names
# (every sample-submission column except the leading id column).
target_names = sample_submission_data.columns[1:]
y_pred = pd.DataFrame(linear_regression.predict(test_X))
y_pred = y_pred.rename(columns=dict(enumerate(target_names)))

y_pred.insert(0, 'qa_id', test_data['qa_id'])

# Keep every predicted value strictly inside (0, 1).
for target in y_pred.columns[1:]:
    y_pred[target] = y_pred[target].apply(_bound_prediction)

# The submission must match the sample file's shape and column order exactly.
assert y_pred.shape == sample_submission_data.shape
assert len(y_pred.columns) == len(sample_submission_data.columns)
assert all(ours == theirs for ours, theirs in zip(y_pred.columns, sample_submission_data.columns))

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV so
# the columns match the sample submission.
y_pred.to_csv(SUBMISSION_PATH, index=False)