## Referenced from https://github.com/amaiya/ktrain

### Example notebook for BERT text classification using ktrain package. 
Example task: predict deception ("Straightforward" or "Cassandra") using input text.

### Notes:
- BERT models take an extremely long time to train; even the supposedly faster variant (DistilBERT, which is the one used in this notebook) takes quite some time.
- However, validation accuracy looks promising. If we use these BERT methods for the individual models, we need to save the trained models and load them each time.

In [32]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import ktrain
from ktrain import text

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Input, InputLayer, Dropout, Dense, Flatten, Embedding, Add, Concatenate
from tensorflow.keras import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

import numpy as np
import pandas as pd

import import_ipynb
import metadata_options
# NOTE(review): the joint-model cell calls models_nn.create_joint_model but the
# module was never imported; presumably a sibling notebook loaded through
# import_ipynb like metadata_options -- confirm the module name.
import models_nn

In [20]:
# Pretrained checkpoint name passed to ktrain's text.Transformer in the
# classifier cells below. Options listed by the original author:
#   distilbert-base-uncased, bert-base-uncased, albert-base-v2, roberta-base
k_train_model = "distilbert-base-uncased"

In [11]:
# Load the worker-level CSV (described by the author as the data with
# Throughput & WorkTime) and drop every row containing a missing value.
df = pd.read_csv('./data/kokil dec 6 reprepare/conf_pc_worker_sem.csv').dropna()

In [12]:
# Weighted one-hot-encoding options.
# The option codes are interpreted by metadata_options.set_OHE_pipeline_options,
# which prints a legend for each selected code (see the cell output).
throughput_option, worktime_option = 'TP1', 'WT1'
pc_agreement_option, textlength_option = 'PC1', 'TL1'
special_option = 'SP1'
k_option_for_tp = 3

(
    df_throughput,
    df_worktime,
    df_agreement,
    df_textlength,
    df_special,
) = metadata_options.set_OHE_pipeline_options(
    df,
    throughput_option,
    worktime_option,
    pc_agreement_option,
    textlength_option,
    special_option,
    k_option_for_tp,
)

TP1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
WT1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
PC1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
TL1: weighted by 1 normalised number of characters per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
SP1: weighted by average of TP1 and TP2 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)


In [13]:
# Train/test split: one stratified shuffle split (20% test, seed 0),
# stratified on the deception label.
y = df["Input.deception_quadrant"].copy()
X = df.drop(columns=["Input.deception_quadrant"])
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
splits_generator = sss.split(X, y)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional index arrays.
train_idx, test_idx = next(splits_generator)
indices_train, indices_test = train_idx, test_idx

# Positional selection on the full frame (label column included).
train = df.iloc[indices_train]
test = df.iloc[indices_test]

In [15]:
# Target columns, in the order they are unpacked below:
# deception, rapport, share-information, reasoning, game move.
_label_columns = (
    "Input.deception_quadrant",
    "Answer.3rapport.yes_label",
    "Answer.4shareinformation.yes_label",
    "Answer.2reasoning.yes_label",
    "Answer.1gamemove.yes_label",
)

# Plain Python lists of each target for the training split ...
(
    y_train_deception,
    y_train_rapport,
    y_train_share_information,
    y_train_reasoning,
    y_train_gamemove,
) = (train[column].tolist() for column in _label_columns)

# ... and for the held-out split.
(
    y_test_deception,
    y_test_rapport,
    y_test_share_information,
    y_test_reasoning,
    y_test_gamemove,
) = (test[column].tolist() for column in _label_columns)

In [25]:
# Raw input texts for each split, as plain lists (the format handed to the
# ktrain preprocessors and predictors below).
x_train, x_test = (split['Input.full_text'].tolist() for split in (train, test))

### Individual Models

In [21]:
# Game-move classifier: gather the distinct training labels and create the
# ktrain Transformer for the configured checkpoint.
# NOTE(review): maxlen=500 while the preprocessing log reports a 99th-percentile
# sequence length of 44 -- a smaller maxlen may cut training time; confirm.
t_gamemove_label = [*set(y_train_gamemove)]
t_gamemove = text.Transformer(
    k_train_model,
    maxlen=500,
    classes=t_gamemove_label,
)

In [27]:
# Preprocess both splits with the game-move Transformer.
trn_gamemove = t_gamemove.preprocess_train(x_train, y_train_gamemove)
val_gamemove = t_gamemove.preprocess_test(x_test, y_test_gamemove)

# Build the classifier and fine-tune it for one epoch with the one-cycle
# policy (max learning rate 3e-5, batch size 6).
gamemove_model = t_gamemove.get_classifier()
learner_gamemove = ktrain.get_learner(
    gamemove_model,
    train_data=trn_gamemove,
    val_data=val_gamemove,
    batch_size=6,
)
learner_gamemove.fit_onecycle(3e-5, 1)

# Last expression of the cell: validation report on the held-out split.
# NOTE(review): in the recorded output class 0.0 is never predicted (recall
# 0.00, empty first confusion-matrix column), so the 0.94 accuracy equals the
# majority-class rate.
learner_gamemove.validate(class_names=t_gamemove.get_classes())

preprocessing train...
language: en
train sequence lengths:
	mean : 16
	95percentile : 32
	99percentile : 44




Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 17
	95percentile : 33
	99percentile : 44




begin training using onecycle policy with max lr of 3e-05...
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       142
         1.0       0.94      1.00      0.97      2132

    accuracy                           0.94      2274
   macro avg       0.47      0.50      0.48      2274
weighted avg       0.88      0.94      0.91      2274



  _warn_prf(average, modifier, msg_start, len(result))


array([[   0,  142],
       [   0, 2132]], dtype=int64)

In [37]:
# Wrap the trained game-move model in a ktrain predictor, then predict the
# test texts followed by the training texts.
pred_gamemove = ktrain.get_predictor(gamemove_model, preproc=t_gamemove)
y_pred_test_gamemove, y_pred_train_gamemove = (
    pred_gamemove.predict(texts) for texts in (x_test, x_train)
)

In [None]:
# Reasoning classifier: same recipe as the game-move classifier, using the
# reasoning labels.
t_reasoning_label = [*set(y_train_reasoning)]
t_reasoning = text.Transformer(
    k_train_model,
    maxlen=500,
    classes=t_reasoning_label,
)

# Preprocess both splits with the reasoning Transformer.
trn_reasoning = t_reasoning.preprocess_train(x_train, y_train_reasoning)
val_reasoning = t_reasoning.preprocess_test(x_test, y_test_reasoning)

# One one-cycle epoch, max learning rate 3e-5, batch size 6.
reasoning_model = t_reasoning.get_classifier()
learner_reasoning = ktrain.get_learner(
    reasoning_model,
    train_data=trn_reasoning,
    val_data=val_reasoning,
    batch_size=6,
)
learner_reasoning.fit_onecycle(3e-5, 1)

# Last expression of the cell: validation report on the held-out split.
learner_reasoning.validate(class_names=t_reasoning.get_classes())

In [None]:
# Wrap the trained reasoning model in a ktrain predictor, then predict the
# test texts followed by the training texts.
pred_reasoning = ktrain.get_predictor(reasoning_model, preproc=t_reasoning)
y_pred_test_reasoning, y_pred_train_reasoning = (
    pred_reasoning.predict(texts) for texts in (x_test, x_train)
)

In [None]:
# Share-information classifier: same recipe as the other individual
# classifiers, using the share-information labels.
t_share_information_label = list(set(y_train_share_information))
# Fix: the class list must come from the share-information labels; the cell
# previously passed the reasoning classifier's label list here.
t_share_information = text.Transformer(
    k_train_model,
    maxlen=500,
    classes=t_share_information_label,
)

# Preprocess both splits with the share-information Transformer.
trn_share_information = t_share_information.preprocess_train(x_train, y_train_share_information)
val_share_information = t_share_information.preprocess_test(x_test, y_test_share_information)

# One one-cycle epoch, max learning rate 3e-5, batch size 6.
share_information_model = t_share_information.get_classifier()
learner_share_information = ktrain.get_learner(
    share_information_model,
    train_data=trn_share_information,
    val_data=val_share_information,
    batch_size=6,
)
learner_share_information.fit_onecycle(3e-5, 1)

# Last expression of the cell: validation report on the held-out split.
learner_share_information.validate(class_names=t_share_information.get_classes())

In [None]:
# Wrap the trained share-information model in a ktrain predictor, then predict
# the test texts followed by the training texts.
pred_share_information = ktrain.get_predictor(
    share_information_model,
    preproc=t_share_information,
)
y_pred_test_share_information, y_pred_train_share_information = (
    pred_share_information.predict(texts) for texts in (x_test, x_train)
)

In [None]:
# Rapport classifier: same recipe as the other individual classifiers, using
# the rapport labels.
t_rapport_label = [*set(y_train_rapport)]
t_rapport = text.Transformer(
    k_train_model,
    maxlen=500,
    classes=t_rapport_label,
)

# Preprocess both splits with the rapport Transformer.
trn_rapport = t_rapport.preprocess_train(x_train, y_train_rapport)
val_rapport = t_rapport.preprocess_test(x_test, y_test_rapport)

# One one-cycle epoch, max learning rate 3e-5, batch size 6.
rapport_model = t_rapport.get_classifier()
learner_rapport = ktrain.get_learner(
    rapport_model,
    train_data=trn_rapport,
    val_data=val_rapport,
    batch_size=6,
)
learner_rapport.fit_onecycle(3e-5, 1)

# Last expression of the cell: validation report on the held-out split.
learner_rapport.validate(class_names=t_rapport.get_classes())

In [None]:
# Wrap the trained rapport model in a ktrain predictor, then predict the test
# texts followed by the training texts.
pred_rapport = ktrain.get_predictor(rapport_model, preproc=t_rapport)
y_pred_test_rapport, y_pred_train_rapport = (
    pred_rapport.predict(texts) for texts in (x_test, x_train)
)

In [None]:
# Deception classifier: same recipe as the other individual classifiers, using
# the deception-quadrant labels.
# This cell expects y_train_deception / y_test_deception to still hold the
# original label values; the joint-model section further down rebinds both
# names to 0/1 integers, so run this cell before that one.
t_deception_label = list(set(y_train_deception))
t_deception = text.Transformer(
    k_train_model,
    maxlen=500,
    classes=t_deception_label,
)

# Preprocess both splits with the deception Transformer.
trn_deception = t_deception.preprocess_train(x_train, y_train_deception)
val_deception = t_deception.preprocess_test(x_test, y_test_deception)

# Fix: build the model from the deception Transformer. The cell previously
# called t_rapport.get_classifier(), i.e. it created the model from the rapport
# preprocessor and its class list.
deception_model = t_deception.get_classifier()
learner_deception = ktrain.get_learner(
    deception_model,
    train_data=trn_deception,
    val_data=val_deception,
    batch_size=6,
)
# One one-cycle epoch, max learning rate 3e-5.
learner_deception.fit_onecycle(3e-5, 1)

# Last expression of the cell: validation report on the held-out split.
learner_deception.validate(class_names=t_deception.get_classes())

In [None]:
# Wrap the trained deception model in a ktrain predictor, then predict the
# test texts followed by the training texts.
pred_deception = ktrain.get_predictor(deception_model, preproc=t_deception)
y_pred_test_deception, y_pred_train_deception = (
    pred_deception.predict(texts) for texts in (x_test, x_train)
)

### One hot encodings

In [None]:
# Train encodings: one record per training text holding the label predicted by
# each individual classifier.
#   pred_df      -> gamemove, reasoning, shareinfo
#   pred_df_full -> the same three columns plus rapport
# NOTE(review): the test-encoding cell indexes every prediction with an extra
# [0] that this cell does not use -- the two cells should agree; confirm which
# form matches what predict() returns.
pred_df_arr = [
    {
        'gamemove': y_pred_train_gamemove[row],
        'reasoning': y_pred_train_reasoning[row],
        'shareinfo': y_pred_train_share_information[row],
    }
    for row in range(len(y_pred_train_reasoning))
]
pred_df_arr_full = [
    {**record, 'rapport': y_pred_train_rapport[row]}
    for row, record in enumerate(pred_df_arr)
]

pred_df_full = pd.DataFrame(pred_df_arr_full)
pred_df = pd.DataFrame(pred_df_arr)

In [None]:
# Test encodings: one record per test text holding the label predicted by each
# individual classifier.
#   pred_test_df      -> gamemove, reasoning, shareinfo
#   pred_test_df_full -> the same three columns plus rapport
# Each prediction is stored as returned by predict(), with no per-element [0]
# index, so these frames use the same encoding as the train-encoding cell. The
# joint model is fitted on the train frame and validated on this one, so the
# two must be built identically.
# NOTE(review): assumes predict() returns one label per input text -- confirm
# against the installed ktrain version.
pred_test_df_arr_full = []
pred_test_df_arr = []

for i in range(0, len(y_pred_test_reasoning)):
    pred_obj_1 = {}
    pred_obj_1['gamemove'] = y_pred_test_gamemove[i]
    pred_obj_1['reasoning'] = y_pred_test_reasoning[i]
    pred_obj_1['shareinfo'] = y_pred_test_share_information[i]
    pred_test_df_arr.append(pred_obj_1)

    # The "full" record extends a copy, so the 3-column record stays untouched.
    pred_obj_2 = pred_obj_1.copy()
    pred_obj_2['rapport'] = y_pred_test_rapport[i]
    pred_test_df_arr_full.append(pred_obj_2)

pred_test_df_full = pd.DataFrame(pred_test_df_arr_full)
pred_test_df = pd.DataFrame(pred_test_df_arr)

### Joint model with one hot encoding 

In [None]:
# Binary deception targets for the joint model: 1 where the quadrant label is
# "Straightforward", 0 for any other label.
# The encoded Series is computed directly. The earlier version assigned it into
# a string-labelled slot of a copy of the label Series and read it back, which
# depends on pandas accepting a Series as a single Series element.
# NOTE: this rebinds y_test_deception / y_train_deception from label values to
# 0/1 integers, so the deception-classifier cell must be run before this one.
new_deception_test = test["Input.deception_quadrant"].eq("Straightforward").astype(int)
y_test_deception = new_deception_test.tolist()

new_deception_train = train["Input.deception_quadrant"].eq("Straightforward").astype(int)
y_train_deception = new_deception_train.tolist()

# Rapport targets as NumPy arrays.
y_test_rapport = np.asarray(y_test_rapport)
y_train_rapport = np.asarray(y_train_rapport)

In [None]:
# Joint model: fit on the four individual-classifier predictions
# (pred_df_full) to predict the binary deception target, validating on the
# matching test-set frame.
print('Joint full model with one hot encoding, predicting deception')

joint_full_model = models_nn.create_joint_model(pred_df_full)
joint_full_model.summary()

joint_validation_data = (pred_test_df_full, y_test_deception)
history = joint_full_model.fit(
    x=pred_df_full,
    y=y_train_deception,
    epochs=32,
    batch_size=64,
    validation_data=joint_validation_data,
)

In [None]:
# Score the joint model on the test encodings: round its outputs, then report
# macro-averaged precision / recall / F-score (last expression of the cell,
# so the tuple is displayed as the cell output).
joint_predict = joint_full_model.predict(pred_test_df_full)
joint_predict_round = joint_predict.round()
precision_recall_fscore_support(
    y_test_deception,
    np.array(joint_predict_round),
    average='macro',
)