In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from transformers import BertTokenizer
from transformers import TFBertModel
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from spellchecker import SpellChecker
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

In [27]:
df = pd.read_csv('ASAP Dataset/Preprocessed_df.csv')

In [28]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,word_len,chars_len,avg_word_length,avg_sentence_length,pos_ratios,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,386,1875,3.984456,1.0,"{'NNP': 0.031088082901554404, 'JJ': 0.05181347...",16,1,0.310471,0.385613,dear local newspaper think effect computer peo...
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,464,2288,4.030172,1.0,"{'NNP': 0.03879310344827586, ',': 0.0258620689...",20,1,0.274,0.613167,dear believe using computer benefit u many way...
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,313,1541,4.035144,1.0,"{'NNP': 0.04153354632587859, ',': 0.0287539936...",14,1,0.340393,0.498657,dear people use computer everyone agrees benef...
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,611,3165,4.328969,1.0,"{'NNP': 0.11620294599018004, ',': 0.0212765957...",27,1,0.266828,0.441795,dear local newspaper found many expert say com...
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,517,2569,4.071567,1.0,"{'NNP': 0.017408123791102514, ',': 0.025145067...",30,1,0.199684,0.485814,dear know computer positive effect people comp...


In [29]:
df = df.dropna(axis = 1, how = 'any')

In [30]:
drop_columns = ['essay_id', 'pos_ratios', 'essay', 'rater1_domain1', 'rater2_domain1']
df.drop(drop_columns, axis = 1, inplace = True)

In [31]:
def calc_precision(y_true, y_pred, average='macro'):
    """
    Calculates the precision score between the true and predicted values
    """
    precision = precision_score(y_true, y_pred, average=average)
    return precision

def calc_recall(y_true, y_pred, average='macro'):
    """
    Calculates the recall score between the true and predicted values
    """
    recall = recall_score(y_true, y_pred, average=average)
    return recall

def calc_f1_score(y_true, y_pred, average='macro'):
    """
    Calculates the f1-score between the true and predicted values
    """
    f1 = f1_score(y_true, y_pred, average=average)
    return f1

def calc_cohen_kappa_score(y_true, y_pred):
    """
    Calculates the cohen kappa score between the true and predicted values
    """
    kappa_score = cohen_kappa_score(y_true, y_pred, weights = 'quadratic')
    return kappa_score

def calc_accuracy(y_true, y_pred):
    """
    Calculates the accuracy score between the true and predicted values
    """
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [32]:
def print_metrics_function(y_actual, y_predictions, print_metrics = False):
    
    accuracy = calc_accuracy(y_actual, y_predictions)
    precision = calc_precision(y_actual, y_predictions)
    recall = calc_recall(y_actual, y_predictions)
    f1 = calc_f1_score(y_actual, y_predictions)
    kappa_score = calc_cohen_kappa_score(y_actual, y_predictions)
    
    if print_metrics:
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-Score:", f1)
        print("Cohen Kappa Score:", kappa_score)

    return accuracy, precision, recall, f1, kappa_score

In [33]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

In [34]:
def choose_classifiers(classifier_name = "logistic_regression"):
    """
    Takes a regressor as input and returns a corresponding classifier object
    """
    
    if classifier_name == 'logistic_regression':
        return LogisticRegression()
    elif classifier_name == 'decision_tree_classifier':
        return DecisionTreeClassifier()
    elif classifier_name == 'random_forest_classifier':
        return RandomForestClassifier()
    elif classifier_name == 'gradient_boosting_classifier':
        return GradientBoostingClassifier()
    elif classifier_name == 'adaboost_classifier':
        return AdaBoostClassifier()
    elif classifier_name == 'k_neighbors_classifier':
        return KNeighborsClassifier()
    elif classifier_name == 'support_vector_classifier':
        return SVC()
    elif classifier_name == 'xgboost_classifier':
        return XGBClassifier()
    elif classifier_name == 'gaussian_naive_bayes_classifier':
        return GaussianNB()
    else:
        raise ValueError(f"Classifier {classifier_name} not supported for this problem.")

In [35]:
df_essay_set = df[df.essay_set == 1]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [36]:
def spell_corrector(tokens):
    spell_checker = SpellChecker()
    correct_tokens = []
    for token in tqdm(tokens):
        if spell_checker.correction(token.lower()):
            correct_tokens.append(spell_checker.correction(token.lower()))
        else:
            correct_tokens.append(token.lower())
    
    return ' '.join(correct_tokens)

In [37]:
X_train.head()

Unnamed: 0,essay_set,word_len,chars_len,avg_word_length,avg_sentence_length,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
1346,1,431,1993,3.781903,1.0,37,1,0.096094,0.511334,dear local people using computer year good hea...
1349,1,289,1498,4.283737,1.0,15,1,0.066142,0.514918,dear newspaper believe computer positive affec...
7,1,556,2724,4.03777,1.0,39,1,0.26203,0.5745,people agree computer make life le complicated...
1251,1,481,2259,3.837838,1.0,31,1,0.09966,0.426994,dear world changed much better technology beco...
661,1,488,2369,3.954918,1.0,26,1,0.05306,0.429295,technology growing changing rapidly look apple...


### BERT Architecture

In [38]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [39]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:16<00:00,  5.45it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:04<00:00,  5.59it/s]


#### Adding new features from feature engineering

In [40]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [41]:
def cross_validation_function(input_data, output_data, ml_model = "logistic_regression",
                             print_results = True, n_folds = 5, return_kappa_scores = True):
    
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    kappa_score_list = []
    
    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=101)
    X = pd.DataFrame(input_data.numpy()).copy()
    y = output_data.copy()

    for fold, (train_indices, val_indices) in enumerate(k_fold.split(X, y)):
        
        train_embeddings, train_y = X.iloc[train_indices], y.iloc[train_indices]
        val_embeddings, val_y = X.iloc[val_indices], y.iloc[val_indices]
        model = choose_classifiers(ml_model)
        model.fit(train_embeddings, train_y)
        y_predictions = model.predict(val_embeddings)
        accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(val_y, y_predictions)
        accuracy_list.append(accuracy_logistic_reg)
        precision_list.append(precision_logistic_reg)
        recall_list.append(recall_logistic_reg)
        f1_list.append(f1_logistic_reg)
        kappa_score_list.append(kappa_score_logistic_reg)
    
    if print_results:
        
        print("Accuracy: {:.4f}".format(np.max(accuracy_list)))
        print("Precision: {:.4f}".format(np.max(precision_list)))
        print("Recall: {:.4f}".format(np.max(recall_list)))
        print("F1 score: {:.4f}".format(np.max(f1_list)))
        print("Kappa score: {:.4f}".format(np.max(kappa_score_list)))
        
    if return_kappa_scores:
        
        return round(np.max(kappa_score_list), 3)

In [42]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set1 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5263
Precision: 0.4037
Recall: 0.4122
F1 score: 0.3739
Kappa score: 0.7927


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.5368
Precision: 0.4617
Recall: 0.4008
F1 score: 0.3891
Kappa score: 0.7731


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5018
Precision: 0.1932
Recall: 0.2654
F1 score: 0.2188
Kappa score: 0.6589


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.4790
Precision: 0.3512
Recall: 0.3299
F1 score: 0.2910
Kappa score: 0.7312


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5228
Precision: 0.1770
Recall: 0.1927
F1 score: 0.1713
Kappa score: 0.6565


### Model with Metrics (Essay Set - 2)

In [43]:
df_essay_set = df[df.essay_set == 2]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [44]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:16<00:00,  5.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:04<00:00,  5.39it/s]


In [45]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [46]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set2 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.6806
Precision: 0.6100
Recall: 0.4372
F1 score: 0.4577
Kappa score: 0.6767


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6736
Precision: 0.6225
Recall: 0.4873
F1 score: 0.5279
Kappa score: 0.6341


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.6701
Precision: 0.3333
Recall: 0.3530
F1 score: 0.3424
Kappa score: 0.5789


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.6458
Precision: 0.4791
Recall: 0.3638
F1 score: 0.3803
Kappa score: 0.5841


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.6979
Precision: 0.4195
Recall: 0.3826
F1 score: 0.3890
Kappa score: 0.6294


### Model with Metrics (Essay Set - 3)

In [47]:
df_essay_set = df[df.essay_set == 3]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [48]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:12<00:00,  7.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:02<00:00, 10.60it/s]


In [49]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [50]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set3 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.6848
Precision: 0.7678
Recall: 0.5915
F1 score: 0.6182
Kappa score: 0.6857


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6739
Precision: 0.7497
Recall: 0.5580
F1 score: 0.5871
Kappa score: 0.6806


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5870
Precision: 0.5448
Recall: 0.4801
F1 score: 0.4691
Kappa score: 0.6449


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.6341
Precision: 0.7022
Recall: 0.4946
F1 score: 0.5127
Kappa score: 0.6236


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.6739
Precision: 0.5169
Recall: 0.5343
F1 score: 0.5113
Kappa score: 0.6986


### Model with Metrics (Essay Set - 4)

In [51]:
df_essay_set = df[df.essay_set == 4]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [52]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [00:11<00:00,  7.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:02<00:00,  9.49it/s]


In [53]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [54]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set4 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.6360
Precision: 0.6564
Recall: 0.6140
F1 score: 0.6275
Kappa score: 0.7440


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6360
Precision: 0.6535
Recall: 0.6161
F1 score: 0.6273
Kappa score: 0.7382


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5830
Precision: 0.5731
Recall: 0.5766
F1 score: 0.5650
Kappa score: 0.7167


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.5830
Precision: 0.5866
Recall: 0.5840
F1 score: 0.5848
Kappa score: 0.7233


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.6113
Precision: 0.5102
Recall: 0.5134
F1 score: 0.4809
Kappa score: 0.6575


### Model with Metrics (Essay Set - 5)

In [55]:
df_essay_set = df[df.essay_set == 5]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [56]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:13<00:00,  6.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:02<00:00,  8.99it/s]


In [57]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [58]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set5 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.7197
Precision: 0.7630
Recall: 0.7175
F1 score: 0.6969
Kappa score: 0.8240


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6817
Precision: 0.5722
Recall: 0.5605
F1 score: 0.5569
Kappa score: 0.8182


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5744
Precision: 0.3132
Recall: 0.3361
F1 score: 0.2976
Kappa score: 0.5894


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.6471
Precision: 0.5318
Recall: 0.5421
F1 score: 0.5360
Kappa score: 0.7742


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.6955
Precision: 0.5774
Recall: 0.5601
F1 score: 0.5669
Kappa score: 0.7950


### Model with Metrics (Essay Set - 6)

In [59]:
df_essay_set = df[df.essay_set == 6]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [60]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:16<00:00,  5.56it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:03<00:00,  6.29it/s]


In [61]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [62]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set6 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.6910
Precision: 0.6642
Recall: 0.5349
F1 score: 0.5534
Kappa score: 0.7551


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6424
Precision: 0.6944
Recall: 0.5355
F1 score: 0.5464
Kappa score: 0.7430


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5139
Precision: 0.4812
Recall: 0.4547
F1 score: 0.4258
Kappa score: 0.6854


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.5764
Precision: 0.5358
Recall: 0.4971
F1 score: 0.4899
Kappa score: 0.6841


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.6424
Precision: 0.5688
Recall: 0.3985
F1 score: 0.4066
Kappa score: 0.6161


### Model with Metrics (Essay Set - 7)

In [63]:
df_essay_set = df[df.essay_set == 7]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [64]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [00:15<00:00,  5.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:03<00:00,  6.19it/s]


In [65]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [66]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set7 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.1753
Precision: 0.1488
Recall: 0.1286
F1 score: 0.1116
Kappa score: 0.6873


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.1713
Precision: 0.1158
Recall: 0.1165
F1 score: 0.1041
Kappa score: 0.6895


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.1873
Precision: 0.0544
Recall: 0.0883
F1 score: 0.0442
Kappa score: 0.4780


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.1315
Precision: 0.1184
Recall: 0.1681
F1 score: 0.1295
Kappa score: 0.5982


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.1952
Precision: 0.0284
Recall: 0.0891
F1 score: 0.0378
Kappa score: 0.4162


### Model with Metrics (Essay Set - 8)

In [67]:
df_essay_set = df[df.essay_set == 8]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [68]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:06<00:00,  5.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.70it/s]


In [69]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [70]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set8 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.2241
Precision: 0.1046
Recall: 0.1048
F1 score: 0.0910
Kappa score: 0.6475


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.2500
Precision: 0.0591
Recall: 0.0589
F1 score: 0.0376
Kappa score: 0.4778


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.2845
Precision: 0.0221
Recall: 0.0808
F1 score: 0.0345
Kappa score: 0.5583


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.1983
Precision: 0.1076
Recall: 0.1197
F1 score: 0.0942
Kappa score: 0.5377


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.2696
Precision: 0.0119
Recall: 0.0476
F1 score: 0.0188
Kappa score: 0.1712


In [73]:
logistic_regression_qwk = [logistic_regression_qwk_set1, logistic_regression_qwk_set2, logistic_regression_qwk_set3,
                           logistic_regression_qwk_set4, logistic_regression_qwk_set5, logistic_regression_qwk_set6,
                           logistic_regression_qwk_set7, logistic_regression_qwk_set8]
random_forest_classifier_qwk = [random_forest_classifier_qwk_set1, random_forest_classifier_qwk_set2, random_forest_classifier_qwk_set3,
                                random_forest_classifier_qwk_set4, random_forest_classifier_qwk_set5, random_forest_classifier_qwk_set6,
                                random_forest_classifier_qwk_set7, random_forest_classifier_qwk_set8]
adaboost_classifier_qwk = [adaboost_classifier_qwk_set1, adaboost_classifier_qwk_set2, adaboost_classifier_qwk_set3,
                           adaboost_classifier_qwk_set4, adaboost_classifier_qwk_set5, adaboost_classifier_qwk_set6,
                           adaboost_classifier_qwk_set7, adaboost_classifier_qwk_set8]
k_neighbors_classifier_qwk = [k_neighbors_classifier_qwk_set1, k_neighbors_classifier_qwk_set2, k_neighbors_classifier_qwk_set3,
                              k_neighbors_classifier_qwk_set4, k_neighbors_classifier_qwk_set5, k_neighbors_classifier_qwk_set6,
                              k_neighbors_classifier_qwk_set7, k_neighbors_classifier_qwk_set8]
support_vector_classifier_qwk = [support_vector_classifier_qwk_set1, support_vector_classifier_qwk_set2, support_vector_classifier_qwk_set3,
                                 support_vector_classifier_qwk_set4, support_vector_classifier_qwk_set5, support_vector_classifier_qwk_set6,
                                 support_vector_classifier_qwk_set7, support_vector_classifier_qwk_set8]

metrics_list = [logistic_regression_qwk, random_forest_classifier_qwk, adaboost_classifier_qwk,
               k_neighbors_classifier_qwk, support_vector_classifier_qwk]

results_df = pd.DataFrame(metrics_list)

results_df.rename(columns = {0: 'Prompt-1', 1: 'Prompt-2', 2: 'Prompt-3', 
                            3: 'Prompt-4', 4: 'Prompt-5', 5: 'Prompt-6',
                            6: 'Prompt-7', 7: 'Prompt-8'}, inplace = True)

results_df.rename(index = {0: 'BERT + LR', 1: 'BERT + RF', 
                           2: 'BERT + Adaboost', 3: 'BERT + KNN', 4: 'BERT + SVC'}, inplace = True)

In [78]:
# results_df.to_csv('Results/BERT + features.csv', index = True)