In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from spellchecker import SpellChecker
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from transformers import TFGPT2Model, GPT2Tokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

import warnings
warnings.filterwarnings("ignore")

In [46]:
df = pd.read_csv('ASAP Dataset/Preprocessed_df.csv')

In [47]:
# Removing any missing values from the data
df = df.dropna(axis = 1, how = 'any')

In [48]:
drop_columns = ['essay_id', 'pos_ratios', 'essay', 'rater1_domain1', 'rater2_domain1']
df.drop(drop_columns, axis = 1, inplace = True)

In [49]:
def calculate_precision(y_true, y_pred, average='macro'):
    precision = precision_score(y_true, y_pred, average=average)
    return precision

def calculate_recall(y_true, y_pred, average='macro'):
    recall = recall_score(y_true, y_pred, average=average)
    return recall

def calculate_f1_score(y_true, y_pred, average='macro'):
    f1 = f1_score(y_true, y_pred, average=average)
    return f1

def calculate_cohen_kappa_score(y_true, y_pred):
    kappa_score = cohen_kappa_score(y_true, y_pred, weights = 'quadratic')
    return kappa_score

def calculate_accuracy(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

def print_metrics_function(y_actual, y_predictions):
    
    accuracy = calculate_accuracy(y_actual, y_predictions)
    precision = calculate_precision(y_actual, y_predictions)
    recall = calculate_recall(y_actual, y_predictions)
    f1 = calculate_f1_score(y_actual, y_predictions)
    kappa_score = calculate_cohen_kappa_score(y_actual, y_predictions)

    return accuracy, precision, recall, f1, kappa_score

In [50]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

In [51]:
def choose_classifiers(classifier_name = "logistic_regression"):
    
    if classifier_name == 'logistic_regression':
        return LogisticRegression()
    elif classifier_name == 'random_forest_classifier':
        return RandomForestClassifier()
    elif classifier_name == 'gradient_boosting_classifier':
        return GradientBoostingClassifier()
    elif classifier_name == 'adaboost_classifier':
        return AdaBoostClassifier()
    elif classifier_name == 'k_neighbors_classifier':
        return KNeighborsClassifier()
    elif classifier_name == 'support_vector_classifier':
        return SVC()
    else:
        raise ValueError(f"Classifier {classifier_name} not supported for this problem.")

In [53]:
df_essay_set = df[df.essay_set == 1]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25,
                                                 random_state = 101, shuffle = True)

In [54]:
# This downloads the pre-trained weights from the huggingface website 
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
gpt_model = TFGPT2Model.from_pretrained('gpt2')
print(f"Total number of parameters: {gpt_model.count_params()}")

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


Total number of parameters: 124439808


### GPT-2 Architecture

#### Extracting GPT - 2 Embeddings

In [55]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 67/67 [00:20<00:00,  3.24it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:07<00:00,  3.02it/s]


In [56]:
embeddings_train.shape

TensorShape([1069, 768])

In [57]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

with open('Models/bow_vectorizer1.pkl', 'wb') as f:
    pickle.dump(bow_vectorizer, f)

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

with open('Models/tfidf_vectorizer1.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

with open('Models/scaler1.pkl', 'wb') as f:
    pickle.dump(scaler, f)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

with open('Models/pca1.pkl', 'wb') as f:
    pickle.dump(pca_bow, f)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

with open('Models/pca2.pkl', 'wb') as f:
    pickle.dump(pca_tfidf, f)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

with open('Models/scaler2.pkl', 'wb') as f:
    pickle.dump(scaler, f)

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [29]:
def cross_validation_function(input_data, output_data, ml_model = "logistic_regression",
                             print_results = True, n_folds = 5, return_kappa_scores = True,
                             save_model = False):
    
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    kappa_score_list = []
    
    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=101)
    X = pd.DataFrame(input_data.numpy()).copy()
    y = output_data.copy()

    for fold, (train_indices, val_indices) in enumerate(k_fold.split(X, y)):
        
        train_embeddings, train_y = X.iloc[train_indices], y.iloc[train_indices]
        val_embeddings, val_y = X.iloc[val_indices], y.iloc[val_indices]
        model = choose_classifiers(ml_model)
        model.fit(train_embeddings, train_y)
        y_predictions = model.predict(val_embeddings)
        
        if save_model:
            with open('Models/best_model.pkl', 'wb') as f:
                pickle.dump(model, f)
               
        accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(val_y, y_predictions)
        accuracy_list.append(accuracy_logistic_reg)
        precision_list.append(precision_logistic_reg)
        recall_list.append(recall_logistic_reg)
        f1_list.append(f1_logistic_reg)
        kappa_score_list.append(kappa_score_logistic_reg)
    
    if print_results:
        
        print("Accuracy: {:.4f}".format(np.max(accuracy_list)))
        print("Precision: {:.4f}".format(np.max(precision_list)))
        print("Recall: {:.4f}".format(np.max(recall_list)))
        print("F1 score: {:.4f}".format(np.max(f1_list)))
        print("Kappa score: {:.4f}".format(np.max(kappa_score_list)))
        
    if return_kappa_scores:
        
        return round(np.max(kappa_score_list), 3)

In [30]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set1 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier",
                                                             save_model = True)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.4112
Precision: 0.2136
Recall: 0.3158
F1 score: 0.2353
Kappa score: 0.6461


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.4907
Precision: 0.4199
Recall: 0.4151
F1 score: 0.4103
Kappa score: 0.7968


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.4860
Precision: 0.1966
Recall: 0.2739
F1 score: 0.2219
Kappa score: 0.6484


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.2394
Precision: 0.1451
Recall: 0.2248
F1 score: 0.0956
Kappa score: 0.2855


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.4085
Precision: 0.1387
Recall: 0.1159
F1 score: 0.0721
Kappa score: 0.0905


### Model with Metrics (Essay Set - 2)

In [269]:
df_essay_set = df[df.essay_set == 2]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [270]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:40<00:00,  2.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:09<00:00,  2.34it/s]


In [271]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [272]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set2 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.6250
Precision: 0.4814
Recall: 0.3757
F1 score: 0.3934
Kappa score: 0.5783


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6910
Precision: 0.4745
Recall: 0.3742
F1 score: 0.3928
Kappa score: 0.6480


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.6493
Precision: 0.3364
Recall: 0.3970
F1 score: 0.3631
Kappa score: 0.6054


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.4167
Precision: 0.2728
Recall: 0.3191
F1 score: 0.2417
Kappa score: 0.3079


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5799
Precision: 0.2170
Recall: 0.2512
F1 score: 0.2250
Kappa score: 0.3853


### Model with Metrics (Essay Set - 3)

In [273]:
df_essay_set = df[df.essay_set == 3]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [274]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:17<00:00,  4.84it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:03<00:00,  6.99it/s]


In [275]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [276]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set3 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5399
Precision: 0.4744
Recall: 0.4721
F1 score: 0.4705
Kappa score: 0.4953


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.7029
Precision: 0.6196
Recall: 0.5447
F1 score: 0.5421
Kappa score: 0.7073


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5507
Precision: 0.6314
Recall: 0.4779
F1 score: 0.4922
Kappa score: 0.6039


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.3949
Precision: 0.2408
Recall: 0.3234
F1 score: 0.2041
Kappa score: 0.1188


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5616
Precision: 0.4573
Recall: 0.4140
F1 score: 0.4162
Kappa score: 0.4922


### Model with Metrics (Essay Set - 4)

In [277]:
df_essay_set = df[df.essay_set == 4]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [278]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [00:17<00:00,  4.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:03<00:00,  6.37it/s]


In [279]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [280]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set4 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5972
Precision: 0.6008
Recall: 0.6012
F1 score: 0.5783
Kappa score: 0.7003


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6572
Precision: 0.6479
Recall: 0.6295
F1 score: 0.6342
Kappa score: 0.7457


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5760
Precision: 0.5663
Recall: 0.5603
F1 score: 0.5541
Kappa score: 0.7001


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.4170
Precision: 0.3477
Recall: 0.3330
F1 score: 0.2725
Kappa score: 0.2306


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.4735
Precision: 0.4756
Recall: 0.3419
F1 score: 0.2696
Kappa score: 0.4827


### Model with Metrics (Essay Set - 5)

In [281]:
df_essay_set = df[df.essay_set == 5]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [282]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:19<00:00,  4.57it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:03<00:00,  6.06it/s]


In [283]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [284]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set5 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5590
Precision: 0.4658
Recall: 0.4901
F1 score: 0.4690
Kappa score: 0.7208


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6713
Precision: 0.5542
Recall: 0.5499
F1 score: 0.5419
Kappa score: 0.8142


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5779
Precision: 0.4298
Recall: 0.3872
F1 score: 0.3544
Kappa score: 0.6635


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.3633
Precision: 0.4282
Recall: 0.2950
F1 score: 0.2341
Kappa score: 0.3841


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.4325
Precision: 0.3776
Recall: 0.2464
F1 score: 0.1940
Kappa score: 0.3547


### Model with Metrics (Essay Set - 6)

In [285]:
df_essay_set = df[df.essay_set == 6]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [286]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:22<00:00,  3.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:05<00:00,  4.51it/s]


In [287]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [288]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set6 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5417
Precision: 0.4840
Recall: 0.4609
F1 score: 0.4703
Kappa score: 0.7093


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6632
Precision: 0.6059
Recall: 0.5034
F1 score: 0.5137
Kappa score: 0.7029


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5799
Precision: 0.3583
Recall: 0.3694
F1 score: 0.3398
Kappa score: 0.6089


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.3646
Precision: 0.4364
Recall: 0.4467
F1 score: 0.3115
Kappa score: 0.5422


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5208
Precision: 0.1042
Recall: 0.2000
F1 score: 0.1370
Kappa score: 0.0000


### Model with Metrics (Essay Set - 7)

In [289]:
df_essay_set = df[df.essay_set == 7]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [290]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [00:23<00:00,  3.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.53it/s]


In [291]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [292]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set7 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.1155
Precision: 0.1001
Recall: 0.0923
F1 score: 0.0813
Kappa score: 0.5224


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.1952
Precision: 0.2008
Recall: 0.1528
F1 score: 0.1433
Kappa score: 0.6623


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.1833
Precision: 0.0284
Recall: 0.1103
F1 score: 0.0366
Kappa score: 0.5554


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.0478
Precision: 0.0682
Recall: 0.0757
F1 score: 0.0277
Kappa score: 0.1870


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.1633
Precision: 0.0384
Recall: 0.0769
F1 score: 0.0373
Kappa score: 0.1454


### Model with Metrics (Essay Set - 8)

In [293]:
df_essay_set = df[df.essay_set == 8]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [294]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:16<00:00,  2.23it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.31it/s]


In [295]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [296]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set8 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.2155
Precision: 0.0813
Recall: 0.0988
F1 score: 0.0853
Kappa score: 0.4233


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.2783
Precision: 0.0346
Recall: 0.0690
F1 score: 0.0438
Kappa score: 0.5358


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.2586
Precision: 0.0219
Recall: 0.0808
F1 score: 0.0344
Kappa score: 0.5801


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.1304
Precision: 0.0368
Recall: 0.0853
F1 score: 0.0412
Kappa score: 0.2151


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.2696
Precision: 0.0118
Recall: 0.0476
F1 score: 0.0187
Kappa score: 0.0000


In [298]:
logistic_regression_qwk = [logistic_regression_qwk_set1, logistic_regression_qwk_set2, logistic_regression_qwk_set3,
                           logistic_regression_qwk_set4, logistic_regression_qwk_set5, logistic_regression_qwk_set6,
                           logistic_regression_qwk_set7, logistic_regression_qwk_set8]
random_forest_classifier_qwk = [random_forest_classifier_qwk_set1, random_forest_classifier_qwk_set2, random_forest_classifier_qwk_set3,
                                random_forest_classifier_qwk_set4, random_forest_classifier_qwk_set5, random_forest_classifier_qwk_set6,
                                random_forest_classifier_qwk_set7, random_forest_classifier_qwk_set8]
adaboost_classifier_qwk = [adaboost_classifier_qwk_set1, adaboost_classifier_qwk_set2, adaboost_classifier_qwk_set3,
                           adaboost_classifier_qwk_set4, adaboost_classifier_qwk_set5, adaboost_classifier_qwk_set6,
                           adaboost_classifier_qwk_set7, adaboost_classifier_qwk_set8]
k_neighbors_classifier_qwk = [k_neighbors_classifier_qwk_set1, k_neighbors_classifier_qwk_set2, k_neighbors_classifier_qwk_set3,
                              k_neighbors_classifier_qwk_set4, k_neighbors_classifier_qwk_set5, k_neighbors_classifier_qwk_set6,
                              k_neighbors_classifier_qwk_set7, k_neighbors_classifier_qwk_set8]
support_vector_classifier_qwk = [support_vector_classifier_qwk_set1, support_vector_classifier_qwk_set2, support_vector_classifier_qwk_set3,
                                 support_vector_classifier_qwk_set4, support_vector_classifier_qwk_set5, support_vector_classifier_qwk_set6,
                                 support_vector_classifier_qwk_set7, support_vector_classifier_qwk_set8]

metrics_list = [logistic_regression_qwk, random_forest_classifier_qwk, adaboost_classifier_qwk,
               k_neighbors_classifier_qwk, support_vector_classifier_qwk]

results_df = pd.DataFrame(metrics_list)

results_df.rename(columns = {0: 'Prompt-1', 1: 'Prompt-2', 2: 'Prompt-3', 
                            3: 'Prompt-4', 4: 'Prompt-5', 5: 'Prompt-6',
                            6: 'Prompt-7', 7: 'Prompt-8'}, inplace = True)

results_df.rename(index = {0: 'GPT-2 + LR', 1: 'GPT-2 + RF', 
                           2: 'GPT-2 + Adaboost', 3: 'GPT-2 + KNN', 4: 'GPT-2 + SVC'}, inplace = True)

In [301]:
# results_df.to_csv('Results/gpt-2 + features.csv', index = True)