In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from spellchecker import SpellChecker
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from transformers import TFGPT2Model, GPT2Tokenizer
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('ASAP Dataset/Preprocessed_df.csv')

In [3]:
# Removing any missing values from the data
df = df.dropna(axis = 1, how = 'any')

In [4]:
drop_columns = ['essay_id', 'pos_ratios', 'essay', 'rater1_domain1', 'rater2_domain1']
df.drop(drop_columns, axis = 1, inplace = True)

In [5]:
def calc_precision(y_true, y_pred, average='macro'):
    """
    Calculates the precision score between the true and predicted values
    """
    precision = precision_score(y_true, y_pred, average=average)
    return precision

def calc_recall(y_true, y_pred, average='macro'):
    """
    Calculates the recall score between the true and predicted values
    """
    recall = recall_score(y_true, y_pred, average=average)
    return recall

def calc_f1_score(y_true, y_pred, average='macro'):
    """
    Calculates the f1-score between the true and predicted values
    """
    f1 = f1_score(y_true, y_pred, average=average)
    return f1

def calc_cohen_kappa_score(y_true, y_pred):
    """
    Calculates the cohen kappa score between the true and predicted values
    """
    kappa_score = cohen_kappa_score(y_true, y_pred, weights = 'quadratic')
    return kappa_score

def calc_accuracy(y_true, y_pred):
    """
    Calculates the accuracy score between the true and predicted values
    """
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [6]:
def print_metrics_function(y_actual, y_predictions):
    
    # Calculate and print accuracy
    accuracy = calc_accuracy(y_actual, y_predictions)
    print("Accuracy:", accuracy)
    
    # Calculate and print precision
    precision = calc_precision(y_actual, y_predictions)
    print("Precision:", precision)

    # Calculate and print recall
    recall = calc_recall(y_actual, y_predictions)
    print("Recall:", recall)

    # Calculate and print f1-score
    f1 = calc_f1_score(y_actual, y_predictions)
    print("F1-Score:", f1)

    # Calculate and print Cohen Kappa Score
    kappa_score = calc_cohen_kappa_score(y_actual, y_predictions)
    print("Cohen Kappa Score:", kappa_score)

    return accuracy, precision, recall, f1, kappa_score

In [7]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

In [8]:
def choose_classifiers(classifier_name = "logistic_regression"):
    """
    Takes a regressor as input and returns a corresponding classifier object
    """
    
    if classifier_name == 'logistic_regression':
        return LogisticRegression()
    elif classifier_name == 'decision_tree_classifier':
        return DecisionTreeClassifier()
    elif classifier_name == 'random_forest_classifier':
        return RandomForestClassifier()
    elif classifier_name == 'gradient_boosting_classifier':
        return GradientBoostingClassifier()
    elif classifier_name == 'adaboost_classifier':
        return AdaBoostClassifier()
    elif classifier_name == 'k_neighbors_classifier':
        return KNeighborsClassifier()
    elif classifier_name == 'support_vector_classifier':
        return SVC()
    elif classifier_name == 'xgboost_classifier':
        return XGBClassifier()
    elif classifier_name == 'gaussian_naive_bayes_classifier':
        return GaussianNB()
    else:
        raise ValueError(f"Classifier {classifier_name} not supported for this problem.")

In [16]:
def spell_corrector(text):
    spell_checker = SpellChecker()
    correct_tokens = []
    for token in tqdm(text.split()):
        if spell_checker.correction(token.lower()):
            correct_tokens.append(spell_checker.correction(token.lower()))
        else:
            correct_tokens.append(token.lower())
    
    return ' '.join(correct_tokens)

In [17]:
df_essay_set = df[df.essay_set == 1]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [24]:
# This downloads the pre-trained weights from the huggingface website 
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
gpt_model = TFGPT2Model.from_pretrained('gpt2')
print(f"Total number of parameters: {gpt_model.count_params()}")

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


Total number of parameters: 124439808


### GPT-2 Architecture

#### Extracting GPT - 2 Embeddings

In [25]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:32<00:00,  2.80it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:10<00:00,  2.09it/s]


In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [31]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.5014005602240896
Precision: 0.2227152479611077
Recall: 0.21855667517965904
F1-Score: 0.21256798059618895
Cohen Kappa Score: 0.7042866017809071


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.5266106442577031
Precision: 0.49137386476945044
Recall: 0.4176711292739518
F1-Score: 0.4336629093855165
Cohen Kappa Score: 0.7908329120069797


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.46218487394957986
Precision: 0.18874293785310733
Recall: 0.2134375
F1-Score: 0.18168804941791397
Cohen Kappa Score: 0.5567525128495394


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.453781512605042
Precision: 0.27413076560744953
Recall: 0.25999773412071797
F1-Score: 0.261224280112592
Cohen Kappa Score: 0.6812457156682215


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.3641456582633053
Precision: 

### Model with Metrics (Essay Set - 2)

In [32]:
df_essay_set = df[df.essay_set == 2]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [33]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [01:00<00:00,  1.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:47<00:00,  2.05s/it]


In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [35]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.6555555555555556
Precision: 0.4364506172839506
Recall: 0.34369006443771166
F1-Score: 0.36128145108871235
Cohen Kappa Score: 0.5928899082568808


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6388888888888888
Precision: 0.626152957243408
Recall: 0.47007123003637463
F1-Score: 0.4890214890381218
Cohen Kappa Score: 0.6206836414370422


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.6
Precision: 0.41165107971860415
Recall: 0.3218745028643948
F1-Score: 0.32709890668700736
Cohen Kappa Score: 0.37589285714285714


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.6194444444444445
Precision: 0.5651763539765404
Recall: 0.3979640006792325
F1-Score: 0.4310766116750022
Cohen Kappa Score: 0.5799769850402763


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.43333333333333335
Precision: 0.0866

### Model with Metrics (Essay Set - 3)

In [36]:
df_essay_set = df[df.essay_set == 3]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [37]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:52<00:00,  1.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:15<00:00,  1.43it/s]


In [38]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [39]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.6127167630057804
Precision: 0.4550178294618904
Recall: 0.4924148103281962
F1-Score: 0.45755161206285255
Cohen Kappa Score: 0.6599764145489861


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.37283236994219654
Precision: 0.32003496503496504
Recall: 0.2727342575767773
F1-Score: 0.22219892386128387
Cohen Kappa Score: 0.16649503493931594


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.3901734104046243
Precision: 0.32205524399912766
Recall: 0.32918367842908525
F1-Score: 0.2955242794003201
Cohen Kappa Score: 0.22189448140245183


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.569364161849711
Precision: 0.43162078007210747
Recall: 0.4534442275912092
F1-Score: 0.43456003090891526
Cohen Kappa Score: 0.6086668166798916


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.3757225433526011

### Model with Metrics (Essay Set - 4)

In [40]:
df_essay_set = df[df.essay_set == 4]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [41]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [00:58<00:00,  1.53it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:18<00:00,  1.26it/s]


In [42]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [43]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.6016949152542372
Precision: 0.6019688644688646
Recall: 0.5596967910497755
F1-Score: 0.5635968359161194
Cohen Kappa Score: 0.6996803851162057


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.576271186440678
Precision: 0.559556229899367
Recall: 0.5887492463321498
F1-Score: 0.5678512272251756
Cohen Kappa Score: 0.7127724637326613


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.423728813559322
Precision: 0.4123209018120523
Recall: 0.41730635760702084
F1-Score: 0.36096740486061957
Cohen Kappa Score: 0.45484058936152505


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.536723163841808
Precision: 0.5219082598731972
Recall: 0.5099888792121658
F1-Score: 0.5130650202788021
Cohen Kappa Score: 0.6871262552183234


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.3531073446327684
Precision

### Model with Metrics (Essay Set - 5)

In [44]:
df_essay_set = df[df.essay_set == 5]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [45]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [01:02<00:00,  1.45it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:18<00:00,  1.26it/s]


In [46]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [47]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.5152354570637119
Precision: 0.39821699134199134
Recall: 0.43371538303474777
F1-Score: 0.3948481439820023
Cohen Kappa Score: 0.630763714593044


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.47645429362880887
Precision: 0.43426953362965637
Recall: 0.4140006670968902
F1-Score: 0.3808317906759421
Cohen Kappa Score: 0.5978692410956556


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5512465373961218
Precision: 0.4206883732457635
Recall: 0.31760632193703053
F1-Score: 0.2675042866210059
Cohen Kappa Score: 0.5608184918529746


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.5789473684210527
Precision: 0.4549963029797657
Recall: 0.4759278549349283
F1-Score: 0.4573422371852284
Cohen Kappa Score: 0.7108790909281506


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.37950138504155123
Pre

### Model with Metrics (Essay Set - 6)

In [48]:
df_essay_set = df[df.essay_set == 6]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [49]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:38<00:00,  2.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:16<00:00,  1.42it/s]


In [50]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [51]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.4361111111111111
Precision: 0.3455551257253385
Recall: 0.4158831919525561
F1-Score: 0.3339150739401052
Cohen Kappa Score: 0.6247270742358078


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.5333333333333333
Precision: 0.48719435620118795
Recall: 0.4369306608612967
F1-Score: 0.45178402681176005
Cohen Kappa Score: 0.6368383574649894


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.46944444444444444
Precision: 0.35729850727162493
Recall: 0.3583254510422141
F1-Score: 0.29201296800455134
Cohen Kappa Score: 0.5024104683195592


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.46111111111111114
Precision: 0.4513650793650793
Recall: 0.38669306608612974
F1-Score: 0.38514875540066995
Cohen Kappa Score: 0.5807978363759296


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.48055555555555557

### Model with Metrics (Essay Set - 7)

In [52]:
df_essay_set = df[df.essay_set == 7]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [53]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [00:30<00:00,  2.62it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:08<00:00,  2.25it/s]


In [54]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [55]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.06687898089171974
Precision: 0.03162646177352059
Recall: 0.07357444598823909
F1-Score: 0.04012977472003719
Cohen Kappa Score: 0.2772760195289655


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.1337579617834395
Precision: 0.09476721874868912
Recall: 0.0933560349683897
F1-Score: 0.05422766137625575
Cohen Kappa Score: 0.5457254745261748


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.08917197452229299
Precision: 0.026130129471143296
Recall: 0.09947089947089947
F1-Score: 0.035690645052347175
Cohen Kappa Score: 0.5523339479130822


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.09872611464968153
Precision: 0.03708636307375803
Recall: 0.09200163447430058
F1-Score: 0.04340937144275889
Cohen Kappa Score: 0.45644499435968633


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.09235668

### Model with Metrics (Essay Set - 8)

In [56]:
df_essay_set = df[df.essay_set == 8]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [57]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 512

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:33<00:00,  1.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:14<00:00,  1.50s/it]


In [58]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features], axis = 1)

In [59]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(embeddings_train, y_train)
y_predictions = model.predict(embeddings_test)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test, y_predictions)


-----------------------Logistic Regression-----------------------
Accuracy: 0.20689655172413793
Precision: 0.01488095238095238
Recall: 0.04566912972085386
F1-Score: 0.022254797441364604
Cohen Kappa Score: 0.4812920592193809


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.20689655172413793
Precision: 0.036631054131054136
Recall: 0.04844532861774241
F1-Score: 0.03554677940642852
Cohen Kappa Score: 0.42210084398544057


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.21379310344827587
Precision: 0.013543599257884972
Recall: 0.04690065681444992
F1-Score: 0.02095952417397389
Cohen Kappa Score: 0.5182635829662261


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.1310344827586207
Precision: 0.059889974798696906
Recall: 0.052569946238198315
F1-Score: 0.052500188936505494
Cohen Kappa Score: 0.33855411176924355


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.2
