In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from transformers import BertTokenizer
from transformers import TFBertModel
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from spellchecker import SpellChecker
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

In [101]:
df = pd.read_csv('ASAP Dataset/Preprocessed_df.csv')

In [102]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,word_len,chars_len,avg_word_length,avg_sentence_length,pos_ratios,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,386,1875,3.984456,1.0,"{'NNP': 0.031088082901554404, 'JJ': 0.05181347...",16,1,0.310471,0.385613,dear local newspaper think effect computer peo...
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,464,2288,4.030172,1.0,"{'NNP': 0.03879310344827586, ',': 0.0258620689...",20,1,0.274,0.613167,dear believe using computer benefit u many way...
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,313,1541,4.035144,1.0,"{'NNP': 0.04153354632587859, ',': 0.0287539936...",14,1,0.340393,0.498657,dear people use computer everyone agrees benef...
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,611,3165,4.328969,1.0,"{'NNP': 0.11620294599018004, ',': 0.0212765957...",27,1,0.266828,0.441795,dear local newspaper found many expert say com...
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,517,2569,4.071567,1.0,"{'NNP': 0.017408123791102514, ',': 0.025145067...",30,1,0.199684,0.485814,dear know computer positive effect people comp...


In [103]:
df = df.dropna(axis = 1, how = 'any')

In [104]:
drop_columns = ['essay_id', 'pos_ratios', 'essay', 'rater1_domain1', 'rater2_domain1']
df.drop(drop_columns, axis = 1, inplace = True)

In [105]:
def calculate_precision(y_true, y_pred, average='macro'):
    precision = precision_score(y_true, y_pred, average=average)
    return precision

def calculate_recall(y_true, y_pred, average='macro'):
    recall = recall_score(y_true, y_pred, average=average)
    return recall

def calculate_f1_score(y_true, y_pred, average='macro'):
    f1 = f1_score(y_true, y_pred, average=average)
    return f1

def calculate_cohen_kappa_score(y_true, y_pred):
    kappa_score = cohen_kappa_score(y_true, y_pred, weights = 'quadratic')
    return kappa_score

def calculate_accuracy(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

def print_metrics_function(y_actual, y_predictions):
    
    accuracy = calculate_accuracy(y_actual, y_predictions)
    precision = calculate_precision(y_actual, y_predictions)
    recall = calculate_recall(y_actual, y_predictions)
    f1 = calculate_f1_score(y_actual, y_predictions)
    kappa_score = calculate_cohen_kappa_score(y_actual, y_predictions)

    return accuracy, precision, recall, f1, kappa_score

In [107]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

In [108]:
def choose_classifiers(classifier_name = "logistic_regression"):
    
    if classifier_name == 'logistic_regression':
        return LogisticRegression()
    elif classifier_name == 'decision_tree_classifier':
        return DecisionTreeClassifier()
    elif classifier_name == 'random_forest_classifier':
        return RandomForestClassifier()
    elif classifier_name == 'adaboost_classifier':
        return AdaBoostClassifier()
    elif classifier_name == 'k_neighbors_classifier':
        return KNeighborsClassifier()
    elif classifier_name == 'support_vector_classifier':
        return SVC()
    else:
        raise ValueError(f"Classifier {classifier_name} not supported for this problem.")

In [109]:
df_essay_set = df[df.essay_set == 1]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [110]:
def spell_corrector(tokens):
    spell_checker = SpellChecker()
    correct_tokens = []
    for token in tqdm(tokens):
        if spell_checker.correction(token.lower()):
            correct_tokens.append(spell_checker.correction(token.lower()))
        else:
            correct_tokens.append(token.lower())
    
    return ' '.join(correct_tokens)

In [111]:
X_train.head()

Unnamed: 0,essay_set,word_len,chars_len,avg_word_length,avg_sentence_length,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
1346,1,431,1993,3.781903,1.0,37,1,0.096094,0.511334,dear local people using computer year good hea...
1349,1,289,1498,4.283737,1.0,15,1,0.066142,0.514918,dear newspaper believe computer positive affec...
7,1,556,2724,4.03777,1.0,39,1,0.26203,0.5745,people agree computer make life le complicated...
1251,1,481,2259,3.837838,1.0,31,1,0.09966,0.426994,dear world changed much better technology beco...
661,1,488,2369,3.954918,1.0,26,1,0.05306,0.429295,technology growing changing rapidly look apple...


### BERT Architecture

In [112]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [113]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:16<00:00,  5.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:04<00:00,  5.52it/s]


#### Adding new features from feature engineering

In [114]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [115]:
def cross_validation_function(input_data, output_data, ml_model = "logistic_regression",
                             print_results = True, n_folds = 5, return_kappa_scores = True):
    
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    kappa_score_list = []
    
    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=101)
    X = pd.DataFrame(input_data.numpy()).copy()
    y = output_data.copy()

    for fold, (train_indices, val_indices) in enumerate(k_fold.split(X, y)):
        
        train_embeddings, train_y = X.iloc[train_indices], y.iloc[train_indices]
        val_embeddings, val_y = X.iloc[val_indices], y.iloc[val_indices]
        model = choose_classifiers(ml_model)
        model.fit(train_embeddings, train_y)
        y_predictions = model.predict(val_embeddings)
        accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(val_y, y_predictions)
        accuracy_list.append(accuracy_logistic_reg)
        precision_list.append(precision_logistic_reg)
        recall_list.append(recall_logistic_reg)
        f1_list.append(f1_logistic_reg)
        kappa_score_list.append(kappa_score_logistic_reg)
    
    if print_results:
        
        print("Accuracy: {:.4f}".format(np.max(accuracy_list)))
        print("Precision: {:.4f}".format(np.max(precision_list)))
        print("Recall: {:.4f}".format(np.max(recall_list)))
        print("F1 score: {:.4f}".format(np.max(f1_list)))
        print("Kappa score: {:.4f}".format(np.max(kappa_score_list)))
        
    if return_kappa_scores:
        
        return round(np.max(kappa_score_list), 3)

In [116]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set1 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set1 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.3497
Precision: 0.2993
Recall: 0.3225
F1 score: 0.2507
Kappa score: 0.5091


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.5439
Precision: 0.4402
Recall: 0.3871
F1 score: 0.3864
Kappa score: 0.7710


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5018
Precision: 0.1932
Recall: 0.2654
F1 score: 0.2188
Kappa score: 0.6589


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.2175
Precision: 0.2300
Recall: 0.1554
F1 score: 0.0747
Kappa score: 0.3542


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.4667
Precision: 0.0874
Recall: 0.1294
F1 score: 0.0983
Kappa score: 0.3178


### Model with Metrics (Essay Set - 2)

In [117]:
df_essay_set = df[df.essay_set == 2]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [118]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:16<00:00,  5.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:04<00:00,  5.39it/s]


In [119]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [120]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set2 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set2 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5694
Precision: 0.5171
Recall: 0.3588
F1 score: 0.3571
Kappa score: 0.4446


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6944
Precision: 0.6760
Recall: 0.4832
F1 score: 0.5245
Kappa score: 0.6437


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.6736
Precision: 0.3329
Recall: 0.3542
F1 score: 0.3427
Kappa score: 0.6436


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.4688
Precision: 0.2525
Recall: 0.2721
F1 score: 0.2144
Kappa score: 0.2815


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5903
Precision: 0.2265
Recall: 0.2702
F1 score: 0.2461
Kappa score: 0.4229


### Model with Metrics (Essay Set - 3)

In [121]:
df_essay_set = df[df.essay_set == 3]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [122]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:12<00:00,  7.12it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 22/22 [00:02<00:00, 10.64it/s]


In [123]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [124]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set3 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set3 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.4964
Precision: 0.4359
Recall: 0.4901
F1 score: 0.4311
Kappa score: 0.3684


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6993
Precision: 0.5909
Recall: 0.5370
F1 score: 0.5306
Kappa score: 0.7105


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.6232
Precision: 0.5126
Recall: 0.4829
F1 score: 0.4752
Kappa score: 0.6485


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.3949
Precision: 0.2581
Recall: 0.3307
F1 score: 0.2213
Kappa score: 0.1335


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.6558
Precision: 0.4905
Recall: 0.5016
F1 score: 0.4954
Kappa score: 0.6172


### Model with Metrics (Essay Set - 4)

In [125]:
df_essay_set = df[df.essay_set == 4]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [126]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 89/89 [00:12<00:00,  7.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:02<00:00,  9.61it/s]


In [127]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [128]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set4 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set4 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5689
Precision: 0.5374
Recall: 0.4994
F1 score: 0.5047
Kappa score: 0.5803


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6608
Precision: 0.6798
Recall: 0.6373
F1 score: 0.6494
Kappa score: 0.7593


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5866
Precision: 0.5825
Recall: 0.5873
F1 score: 0.5833
Kappa score: 0.7252


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.3958
Precision: 0.3343
Recall: 0.3229
F1 score: 0.2577
Kappa score: 0.2575


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5689
Precision: 0.5089
Recall: 0.4617
F1 score: 0.4219
Kappa score: 0.5278


### Model with Metrics (Essay Set - 5)

In [129]:
df_essay_set = df[df.essay_set == 5]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [130]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:13<00:00,  6.80it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:02<00:00,  9.57it/s]


In [131]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [132]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set5 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set5 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.5174
Precision: 0.4736
Recall: 0.5706
F1 score: 0.5041
Kappa score: 0.6492


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6747
Precision: 0.5660
Recall: 0.5481
F1 score: 0.5519
Kappa score: 0.8100


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5779
Precision: 0.4298
Recall: 0.4156
F1 score: 0.3528
Kappa score: 0.7038


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.3529
Precision: 0.2306
Recall: 0.2856
F1 score: 0.2144
Kappa score: 0.3718


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5347
Precision: 0.3899
Recall: 0.3175
F1 score: 0.2555
Kappa score: 0.5373


### Model with Metrics (Essay Set - 6)

In [133]:
df_essay_set = df[df.essay_set == 6]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [134]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [00:15<00:00,  5.73it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:03<00:00,  6.94it/s]


In [135]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [136]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set6 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set6 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.4965
Precision: 0.4517
Recall: 0.4603
F1 score: 0.4189
Kappa score: 0.6412


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.6910
Precision: 0.6563
Recall: 0.5260
F1 score: 0.5330
Kappa score: 0.7439


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.5903
Precision: 0.3707
Recall: 0.3682
F1 score: 0.3382
Kappa score: 0.6087


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.3785
Precision: 0.4553
Recall: 0.3875
F1 score: 0.3149
Kappa score: 0.5403


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.5625
Precision: 0.5762
Recall: 0.2616
F1 score: 0.2219
Kappa score: 0.3700


### Model with Metrics (Essay Set - 7)

In [137]:
df_essay_set = df[df.essay_set == 7]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [138]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [00:14<00:00,  5.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.05it/s]


In [139]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [140]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set7 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set7 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.1036
Precision: 0.1107
Recall: 0.0938
F1 score: 0.0807
Kappa score: 0.3514


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.1992
Precision: 0.1208
Recall: 0.1205
F1 score: 0.1096
Kappa score: 0.6670


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.1992
Precision: 0.0217
Recall: 0.0895
F1 score: 0.0343
Kappa score: 0.4962


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.0438
Precision: 0.0702
Recall: 0.0629
F1 score: 0.0238
Kappa score: 0.1866


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.1793
Precision: 0.0231
Recall: 0.0852
F1 score: 0.0335
Kappa score: 0.3814


### Model with Metrics (Essay Set - 8)

In [141]:
df_essay_set = df[df.essay_set == 8]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [142]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
BATCH_SIZE = 16
MAX_LENGTH = 300

train_encodings = tokenizer(list(X_train['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(BATCH_SIZE)

test_encodings = tokenizer(list(X_test['preprocessed_text']), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

embeddings_train = []
for batch in tqdm(train_dataset):
    embeddings_train.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in tqdm(test_dataset):
    embeddings_test.append(bert_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████████| 37/37 [00:06<00:00,  5.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.78it/s]


In [143]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_bow = bow_vectorizer.transform(X_test['preprocessed_text']).toarray()

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['preprocessed_text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test['preprocessed_text']).toarray()

scaler = StandardScaler()
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)

pca_bow = PCA()
X_train_bow_pca = pca_bow.fit_transform(X_train_bow)
X_test_bow_pca = pca_bow.transform(X_test_bow)

variance_ratio_bow = np.cumsum(pca_bow.explained_variance_ratio_)
n_components_bow = np.argmax(variance_ratio_bow >= 0.95) + 1
X_train_bow_pca = X_train_bow_pca[:, :n_components_bow]
X_test_bow_pca = X_test_bow_pca[:, :n_components_bow]

pca_tfidf = PCA()
X_train_tfidf_pca = pca_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_pca = pca_tfidf.transform(X_test_tfidf)

variance_ratio_tfidf = np.cumsum(pca_tfidf.explained_variance_ratio_)
n_components_tfidf = np.argmax(variance_ratio_tfidf >= 0.95) + 1
X_train_tfidf_pca = X_train_tfidf_pca[:, :n_components_tfidf]
X_test_tfidf_pca = X_test_tfidf_pca[:, :n_components_tfidf]

X_train_bow = tf.convert_to_tensor(X_train_bow_pca, dtype = tf.float32)
X_test_bow = tf.convert_to_tensor(X_test_bow_pca, dtype = tf.float32)

X_train_tfidf = tf.convert_to_tensor(X_train_tfidf_pca, dtype = tf.float32)
X_test_tfidf = tf.convert_to_tensor(X_test_tfidf_pca, dtype = tf.float32)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(['preprocessed_text'], axis = 1))
X_test_scaled = scaler.transform(X_test.drop(['preprocessed_text'], axis = 1))
X_train_features = tf.constant(X_train_scaled.astype('float32'))
X_test_features = tf.constant(X_test_scaled.astype('float32'))

embeddings_train = tf.concat([embeddings_train, X_train_features, X_train_bow], axis = 1)
embeddings_test = tf.concat([embeddings_test, X_test_features, X_test_bow], axis = 1)

In [144]:
print("-----------------------Logistic Regression-----------------------")
logistic_regression_qwk_set8 = cross_validation_function(embeddings_train, y_train)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
random_forest_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "random_forest_classifier")

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
adaboost_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "adaboost_classifier")

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
k_neighbors_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "k_neighbors_classifier")

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
support_vector_classifier_qwk_set8 = cross_validation_function(embeddings_train, y_train, ml_model = "support_vector_classifier")

-----------------------Logistic Regression-----------------------
Accuracy: 0.1897
Precision: 0.0665
Recall: 0.0948
F1 score: 0.0763
Kappa score: 0.3588


-----------------------Random Forest Classifier-----------------------
Accuracy: 0.2696
Precision: 0.0785
Recall: 0.0661
F1 score: 0.0578
Kappa score: 0.5341


-----------------------Adaboost Classifier-----------------------
Accuracy: 0.2845
Precision: 0.0219
Recall: 0.0773
F1 score: 0.0334
Kappa score: 0.5583


-----------------------K Neibhors Classifier-----------------------
Accuracy: 0.1304
Precision: 0.0597
Recall: 0.0749
F1 score: 0.0354
Kappa score: 0.2127


-----------------------Support Vector Classifier-----------------------
Accuracy: 0.2696
Precision: 0.0118
Recall: 0.0476
F1 score: 0.0187
Kappa score: 0.0000


In [145]:
logistic_regression_qwk = [logistic_regression_qwk_set1, logistic_regression_qwk_set2, logistic_regression_qwk_set3,
                           logistic_regression_qwk_set4, logistic_regression_qwk_set5, logistic_regression_qwk_set6,
                           logistic_regression_qwk_set7, logistic_regression_qwk_set8]
random_forest_classifier_qwk = [random_forest_classifier_qwk_set1, random_forest_classifier_qwk_set2, random_forest_classifier_qwk_set3,
                                random_forest_classifier_qwk_set4, random_forest_classifier_qwk_set5, random_forest_classifier_qwk_set6,
                                random_forest_classifier_qwk_set7, random_forest_classifier_qwk_set8]
adaboost_classifier_qwk = [adaboost_classifier_qwk_set1, adaboost_classifier_qwk_set2, adaboost_classifier_qwk_set3,
                           adaboost_classifier_qwk_set4, adaboost_classifier_qwk_set5, adaboost_classifier_qwk_set6,
                           adaboost_classifier_qwk_set7, adaboost_classifier_qwk_set8]
k_neighbors_classifier_qwk = [k_neighbors_classifier_qwk_set1, k_neighbors_classifier_qwk_set2, k_neighbors_classifier_qwk_set3,
                              k_neighbors_classifier_qwk_set4, k_neighbors_classifier_qwk_set5, k_neighbors_classifier_qwk_set6,
                              k_neighbors_classifier_qwk_set7, k_neighbors_classifier_qwk_set8]
support_vector_classifier_qwk = [support_vector_classifier_qwk_set1, support_vector_classifier_qwk_set2, support_vector_classifier_qwk_set3,
                                 support_vector_classifier_qwk_set4, support_vector_classifier_qwk_set5, support_vector_classifier_qwk_set6,
                                 support_vector_classifier_qwk_set7, support_vector_classifier_qwk_set8]

metrics_list = [logistic_regression_qwk, random_forest_classifier_qwk, adaboost_classifier_qwk,
               k_neighbors_classifier_qwk, support_vector_classifier_qwk]

results_df = pd.DataFrame(metrics_list)

results_df.rename(columns = {0: 'Prompt-1', 1: 'Prompt-2', 2: 'Prompt-3', 
                            3: 'Prompt-4', 4: 'Prompt-5', 5: 'Prompt-6',
                            6: 'Prompt-7', 7: 'Prompt-8'}, inplace = True)

results_df.rename(index = {0: 'BERT + LR', 1: 'BERT + RF', 
                           2: 'BERT + Adaboost', 3: 'BERT + KNN', 4: 'BERT + SVC'}, inplace = True)

In [147]:
# results_df.to_csv('Results/BERT + features.csv', index = True)