In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from transformers import BertTokenizer
from transformers import TFBertModel
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from spellchecker import SpellChecker
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('ASAP Dataset/Preprocessed_df.csv')

In [3]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,word_len,chars_len,avg_word_length,avg_sentence_length,pos_ratios,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,386,1875,3.984456,1.0,"{'NNP': 0.031088082901554404, 'JJ': 0.05181347...",16,1,0.310471,0.385613,dear local newspaper think effect computer peo...
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,464,2288,4.030172,1.0,"{'NNP': 0.03879310344827586, ',': 0.0258620689...",20,1,0.274,0.613167,dear believe using computer benefit u many way...
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,313,1541,4.035144,1.0,"{'NNP': 0.04153354632587859, ',': 0.0287539936...",14,1,0.340393,0.498657,dear people use computer everyone agrees benef...
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,611,3165,4.328969,1.0,"{'NNP': 0.11620294599018004, ',': 0.0212765957...",27,1,0.266828,0.441795,dear local newspaper found many expert say com...
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,517,2569,4.071567,1.0,"{'NNP': 0.017408123791102514, ',': 0.025145067...",30,1,0.199684,0.485814,dear know computer positive effect people comp...


In [4]:
df = df.dropna(axis = 1, how = 'any')

In [5]:
drop_columns = ['essay_id', 'pos_ratios', 'essay']
df.drop(drop_columns, axis = 1, inplace = True)

In [6]:
def calc_precision(y_true, y_pred, average='macro'):
    """
    Calculates the precision score between the true and predicted values
    """
    precision = precision_score(y_true, y_pred, average=average)
    return precision

def calc_recall(y_true, y_pred, average='macro'):
    """
    Calculates the recall score between the true and predicted values
    """
    recall = recall_score(y_true, y_pred, average=average)
    return recall

def calc_f1_score(y_true, y_pred, average='macro'):
    """
    Calculates the f1-score between the true and predicted values
    """
    f1 = f1_score(y_true, y_pred, average=average)
    return f1

def calc_cohen_kappa_score(y_true, y_pred):
    """
    Calculates the cohen kappa score between the true and predicted values
    """
    kappa_score = cohen_kappa_score(y_true, y_pred, weights = 'quadratic')
    return kappa_score

def calc_accuracy(y_true, y_pred):
    """
    Calculates the accuracy score between the true and predicted values
    """
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [7]:
def print_metrics_function(y_actual, y_predictions):
    
    # Calculate and print accuracy
    accuracy = calc_accuracy(y_actual, y_predictions)
    print("Accuracy:", accuracy)
    
    # Calculate and print precision
    precision = calc_precision(y_actual, y_predictions)
    print("Precision:", precision)

    # Calculate and print recall
    recall = calc_recall(y_actual, y_predictions)
    print("Recall:", recall)

    # Calculate and print f1-score
    f1 = calc_f1_score(y_actual, y_predictions)
    print("F1-Score:", f1)

    # Calculate and print Cohen Kappa Score
    kappa_score = calc_cohen_kappa_score(y_actual, y_predictions)
    print("Cohen Kappa Score:", kappa_score)

    return accuracy, precision, recall, f1, kappa_score

In [8]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

In [9]:
df_essay_set = df[df.essay_set == 1]
X, y = dataset_preparation(df_essay_set)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2)

In [10]:
def spell_corrector(tokens):
    spell_checker = SpellChecker()
    correct_tokens = []
    for token in tqdm(tokens):
        if spell_checker.correction(token.lower()):
            correct_tokens.append(spell_checker.correction(token.lower()))
        else:
            correct_tokens.append(token.lower())
    
    return ' '.join(correct_tokens)

In [11]:
X_train.head()

Unnamed: 0,essay_set,rater1_domain1,rater2_domain1,word_len,chars_len,avg_word_length,avg_sentence_length,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
1346,1,4,5,431,1993,3.781903,1.0,37,1,0.096094,0.511334,dear local people using computer year good hea...
1349,1,4,4,289,1498,4.283737,1.0,15,1,0.066142,0.514918,dear newspaper believe computer positive affec...
7,1,5,5,556,2724,4.03777,1.0,39,1,0.26203,0.5745,people agree computer make life le complicated...
1251,1,4,4,481,2259,3.837838,1.0,31,1,0.09966,0.426994,dear world changed much better technology beco...
661,1,4,5,488,2369,3.954918,1.0,26,1,0.05306,0.429295,technology growing changing rapidly look apple...


### BERT Architecture

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
BATCH_SIZE = 2
num_batches = len(X_train['preprocessed_text']) // BATCH_SIZE

bert_features_train = []
bert_features_test = []

for i in tqdm(range(num_batches)):
    batch_texts = X_train.iloc[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
    train_encodings = tokenizer(list(batch_texts), truncation = True, padding = True, max_length = 512,
                               return_tensors = 'tf')
    batch_features = bert_model(train_encodings)
    batch_features = batch_features[0].numpy()
    bert_features_train.append(batch_features)
    
for i in tqdm(range(num_batches)):
    batch_texts = X_test.iloc[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
    test_encodings = tokenizer(list(batch_texts), truncation = True, padding = True, max_length = 512,
                          return_tensors = 'tf')
    batch_features = bert_model(test_encodings)
    batch_features = batch_features[0].numpy()
    bert_features_test.append(batch_features)    

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|████████████████████████████████████████████████████████████████████████████████| 713/713 [00:51<00:00, 13.94it/s]
100%|██████████████████████████████████████

In [19]:
tf.concat(bert_features_train, axis = 0)

<tf.Tensor: shape=(8556, 8, 768), dtype=float32, numpy=
array([[[-0.23505591,  0.07495087, -0.08867416, ..., -0.30985525,
          0.03627761,  0.3585786 ],
        [ 0.15717404, -0.1462929 , -0.16976476, ..., -0.1420402 ,
          0.20012036, -0.6288921 ],
        [ 0.7382288 , -0.35825378,  0.742987  , ...,  0.30036747,
          0.25264427, -0.5589581 ],
        ...,
        [-0.1078686 , -0.24818267,  0.35578594, ..., -0.24034728,
         -0.12976818,  0.13508783],
        [-0.02624794, -0.33122844,  0.38419724, ..., -0.24512577,
         -0.0840089 ,  0.11145853],
        [ 0.00433122, -0.33889863,  0.45211914, ..., -0.17592625,
         -0.06446395,  0.12025243]],

       [[-0.5869002 , -0.15931612, -0.18594632, ..., -0.38056552,
          0.07712404,  0.8974017 ],
        [ 0.28894907, -1.0519651 ,  0.33448958, ..., -0.39941883,
          0.4670426 , -0.3098287 ],
        [-0.01595033, -0.3958352 ,  0.03754324, ..., -0.6405394 ,
          0.01470817, -0.19635493],
        ...

In [None]:
print("-----------------------Logistic Regression-----------------------")
model = choose_classifiers("logistic_regression")
model.fit(X_train_final, y_train_set1)
y_predictions = model.predict(X_test_final)
accuracy_logistic_reg, precision_logistic_reg, recall_logistic_reg, f1_logistic_reg, kappa_score_logistic_reg = print_metrics_function(y_test_set1, y_predictions)

print("\n")
print("-----------------------Random Forest Classifier-----------------------")
model = choose_classifiers("random_forest_classifier")
model.fit(X_train_final, y_train_set1)
y_predictions = model.predict(X_test_final)
accuracy_random_forest, precision_random_forest, recall_random_forest, f1_random_forest, kappa_score_random_forest = print_metrics_function(y_test_set1, y_predictions)

print("\n")
print("-----------------------Adaboost Classifier-----------------------")
model = choose_classifiers("adaboost_classifier")
model.fit(X_train_final, y_train_set1)
y_predictions = model.predict(X_test_final)
accuracy_adaboost, precision_adaboost, recall_adaboost, f1_adaboost, kappa_score_adaboost = print_metrics_function(y_test_set1, y_predictions)

print("\n")
print("-----------------------K Neibhors Classifier-----------------------")
model = choose_classifiers("k_neighbors_classifier")
model.fit(X_train_final, y_train_set1)
y_predictions = model.predict(X_test_final)
accuracy_k_neighbors, precision_k_neighbors, recall_k_neighbors, f1_k_neighbors, kappa_score_k_neighbors = print_metrics_function(y_test_set1, y_predictions)

print("\n")
print("-----------------------Support Vector Classifier-----------------------")
model = choose_classifiers("support_vector_classifier")
model.fit(X_train_final, y_train_set1)
y_predictions = model.predict(X_test_final)
accuracy_svc, precision_svc, recall_svc, f1_svc, kappa_score_svc = print_metrics_function(y_test_set1, y_predictions)


<tf.Tensor: shape=(12, 8, 768), dtype=float32, numpy=
array([[[-0.23505591,  0.07495087, -0.08867416, ..., -0.30985525,
          0.03627761,  0.3585786 ],
        [ 0.15717404, -0.1462929 , -0.16976476, ..., -0.1420402 ,
          0.20012036, -0.6288921 ],
        [ 0.7382288 , -0.35825378,  0.742987  , ...,  0.30036747,
          0.25264427, -0.5589581 ],
        ...,
        [-0.1078686 , -0.24818267,  0.35578594, ..., -0.24034728,
         -0.12976818,  0.13508783],
        [-0.02624794, -0.33122844,  0.38419724, ..., -0.24512577,
         -0.0840089 ,  0.11145853],
        [ 0.00433122, -0.33889863,  0.45211914, ..., -0.17592625,
         -0.06446395,  0.12025243]],

       [[-0.5869002 , -0.15931612, -0.18594632, ..., -0.38056552,
          0.07712404,  0.8974017 ],
        [ 0.28894907, -1.0519651 ,  0.33448958, ..., -0.39941883,
          0.4670426 , -0.3098287 ],
        [-0.01595033, -0.3958352 ,  0.03754324, ..., -0.6405394 ,
          0.01470817, -0.19635493],
        ...,


In [None]:
y_predictions = model.predict(test_data)