In [36]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from transformers import TFGPT2Model, GPT2Tokenizer
from sklearn.model_selection import train_test_split

import xgboost as xgb
import numpy as np
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
df = pd.read_csv('Datasets/Preprocessed_df.csv')

In [3]:
df.drop(['full_text', 'text_id'], axis = 1, inplace = True)

In [4]:
def calc_mse(y_true, y_pred):
    """
    Calculates the mean squared error (MSE) between the true and predicted values
    """
    mse = mean_squared_error(y_true, y_pred)
    return mse

def calc_mse(y_true, y_pred):
    """
    Calculates the mean squared error (MSE) between the true and predicted values
    """
    mse = mean_squared_error(y_true, y_pred)
    return mse

def calc_mae(y_true, y_pred):
    """
    Calculates the mean absolute error (MAE) between the true and predicted values
    """
    mae = mean_absolute_error(y_true, y_pred)
    return mae

def calc_rmse(y_true, y_pred):
    """
    Calculates the root mean squared error (RMSE) between the true and predicted values
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return rmse

def calc_mape(y_true, y_pred):
    """
    Calculates the mean absolute percentage error (MAPE) between the true and predicted values
    """
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mape

def calc_r2_score(y_true, y_pred):
    """
    Calculates the R2 score between the true and predicted values
    """
    r2 = r2_score(y_true, y_pred)
    return r2

In [5]:
# Calculate and print MSE
def print_metrics_function(y_actual, y_predictions):
    
    # Calculate and print MSE
    mse = calc_mse(y_actual, y_predictions)
    print("MSE:", mse)

    # Calculate and print RMSE
    rmse = calc_rmse(y_actual, y_predictions)
    print("RMSE:", rmse)
    
    # Calculate and print MAE
    mae = calc_mae(y_actual, y_predictions)
    print("MAE:", mae)

    # Calculate and print MAPE
    mape = calc_mape(y_actual, y_predictions)
    print("MAPE:", mape)

    # Calculate and print R2 score
    r2 = calc_r2_score(y_actual, y_predictions)
    print("R2 Score:", r2)
    
    return mse, rmse, mae, mape, r2

In [6]:
cohesion = df['cohesion']
syntax = df['syntax']
vocabulary = df['vocabulary']
phraseology = df['phraseology']
grammar = df['grammar']
conventions = df['conventions']

preprocessed_text = df['preprocessed_text']

In [7]:
X = preprocessed_text
y_cohesion = cohesion
y_syntax = syntax
y_vocabulary = vocabulary
y_phraseology = phraseology
y_grammar = grammar
y_conventions = conventions

X_train, X_test, y_train_cohesion, y_test_cohesion = train_test_split(X, y_cohesion, 
                                                                     shuffle = True, 
                                                                     random_state = 101, 
                                                                     test_size = 0.2)

X_train, X_test, y_train_syntax, y_test_syntax = train_test_split(X, y_syntax, 
                                                                     shuffle = True, 
                                                                     random_state = 101, 
                                                                     test_size = 0.2)

X_train, X_test, y_train_vocabulary, y_test_vocabulary = train_test_split(X, y_vocabulary, 
                                                                     shuffle = True, 
                                                                     random_state = 101, 
                                                                     test_size = 0.2)

X_train, X_test, y_train_phraseology, y_test_phraseology = train_test_split(X, y_phraseology, 
                                                                     shuffle = True, 
                                                                     random_state = 101, 
                                                                     test_size = 0.2)

X_train, X_test, y_train_grammar, y_test_grammar = train_test_split(X, y_grammar, 
                                                                     shuffle = True, 
                                                                     random_state = 101, 
                                                                     test_size = 0.2)

X_train, X_test, y_train_conventions, y_test_conventions = train_test_split(X, y_conventions, 
                                                                     shuffle = True, 
                                                                     random_state = 101, 
                                                                     test_size = 0.2)

In [8]:
print("The shape of input train data: {}".format(X_train.shape))
print("The shape of input test data: {}".format(X_test.shape))
print("------------------------------------------")
print("The shape of output train data: {}".format(y_train_cohesion.shape))
print("The shape of output test data: {}".format(y_test_cohesion.shape))

The shape of input train data: (3128,)
The shape of input test data: (783,)
------------------------------------------
The shape of output train data: (3128,)
The shape of output test data: (783,)


In [9]:
# This downloads the pre-trained weights from the huggingface website 
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
gpt_model = TFGPT2Model.from_pretrained('gpt2')
max_len = 512
print(f"Total number of parameters: {gpt_model.count_params()}")

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


Total number of parameters: 124439808


### GPT - 2 Architecture

#### Extracting GPT - 2 Embeddings

In [12]:
# This code can take about 5 - 10 minutes to run depending on the speed of the system
batch_size = 2

train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=max_len, return_tensors='tf')
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train_cohesion)).batch(batch_size)

test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=max_len, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test_cohesion)).batch(batch_size)

embeddings_train = []
for batch in train_dataset:
    embeddings_train.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_train = tf.concat(embeddings_train, axis=0)

embeddings_test = []
for batch in test_dataset:
    embeddings_test.append(gpt_model(batch[0]['input_ids'])[0][:, -1, :])
embeddings_test = tf.concat(embeddings_test, axis=0)

#### Train and Evaluate XGBoost Regressor

In [35]:
X_train_xgb = embeddings_train.numpy()
X_test_xgb = embeddings_test.numpy()

model = xgb.XGBRegressor(objective = 'reg:absoluteerror', n_estimators = 10,
                    learning_rate = 0.001, max_depth = 4)
model.fit(X_train_xgb, y_train_cohesion)
y_predictions = model.predict(X_test_xgb)

print_metrics_function(y_test_cohesion, y_predictions)

MSE: 0.4653183170267626
RMSE: 0.6821424462872565
MAE: 0.5260195558555282
MAPE: 17.11389506369954
R2 Score: -0.059919383206717525


(0.4653183170267626,
 0.6821424462872565,
 0.5260195558555282,
 17.11389506369954,
 -0.059919383206717525)