In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from prettytable import PrettyTable

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import RMSprop, SGD, Adam

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [3]:
# Load the preprocessed dataset CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer = 'Datasets/Preprocessed_df.csv')

In [4]:
# Drop the `full_text` and `text_id` columns, which are not used below.
# `df` is rebound instead of mutated with `inplace=True`, and `errors='ignore'`
# is passed so that re-running this cell after the columns are already gone
# does not raise KeyError.
df = df.drop(columns = ['full_text', 'text_id'], errors = 'ignore')

In [5]:
def calc_mse(y_true, y_pred):
    """
    Calculates the mean squared error (MSE) between the true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    The value returned by ``sklearn.metrics.mean_squared_error``.
    """
    # NOTE: this function used to be defined twice in this cell with identical
    # bodies; the redundant second definition has been removed.
    mse = mean_squared_error(y_true, y_pred)
    return mse

def calc_mae(y_true, y_pred):
    """
    Return the mean absolute error (MAE) of the predictions.

    Thin wrapper around ``sklearn.metrics.mean_absolute_error``; both
    arguments are forwarded unchanged.
    """
    return mean_absolute_error(y_true, y_pred)

def calc_rmse(y_true, y_pred):
    """
    Return the root mean squared error (RMSE) of the predictions.

    Computed as the square root of ``sklearn.metrics.mean_squared_error``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def calc_mape(y_true, y_pred):
    """
    Calculates the mean absolute percentage error (MAPE) between the true and predicted values.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values. Entries equal to zero make the
        corresponding percentage error infinite/NaN (division by zero).
    y_pred : array-like
        Predicted target values. May be 1-D or a column vector of shape
        ``(n, 1)``; it is flattened before comparison.

    Returns
    -------
    MAPE expressed in percent (i.e. already multiplied by 100).
    """
    # Coerce to flat float arrays so that:
    #   * plain Python lists are accepted (list - list is not defined), and
    #   * a (n,) target vs. a (n, 1) prediction does not broadcast to an
    #     (n, n) matrix and silently produce a wrong average.
    actual = np.asarray(y_true, dtype = float).ravel()
    predicted = np.asarray(y_pred, dtype = float).ravel()

    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

def calc_r2_score(y_true, y_pred):
    """
    Return the R2 score (coefficient of determination) of the predictions.

    Thin wrapper around ``sklearn.metrics.r2_score``; both arguments are
    forwarded unchanged.
    """
    return r2_score(y_true, y_pred)

In [6]:
def print_metrics_function(y_actual, y_predictions):
    """
    Compute, print and return the regression metrics for one set of predictions.

    Each metric is printed as soon as it is computed, in the order
    MSE, RMSE, MAE, MAPE, R2 Score.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, mape, r2)`` in that order.
    """
    # (printed label, metric function) pairs, in reporting order.
    reporters = (
        ("MSE:", calc_mse),
        ("RMSE:", calc_rmse),
        ("MAE:", calc_mae),
        ("MAPE:", calc_mape),
        ("R2 Score:", calc_r2_score),
    )

    collected = []
    for label, metric_fn in reporters:
        value = metric_fn(y_actual, y_predictions)
        print(label, value)
        collected.append(value)

    return tuple(collected)

In [7]:
# One Series per score column, unpacked in the same order as the names below.
cohesion, syntax, vocabulary, phraseology, grammar, conventions = (
    df[column_name]
    for column_name in ('cohesion', 'syntax', 'vocabulary',
                        'phraseology', 'grammar', 'conventions')
)

# Text column used as the model input.
preprocessed_text = df['preprocessed_text']

In [8]:
# Model input and one target per score (aliases kept for later cells).
X = preprocessed_text
y_cohesion = cohesion
y_syntax = syntax
y_vocabulary = vocabulary
y_phraseology = phraseology
y_grammar = grammar
y_conventions = conventions

# Split the features and all six targets in a single call.
# `train_test_split` applies the same train/test indices to every array it is
# given, so X_train/X_test are guaranteed to line up with every y_* split.
# Previously this was six separate calls that each overwrote X_train/X_test and
# only stayed aligned because the same random_state was repeated every time.
(X_train, X_test,
 y_train_cohesion, y_test_cohesion,
 y_train_syntax, y_test_syntax,
 y_train_vocabulary, y_test_vocabulary,
 y_train_phraseology, y_test_phraseology,
 y_train_grammar, y_test_grammar,
 y_train_conventions, y_test_conventions) = train_test_split(
    X,
    y_cohesion, y_syntax, y_vocabulary,
    y_phraseology, y_grammar, y_conventions,
    shuffle = True,
    random_state = 101,
    test_size = 0.2)

In [47]:
# Size passed to the tokenizer as `num_words` and reused as the embedding input size.
vocab_size = 15000
tokenizer = Tokenizer(oov_token = "<OOV>", num_words = vocab_size)

# The vocabulary is fitted on the training texts only; the test texts are
# only converted with the already-fitted tokenizer.
tokenizer.fit_on_texts(X_train)
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [48]:
# Every sequence is padded / truncated at the end ('post') to this length.
max_sequence_len = 1000

# Train and test sequences go through exactly the same padding settings.
padded_sequences_train, padded_sequences_test = (
    pad_sequences(token_ids,
                  maxlen = max_sequence_len,
                  padding = 'post',
                  truncating = 'post')
    for token_ids in (sequences_train, sequences_test)
)

### Cohesion Score Prediction

In [None]:
# Hyperparameters for the cohesion model.
embedding_dim = 128
lstm_units = 32
epochs = 10
batch_size = 64

# Embedding -> two stacked LSTMs -> small dense head with one linear output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([Embedding(vocab_size, embedding_dim, input_length = max_sequence_len),
                    LSTM(lstm_units, dropout = 0.2, return_sequences = True),
                    Dropout(0.2),
                    LSTM(lstm_units, dropout = 0.2),
                    Dense(10, activation = "relu"),
                    Dense(1, activation = "linear")])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer, loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])

# NOTE(review): the held-out test split is used as validation data here.
model.fit(padded_sequences_train, y_train_cohesion, batch_size = batch_size, validation_data = (padded_sequences_test, y_test_cohesion),
         epochs = epochs)

### Syntax Score Prediction

In [56]:
# Hyperparameters for the syntax model.
embedding_dim = 128
lstm_units = 32
epochs = 10
batch_size = 64

# Embedding -> two stacked LSTMs -> small dense head with one linear output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([Embedding(vocab_size, embedding_dim, input_length = max_sequence_len),
                    LSTM(lstm_units, dropout = 0.2, return_sequences = True),
                    Dropout(0.2),
                    LSTM(lstm_units, dropout = 0.2),
                    Dense(10, activation = "relu"),
                    Dense(1, activation = "linear")])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer, loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])

# NOTE(review): the held-out test split is used as validation data here.
model.fit(padded_sequences_train, y_train_syntax, batch_size = batch_size, validation_data = (padded_sequences_test, y_test_syntax),
         epochs = epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20e80c9a970>

### Vocabulary Score Prediction

In [58]:
# Hyperparameters for the vocabulary model.
embedding_dim = 128
lstm_units = 32
epochs = 10
batch_size = 64

# Embedding -> two stacked LSTMs -> small dense head with one linear output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([Embedding(vocab_size, embedding_dim, input_length = max_sequence_len),
                    LSTM(lstm_units, dropout = 0.2, return_sequences = True),
                    Dropout(0.2),
                    LSTM(lstm_units, dropout = 0.2),
                    Dense(10, activation = "relu"),
                    Dense(1, activation = "linear")])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer, loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])

# NOTE(review): the held-out test split is used as validation data here.
model.fit(padded_sequences_train, y_train_vocabulary, batch_size = batch_size, validation_data = (padded_sequences_test, y_test_vocabulary),
         epochs = epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20e91c672b0>

### Phraseology Score Prediction

In [57]:
# Hyperparameters for the phraseology model.
embedding_dim = 128
lstm_units = 32
epochs = 10
batch_size = 64

# Embedding -> two stacked LSTMs -> small dense head with one linear output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([Embedding(vocab_size, embedding_dim, input_length = max_sequence_len),
                    LSTM(lstm_units, dropout = 0.2, return_sequences = True),
                    Dropout(0.2),
                    LSTM(lstm_units, dropout = 0.2),
                    Dense(10, activation = "relu"),
                    Dense(1, activation = "linear")])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer, loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])

# NOTE(review): the held-out test split is used as validation data here.
model.fit(padded_sequences_train, y_train_phraseology, batch_size = batch_size, validation_data = (padded_sequences_test, y_test_phraseology),
         epochs = epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20e8a4f4eb0>

### Grammar Score Prediction

In [59]:
# Hyperparameters for the grammar model.
embedding_dim = 128
lstm_units = 32
epochs = 10
batch_size = 64

# Embedding -> two stacked LSTMs -> small dense head with one linear output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([Embedding(vocab_size, embedding_dim, input_length = max_sequence_len),
                    LSTM(lstm_units, dropout = 0.2, return_sequences = True),
                    Dropout(0.2),
                    LSTM(lstm_units, dropout = 0.2),
                    Dense(10, activation = "relu"),
                    Dense(1, activation = "linear")])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer, loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])

# NOTE(review): the held-out test split is used as validation data here.
model.fit(padded_sequences_train, y_train_grammar, batch_size = batch_size, validation_data = (padded_sequences_test, y_test_grammar),
         epochs = epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20e9955c400>

### Conventions Score Prediction

In [61]:
# Hyperparameters for the conventions model.
embedding_dim = 128
lstm_units = 32
epochs = 10
batch_size = 64

# Embedding -> two stacked LSTMs -> small dense head with one linear output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([Embedding(vocab_size, embedding_dim, input_length = max_sequence_len),
                    LSTM(lstm_units, dropout = 0.2, return_sequences = True),
                    Dropout(0.2),
                    LSTM(lstm_units, dropout = 0.2),
                    Dense(10, activation = "relu"),
                    Dense(1, activation = "linear")])

# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
optimizer = Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer, loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])

# NOTE(review): the held-out test split is used as validation data here.
model.fit(padded_sequences_train, y_train_conventions, batch_size = batch_size, validation_data = (padded_sequences_test, y_test_conventions),
         epochs = epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20e9f4b0040>