In [160]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dropout
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras.layers import Input, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from sklearn.metrics import accuracy_score

import tensorflow as tf
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras.layers import concatenate

import warnings
warnings.filterwarnings("ignore")

In [161]:
df = pd.read_csv('ASAP Dataset/Preprocessed_df.csv')

In [162]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,word_len,chars_len,avg_word_length,avg_sentence_length,pos_ratios,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,386,1875,3.984456,1.0,"{'NNP': 0.031088082901554404, 'JJ': 0.05181347...",16,1,0.310471,0.385613,dear local newspaper think effect computer peo...
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,464,2288,4.030172,1.0,"{'NNP': 0.03879310344827586, ',': 0.0258620689...",20,1,0.274,0.613167,dear believe using computer benefit u many way...
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,313,1541,4.035144,1.0,"{'NNP': 0.04153354632587859, ',': 0.0287539936...",14,1,0.340393,0.498657,dear people use computer everyone agrees benef...
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,611,3165,4.328969,1.0,"{'NNP': 0.11620294599018004, ',': 0.0212765957...",27,1,0.266828,0.441795,dear local newspaper found many expert say com...
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,517,2569,4.071567,1.0,"{'NNP': 0.017408123791102514, ',': 0.025145067...",30,1,0.199684,0.485814,dear know computer positive effect people comp...


In [163]:
df = df.dropna(axis = 1, how = 'any')

In [164]:
drop_columns = ['essay_id', 'pos_ratios', 'essay', 'rater1_domain1', 'rater2_domain1']
df.drop(drop_columns, axis = 1, inplace = True)

In [165]:
def calculate_precision(y_true, y_pred, average='macro'):
    precision = precision_score(y_true, y_pred, average=average)
    return precision

def calculate_recall(y_true, y_pred, average='macro'):
    recall = recall_score(y_true, y_pred, average=average)
    return recall

def calculate_f1_score(y_true, y_pred, average='macro'):
    f1 = f1_score(y_true, y_pred, average=average)
    return f1

def calculate_cohen_kappa_score(y_true, y_pred):
    kappa_score = cohen_kappa_score(y_true, y_pred, weights = 'quadratic')
    return kappa_score

def calculate_accuracy(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

def print_metrics_function(y_actual, y_predictions):
    
    accuracy = calculate_accuracy(y_actual, y_predictions)
    precision = calculate_precision(y_actual, y_predictions)
    recall = calculate_recall(y_actual, y_predictions)
    kappa_score = calculate_cohen_kappa_score(y_actual, y_predictions)

    return accuracy, precision, recall, f1, kappa_score

In [168]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

### Model with Metrics (Essay Set - 1)

In [169]:
df_essay_set = df[df.essay_set == 1]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [170]:
print("The shape of the input train data: {}".format(X_train.shape))
print("The shape of the input test data: {}".format(X_test.shape))
print("---------------------------------------------------")
print("The shape of the output train data: {}".format(y_train.shape))
print("The shape of the output test data: {}".format(y_test.shape))

The shape of the input train data: (1426, 10)
The shape of the input test data: (357, 10)
---------------------------------------------------
The shape of the output train data: (1426, 13)
The shape of the output test data: (357, 13)


In [171]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [172]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5bbdb4730>

In [173]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.3445378151260504
Precision: 0.1839159439447185
Recall: 0.167596345187071
F1-Score: 0.150305207126839
Cohen Kappa Score: 0.34645857179608586


(0.3445378151260504,
 0.1839159439447185,
 0.167596345187071,
 0.150305207126839,
 0.34645857179608586)

### Model with Metrics (Essay Set - 2)

In [174]:
df_essay_set = df[df.essay_set == 2]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [175]:
print("The shape of the input train data: {}".format(X_train.shape))
print("The shape of the input test data: {}".format(X_test.shape))
print("---------------------------------------------------")
print("The shape of the output train data: {}".format(y_train.shape))
print("The shape of the output test data: {}".format(y_test.shape))

The shape of the input train data: (1440, 10)
The shape of the input test data: (360, 10)
---------------------------------------------------
The shape of the output train data: (1440, 7)
The shape of the output test data: (360, 7)


In [176]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [177]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5a4ca8370>

In [178]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.5055555555555555
Precision: 0.2729098488390213
Recall: 0.2901935812531839
F1-Score: 0.28066848634424596
Cohen Kappa Score: 0.3961397058823529


(0.5055555555555555,
 0.2729098488390213,
 0.2901935812531839,
 0.28066848634424596,
 0.3961397058823529)

### Model with Metrics (Essay Set - 4)

In [206]:
df_essay_set = df[df.essay_set == 4]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [207]:
print("The shape of the input train data: {}".format(X_train.shape))
print("The shape of the input test data: {}".format(X_test.shape))
print("---------------------------------------------------")
print("The shape of the output train data: {}".format(y_train.shape))
print("The shape of the output test data: {}".format(y_test.shape))

The shape of the input train data: (1416, 10)
The shape of the input test data: (354, 10)
---------------------------------------------------
The shape of the output train data: (1416, 4)
The shape of the output test data: (354, 4)


In [208]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [209]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=2, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5c2b79970>

In [210]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.5932203389830508
Precision: 0.6125039361979661
Recall: 0.533907416091646
F1-Score: 0.548260722338866
Cohen Kappa Score: 0.6454184111505644


(0.5932203389830508,
 0.6125039361979661,
 0.533907416091646,
 0.548260722338866,
 0.6454184111505644)

### Model with Metrics (Essay Set - 5)

In [211]:
df_essay_set = df[df.essay_set == 5]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [212]:
print("The shape of the input train data: {}".format(X_train.shape))
print("The shape of the input test data: {}".format(X_test.shape))
print("---------------------------------------------------")
print("The shape of the output train data: {}".format(y_train.shape))
print("The shape of the output test data: {}".format(y_test.shape))

The shape of the input train data: (1444, 10)
The shape of the input test data: (361, 10)
---------------------------------------------------
The shape of the output train data: (1444, 5)
The shape of the output test data: (361, 5)


In [213]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [214]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5bbe6e9d0>

In [215]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.4930747922437673
Precision: 0.3903352650101876
Recall: 0.3668858418891784
F1-Score: 0.37316175994098716
Cohen Kappa Score: 0.5901031608163583


(0.4930747922437673,
 0.3903352650101876,
 0.3668858418891784,
 0.37316175994098716,
 0.5901031608163583)

### Model with Metrics (Essay Set - 6)

In [216]:
df_essay_set = df[df.essay_set == 6]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [217]:
print("The shape of the input train data: {}".format(X_train.shape))
print("The shape of the input test data: {}".format(X_test.shape))
print("---------------------------------------------------")
print("The shape of the output train data: {}".format(y_train.shape))
print("The shape of the output test data: {}".format(y_test.shape))

The shape of the input train data: (1440, 10)
The shape of the input test data: (360, 10)
---------------------------------------------------
The shape of the output train data: (1440, 5)
The shape of the output test data: (360, 5)


In [218]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [219]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5ae054430>

In [220]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.55
Precision: 0.4080289876858707
Recall: 0.39879325876435706
F1-Score: 0.4019728936534427
Cohen Kappa Score: 0.5869056897895557


(0.55,
 0.4080289876858707,
 0.39879325876435706,
 0.4019728936534427,
 0.5869056897895557)

### Model with Metrics (Essay Set - 8)

In [225]:
df_essay_set = df[df.essay_set == 8]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [226]:
print("The shape of the input train data: {}".format(X_train.shape))
print("The shape of the input test data: {}".format(X_test.shape))
print("---------------------------------------------------")
print("The shape of the output train data: {}".format(y_train.shape))
print("The shape of the output test data: {}".format(y_test.shape))

The shape of the input train data: (578, 10)
The shape of the input test data: (145, 10)
---------------------------------------------------
The shape of the output train data: (578, 61)
The shape of the output test data: (145, 61)


In [227]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [228]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5ad8c2910>

In [229]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.14482758620689656
Precision: 0.019693877551020405
Recall: 0.04356921813818366
F1-Score: 0.025971682336431417
Cohen Kappa Score: -0.09739861043787368


(0.14482758620689656,
 0.019693877551020405,
 0.04356921813818366,
 0.025971682336431417,
 -0.09739861043787368)