In [44]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dropout
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras.layers import Input, GlobalMaxPooling1D, Concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from sklearn.metrics import accuracy_score

import tensorflow as tf
from sklearn.metrics import cohen_kappa_score
from tensorflow.keras.layers import concatenate

import warnings
warnings.filterwarnings("ignore")

In [45]:
df = pd.read_csv('ASAP Dataset/Preprocessed_df.csv')

In [46]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,word_len,chars_len,avg_word_length,avg_sentence_length,pos_ratios,num_sentences,num_paragraphs,sentiment_polariy,sentiment_subjectivity,preprocessed_text
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,386,1875,3.984456,1.0,"{'NNP': 0.031088082901554404, 'JJ': 0.05181347...",16,1,0.310471,0.385613,dear local newspaper think effect computer peo...
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,464,2288,4.030172,1.0,"{'NNP': 0.03879310344827586, ',': 0.0258620689...",20,1,0.274,0.613167,dear believe using computer benefit u many way...
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,313,1541,4.035144,1.0,"{'NNP': 0.04153354632587859, ',': 0.0287539936...",14,1,0.340393,0.498657,dear people use computer everyone agrees benef...
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,611,3165,4.328969,1.0,"{'NNP': 0.11620294599018004, ',': 0.0212765957...",27,1,0.266828,0.441795,dear local newspaper found many expert say com...
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,517,2569,4.071567,1.0,"{'NNP': 0.017408123791102514, ',': 0.025145067...",30,1,0.199684,0.485814,dear know computer positive effect people comp...


In [47]:
df = df.dropna(axis = 1, how = 'any')

In [48]:
drop_columns = ['essay_id', 'pos_ratios', 'essay', 'rater1_domain1', 'rater2_domain1']
df.drop(drop_columns, axis = 1, inplace = True)

In [49]:
def calc_precision(y_true, y_pred, average='macro'):
    """
    Calculates the precision score between the true and predicted values
    """
    precision = precision_score(y_true, y_pred, average=average)
    return precision

def calc_recall(y_true, y_pred, average='macro'):
    """
    Calculates the recall score between the true and predicted values
    """
    recall = recall_score(y_true, y_pred, average=average)
    return recall

def calc_f1_score(y_true, y_pred, average='macro'):
    """
    Calculates the f1-score between the true and predicted values
    """
    f1 = f1_score(y_true, y_pred, average=average)
    return f1

def calc_cohen_kappa_score(y_true, y_pred):
    """
    Calculates the cohen kappa score between the true and predicted values
    """
    kappa_score = cohen_kappa_score(y_true, y_pred, weights = 'quadratic')
    return kappa_score

def calc_accuracy(y_true, y_pred):
    """
    Calculates the accuracy score between the true and predicted values
    """
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

In [50]:
def print_metrics_function(y_actual, y_predictions):
    
    # Calculate and print accuracy
    accuracy = calc_accuracy(y_actual, y_predictions)
    print("Accuracy:", accuracy)
    
    # Calculate and print precision
    precision = calc_precision(y_actual, y_predictions)
    print("Precision:", precision)

    # Calculate and print recall
    recall = calc_recall(y_actual, y_predictions)
    print("Recall:", recall)

    # Calculate and print f1-score
    f1 = calc_f1_score(y_actual, y_predictions)
    print("F1-Score:", f1)

    # Calculate and print Cohen Kappa Score
    kappa_score = calc_cohen_kappa_score(y_actual, y_predictions)
    print("Cohen Kappa Score:", kappa_score)

    return accuracy, precision, recall, f1, kappa_score

In [51]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

In [52]:
def spell_corrector(tokens):
    spell_checker = SpellChecker()
    correct_tokens = []
    for token in tqdm(tokens):
        if spell_checker.correction(token.lower()):
            correct_tokens.append(spell_checker.correction(token.lower()))
        else:
            correct_tokens.append(token.lower())
    
    return ' '.join(correct_tokens)

In [53]:
def dataset_preparation(data, target = 'domain1_score'):
    
    X = data.drop([target], axis = 1)
    y = data[target]
    
    return X, y

### Model with Metrics (Essay Set - 1)

In [56]:
df_essay_set = df[df.essay_set == 1]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [57]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [66]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5680dddf0>

In [67]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.3333333333333333
Precision: 0.13276675474969002
Recall: 0.16239541708291708
F1-Score: 0.14241225605270294
Cohen Kappa Score: 0.4073113882641587


(0.3333333333333333,
 0.13276675474969002,
 0.16239541708291708,
 0.14241225605270294,
 0.4073113882641587)

### Model with Metrics (Essay Set - 2)

In [98]:
df_essay_set = df[df.essay_set == 2]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [57]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [66]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5680dddf0>

In [67]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.3333333333333333
Precision: 0.13276675474969002
Recall: 0.16239541708291708
F1-Score: 0.14241225605270294
Cohen Kappa Score: 0.4073113882641587


(0.3333333333333333,
 0.13276675474969002,
 0.16239541708291708,
 0.14241225605270294,
 0.4073113882641587)

### Model with Metrics (Essay Set - 3)

In [99]:
df_essay_set = df[df.essay_set == 3]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [69]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [70]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b577576e50>

In [71]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.5333333333333333
Precision: 0.41988319273199226
Recall: 0.31073339231931074
F1-Score: 0.3278568922005577
Cohen Kappa Score: 0.4396887159533074


(0.5333333333333333,
 0.41988319273199226,
 0.31073339231931074,
 0.3278568922005577,
 0.4396887159533074)

### Model with Metrics (Essay Set - 4)

In [105]:
df_essay_set = df[df.essay_set == 4]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [78]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [82]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 1000
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=2, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  indices[4,1006] = -1 is not in [0, 5423)
	 [[node functional_21/embedding_12/embedding_lookup (defined at Users\suhas maddali\AppData\Local\Temp\ipykernel_4016\4016339990.py:36) ]]
	 [[functional_21/embedding_12/embedding_lookup/_24]]
  (1) Invalid argument:  indices[4,1006] = -1 is not in [0, 5423)
	 [[node functional_21/embedding_12/embedding_lookup (defined at Users\suhas maddali\AppData\Local\Temp\ipykernel_4016\4016339990.py:36) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_17808]

Errors may have originated from an input operation.
Input Source operations connected to node functional_21/embedding_12/embedding_lookup:
 functional_21/embedding_12/embedding_lookup/17500 (defined at Anaconda_latest\envs\englishlanguagelearning_gpu\lib\contextlib.py:113)

Input Source operations connected to node functional_21/embedding_12/embedding_lookup:
 functional_21/embedding_12/embedding_lookup/17500 (defined at Anaconda_latest\envs\englishlanguagelearning_gpu\lib\contextlib.py:113)

Function call stack:
train_function -> train_function


In [67]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.3333333333333333
Precision: 0.13276675474969002
Recall: 0.16239541708291708
F1-Score: 0.14241225605270294
Cohen Kappa Score: 0.4073113882641587


(0.3333333333333333,
 0.13276675474969002,
 0.16239541708291708,
 0.14241225605270294,
 0.4073113882641587)

### Model with Metrics (Essay Set - 5)

In [106]:
df_essay_set = df[df.essay_set == 5]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [84]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [85]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5ad772d00>

In [86]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.519774011299435
Precision: 0.5171502914337018
Recall: 0.49888738527500504
F1-Score: 0.5004295459696358
Cohen Kappa Score: 0.6503452243958574


(0.519774011299435,
 0.5171502914337018,
 0.49888738527500504,
 0.5004295459696358,
 0.6503452243958574)

### Model with Metrics (Essay Set - 6)

In [107]:
df_essay_set = df[df.essay_set == 6]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [88]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [89]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5ad6adb80>

In [90]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.5346260387811634
Precision: 0.41769759450171823
Recall: 0.4458085458886206
F1-Score: 0.42794265718441943
Cohen Kappa Score: 0.6356448967576434


(0.5346260387811634,
 0.41769759450171823,
 0.4458085458886206,
 0.42794265718441943,
 0.6356448967576434)

### Model with Metrics (Essay Set - 7)

In [108]:
df_essay_set = df[df.essay_set == 7]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [92]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [93]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b5a4ae0340>

In [94]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.49444444444444446
Precision: 0.3984834302794085
Recall: 0.3670094837724896
F1-Score: 0.37290524272114906
Cohen Kappa Score: 0.5657507360157017


(0.49444444444444446,
 0.3984834302794085,
 0.3670094837724896,
 0.37290524272114906,
 0.5657507360157017)

### Model with Metrics (Essay Set - 8)

In [109]:
df_essay_set = df[df.essay_set == 8]
X, y = dataset_preparation(df_essay_set)
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, 
                                                    random_state = 101, test_size = 0.2,)

In [96]:
X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values

In [97]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['preprocessed_text'])
sequences_train = tokenizer.texts_to_sequences(X_train['preprocessed_text'])
sequences_test = tokenizer.texts_to_sequences(X_test['preprocessed_text'])

# Pad the sequences to a fixed length
max_length = 750
padded_sequences_train = pad_sequences(sequences_train, maxlen=max_length)
padded_sequences_test = pad_sequences(sequences_test, maxlen=max_length)

X_train_additional_features = X_train[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')
X_test_additional_features = X_test[['word_len', 'chars_len', 'avg_word_length', 'avg_sentence_length',
       'num_sentences', 'num_paragraphs', 'sentiment_polariy',
       'sentiment_subjectivity']].values.astype('int32')

# Define the model architecture
text_input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length)(text_input)
conv_layer = Conv1D(filters=10, kernel_size=5, activation='relu')(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(conv_layer)
dropout_layer = Dropout(0.2)(pooling_layer)
dense_layer1 = Dense(units=16, activation='relu')(dropout_layer)
output_layer = Dense(units=y_train.shape[1], activation='softmax')(dense_layer1)
model = Model(inputs = text_input, outputs = output_layer)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

padded_sequences_train_concat = concatenate([padded_sequences_train, X_train_additional_features])
padded_sequences_test_concat = concatenate([padded_sequences_test, X_test_additional_features])

# Train the model
model.fit(padded_sequences_train_concat, y_train, epochs=10, batch_size=32, validation_data=(padded_sequences_test_concat, y_test))


Epoch 1/10

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  indices[0,756] = -1 is not in [0, 8333)
	 [[node functional_29/embedding_16/embedding_lookup (defined at Users\suhas maddali\AppData\Local\Temp\ipykernel_4016\4194731421.py:36) ]]
	 [[functional_29/embedding_16/embedding_lookup/_24]]
  (1) Invalid argument:  indices[0,756] = -1 is not in [0, 8333)
	 [[node functional_29/embedding_16/embedding_lookup (defined at Users\suhas maddali\AppData\Local\Temp\ipykernel_4016\4194731421.py:36) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_26571]

Errors may have originated from an input operation.
Input Source operations connected to node functional_29/embedding_16/embedding_lookup:
 functional_29/embedding_16/embedding_lookup/26263 (defined at Anaconda_latest\envs\englishlanguagelearning_gpu\lib\contextlib.py:113)

Input Source operations connected to node functional_29/embedding_16/embedding_lookup:
 functional_29/embedding_16/embedding_lookup/26263 (defined at Anaconda_latest\envs\englishlanguagelearning_gpu\lib\contextlib.py:113)

Function call stack:
train_function -> train_function


In [67]:
y_predictions = model.predict(padded_sequences_test_concat)
y_predictions = np.argmax(y_predictions, axis = 1)
print_metrics_function(np.argmax(y_test, axis = 1), y_predictions)

Accuracy: 0.3333333333333333
Precision: 0.13276675474969002
Recall: 0.16239541708291708
F1-Score: 0.14241225605270294
Cohen Kappa Score: 0.4073113882641587


(0.3333333333333333,
 0.13276675474969002,
 0.16239541708291708,
 0.14241225605270294,
 0.4073113882641587)