In [None]:
import pandas as pd
def _load_essay_texts(split, essay_ids):
    """Return the essay text for every entry of `essay_ids`.

    Parameters
    ----------
    split : str
        Sub-directory of the competition data that holds the essay files
        ('train' or 'test').
    essay_ids : pandas.Series
        Essay identifiers; each one is the stem of a ``.txt`` file.

    Returns
    -------
    pandas.Series
        Same index as `essay_ids`, holding the full content of the matching file.

    Each distinct essay file is read exactly once, inside a `with` block, so
    the file handle is closed immediately and rows that share an essay_id do
    not trigger repeated reads of the same file.
    """
    texts = {}
    for essay_id in essay_ids.unique():
        with open(f'/kaggle/input/feedback-prize-effectiveness/{split}/{essay_id}.txt') as essay_file:
            texts[essay_id] = essay_file.read()
    return essay_ids.map(texts)


df_train = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/train.csv')
df_test = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/test.csv')
# Attach the content of the essay file to every row via its essay_id.
df_train["text"] = _load_essay_texts('train', df_train["essay_id"])
df_test["text"] = _load_essay_texts('test', df_test["essay_id"])
df_train.head()

In [None]:
import seaborn as sns
# Dark-grid theme with a 10x10 default figure size, configured in a single call.
sns.set_theme(style="darkgrid", rc={"figure.figsize": (10, 10)})

# Number of rows per discourse type, split by effectiveness label.
sns.countplot(data=df_train, x='discourse_type', hue='discourse_effectiveness')

In [None]:
# Integer class id for each effectiveness label; the submission cell relies on
# Ineffective=0, Adequate=1, Effective=2 when it picks prediction columns.
feedback_map = dict(Adequate=1, Effective=2, Ineffective=0)
df_train["feedback"] = df_train.discourse_effectiveness.map(feedback_map)

In [None]:
# Tokenization setup: import the Keras text utilities and build the raw input strings
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def _build_inputs(frame):
    """Return an array of "<discourse_type> <text>" strings, one per row of `frame`."""
    return frame["discourse_type"].values + ' ' + frame["text"].values


# NOTE(review): "text" is the whole essay loaded per essay_id, so rows from the
# same essay differ only by the discourse_type prefix -- confirm this is intended.
sentences = _build_inputs(df_train)
testing_sentences = _build_inputs(df_test)
# Integer class ids produced by feedback_map.
labels = df_train["feedback"].values

In [None]:
# --- Text-encoding / model hyper-parameters ---
vocab_size = 10000        # passed to Tokenizer(num_words=...) and the Embedding layer
embedding_dim = 16        # size of each embedding vector
max_length = 100          # every sequence is padded/truncated to this many tokens
trunc_type = padding_type = 'post'   # pad and truncate at the end of the sequence
oov_tok = "<OOV>"         # placeholder for out-of-vocabulary words

# --- Positional 80/20 train/validation split (no shuffling) ---
training_size = int(len(sentences) * 0.8)
training_sentences, valid_sentences = sentences[:training_size], sentences[training_size:]
training_labels, valid_labels = labels[:training_size], labels[training_size:]

In [None]:
# The vocabulary is fitted on the training split only; validation and test text
# is encoded with that same vocabulary, unknown words falling back to oov_tok.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)


def _encode(texts):
    """Encode `texts` with the fitted tokenizer.

    Returns a pair: the list of integer id sequences, and the same sequences
    padded/truncated to `max_length` according to padding_type / trunc_type.
    """
    id_sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        id_sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return id_sequences, padded


training_sequences, training_padded = _encode(training_sentences)
valid_sequences, valid_padded = _encode(valid_sentences)
testing_sequences, testing_padded = _encode(testing_sentences)

In [None]:
# Convert the padded inputs and labels to NumPy arrays before handing them to Keras (needed for TensorFlow 2.x)
import numpy as np
# Re-wrap every model input / target as a NumPy array in one pass.
training_padded, training_labels, valid_padded, valid_labels, testing_padded = (
    np.array(values)
    for values in (
        training_padded,
        training_labels,
        valid_padded,
        valid_labels,
        testing_padded,
    )
)

In [None]:
from tensorflow.keras.optimizers import Adam

# Bag-of-embeddings classifier over the padded token ids.
# The input shape is declared with an explicit Input layer rather than the
# Embedding `input_length` argument, which is deprecated in Keras 3. Declaring
# it up front also means the model is built by the time summary() is called.
model = tf.keras.Sequential([
    # Padded token-id sequences of length max_length (pad_sequences yields int32 by default).
    tf.keras.Input(shape=(max_length,), dtype="int32"),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Average the embedding vectors over the sequence positions.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # One softmax probability per class id in feedback_map (3 classes).
    tf.keras.layers.Dense(3, activation='softmax'),
])
# Sparse cross-entropy because the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

In [None]:
# Train for a fixed number of epochs, evaluating on the held-out validation
# split after each one; verbose=2 prints one summary line per epoch.
num_epochs = 3
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(valid_padded, valid_labels),
    verbose=2,
)

In [None]:
# Softmax output for the test rows: despite the name, this holds one score per
# class id (one column per class of the 3-unit output layer), not hard labels.
pred_labels = model.predict(x=testing_padded)

In [None]:
# Load the provided submission template and preview its first rows.
sample_submission = pd.read_csv(
    '/kaggle/input/feedback-prize-effectiveness/sample_submission.csv'
)
sample_submission.head()

In [None]:
# Fill the template with the test ids and the per-class model outputs.
sample_submission['discourse_id'] = df_test['discourse_id']
# Column position in pred_labels is the class id assigned by feedback_map:
# 0 -> Ineffective, 1 -> Adequate, 2 -> Effective.
for class_id, column in enumerate(('Ineffective', 'Adequate', 'Effective')):
    sample_submission[column] = pred_labels[:, class_id]
sample_submission.to_csv("submission.csv", index=False)