### Read the train and test CSVs and load each training essay's full text into a new "text" column

In [None]:
from pathlib import Path

import pandas as pd

# Root folder of the competition data: the CSV files and the per-essay .txt files.
DATA_DIR = Path('/kaggle/input/feedback-prize-effectiveness')


def read_essay(essay_id, split='train'):
    """Return the full text of the essay `essay_id` stored in the `split` folder.

    `Path.read_text` opens and closes the file itself, so applying this to
    every row does not leave one open file handle per row behind.
    """
    # NOTE(review): assumes the essay files are UTF-8 encoded — confirm.
    return (DATA_DIR / split / f'{essay_id}.txt').read_text(encoding='utf-8')


df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')

# Attach the complete essay to every training row in a new "text" column.
df_train["text"] = df_train["essay_id"].apply(read_essay)
# Loading the full test essays is left disabled:
#df_test["text"] = df_test["essay_id"].apply(lambda x: read_essay(x, split='test'))
df_train.head()


### Convert Discourse Effectiveness to numeric codes - 0, 1, 2

In [None]:
# Integer code of each effectiveness class; the code is also used as the
# column position of the class when the one-hot label matrix is built.
effectiveness_map = {"Ineffective": 0, "Adequate": 1, "Effective": 2}
df_train["target"] = df_train["discourse_effectiveness"].map(effectiveness_map)

# `.map` silently produces NaN for any label that is missing or not in the
# mapping, which would turn the column into floats and break the integer
# indexing used to build the labels. Fail here with a readable message instead.
unexpected = df_train.loc[df_train["target"].isna(), "discourse_effectiveness"].unique()
assert len(unexpected) == 0, f"Unmapped discourse_effectiveness values: {list(unexpected)}"
df_train["target"] = df_train["target"].astype("int64")

### Load the Bert Base Tokenizer

In [None]:
from transformers import BertTokenizer

# Directory holding the bert-base-cased tokenizer files.
bert_tokenizer_dir = '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_dir)

### Check whether Discourse Effectiveness is related to Discourse Type

In [None]:
import seaborn as sns

# A single theme call sets both the grid style and the default figure size.
sns.set_theme(style="darkgrid", rc={"figure.figsize": (10, 10)})

# Number of samples per discourse type, split by effectiveness rating.
sns.countplot(data=df_train, x='discourse_type', hue='discourse_effectiveness')

#### It looks like Discourse Effectiveness and Discourse Type have some relation
### Prepend the Discourse Type to the model's input text

In [None]:
# Prefix every training text with its discourse type, separated by the
# tokenizer's separator token, so the model sees the type as part of the input.
df_train['text'] = df_train['discourse_type'] + tokenizer.sep_token + df_train['text']

# The test text must come from the TEST frame's own rows (taking it from
# df_train would index-align training rows onto the test frame).
# Only the raw discourse text is stored here: the discourse-type prefix for the
# test set is added in the prediction cell, right before encoding, so adding it
# here as well would duplicate it.
# NOTE(review): assumes test.csv has a `discourse_text` column like train.csv — confirm.
df_test['text'] = df_test['discourse_text']

# NOTE(review): training text is the full essay while test text is only the
# discourse span — confirm this train/test difference is intended.

### Initialize Input Ids and Attention Masks tensors

In [None]:
import numpy as np

# Preallocated buffers, one row per training sample and 256 token positions.
# Token ids and attention masks are integers, and the model's input layers are
# declared as int32, so allocate int32 rather than numpy's default float64.
X_input_ids = np.zeros((len(df_train), 256), dtype=np.int32)
X_attn_masks = np.zeros((len(df_train), 256), dtype=np.int32)

### Function to Bert Encode the input text
#### This function will convert the input text to input ids and attention masks using bert tokenizer

In [None]:
def encode_data(df, ids, masks, tokenizer):
    """Tokenize ``df['text']`` row by row into the preallocated ``ids`` / ``masks``.

    Parameters
    ----------
    df : DataFrame with a ``'text'`` column holding one input string per row.
    ids : 2-D array with one row per text; row ``i`` receives the token ids.
    masks : 2-D array with one row per text; row ``i`` receives the attention mask.
    tokenizer : tokenizer exposing ``encode_plus``.

    Returns
    -------
    tuple
        The same ``ids`` and ``masks`` objects, filled in place.

    Every text is truncated or padded to the width of ``ids``, so the sequence
    length is defined in one place: the shape of the buffers.
    """
    max_length = int(ids.shape[1])
    # `enumerate` has no length, so the total is passed explicitly to give the
    # progress bar a known end.
    for i, text in tqdm(enumerate(df['text']), total=len(df)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            # The result is copied into numpy buffers, so ask for numpy arrays
            # directly instead of building a TensorFlow tensor per row.
            return_tensors='np'
        )
        # Each returned array has shape (1, max_length); it fills one buffer row.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

### Encode the train text

In [None]:
from tqdm.auto import tqdm

# Fill the preallocated training buffers with token ids and attention masks.
encoded_train = encode_data(df_train, X_input_ids, X_attn_masks, tokenizer)
X_input_ids, X_attn_masks = encoded_train

### Prepare the y label tensor (Discourse effectiveness)

In [None]:
# One-hot encode the integer targets: row k of the 3x3 identity matrix is the
# one-hot vector of class k, so indexing it with the target codes yields one
# 3-element row per training sample.
labels = np.eye(3)[df_train['target'].values]
labels

#### Creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading

In [None]:
import tensorflow as tf

def DatasetMapFunction(input_ids, attn_masks, labels):
    """Repack one (input_ids, attn_masks, labels) triple as (features, labels).

    The features are a dict keyed by 'input_ids' and 'attention_mask';
    the labels are passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

# One dataset element per training sample: (input_ids, attention_mask, label).
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

# Repack every element as ({'input_ids', 'attention_mask'}, label).
dataset = dataset.map(DatasetMapFunction)

# Shuffle once, over the whole dataset, in a fixed order:
#   * a buffer as large as the dataset gives a full shuffle (a smaller buffer
#     only mixes samples that are close to each other in the original order);
#   * reshuffle_each_iteration=False keeps the order identical on every pass.
#     The default (True) would produce a new order each time the dataset is
#     iterated, so a later take()/skip() split would select different samples
#     on every pass and validation samples would leak into training;
#   * the seed makes the split reproducible across runs.
# Batches hold 16 samples; the incomplete last batch is dropped.
dataset = (
    dataset
    .shuffle(len(X_input_ids), seed=42, reshuffle_each_iteration=False)
    .batch(16, drop_remainder=True)
)

### Split the train dataset into training and validation dataset in 80:20

In [None]:
# Fraction of the batches used for training; the rest is used for validation.
p = 0.8
# The dataset is batched in groups of 16, so it holds len(df_train) // 16 batches.
n_batches = len(df_train) // 16
train_size = int(n_batches * p)
# NOTE(review): take/skip only give a stable, disjoint split if `dataset`
# yields its batches in the same order on every iteration — confirm.
train_dataset, val_dataset = dataset.take(train_size), dataset.skip(train_size)

### Load Bert Base Cased Model

In [None]:
from transformers import TFBertModel

# BERT base (cased) model with its pretrained weights, loaded from this directory.
bert_weights_dir = '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
model = TFBertModel.from_pretrained(bert_weights_dir)

### Define the model with 2 input layers for input_ids and attn_masks

In [None]:
Input, Dense = tf.keras.layers.Input, tf.keras.layers.Dense

# Two integer inputs of 256 positions each; their names match the keys of the
# feature dict produced by the dataset pipeline.
input_ids = Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = Input(shape=(256,), name='attention_mask', dtype='int32')

# Index 0 of the BERT output is the per-token activations (3D),
# index 1 is the pooled output (2D), which feeds the classification head.
bert_outputs = model.bert(input_ids, attention_mask=attn_masks)
bert_embds = bert_outputs[1]

# Classification head: a 512-unit ReLU layer, then a 3-way softmax that turns
# the scores into class probabilities.
intermediate_layer = Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = Dense(3, activation='softmax', name='output_layer')(intermediate_layer)

discourse_model = tf.keras.Model(
    inputs=[input_ids, attn_masks],
    outputs=output_layer,
)
discourse_model.summary()

### Set the Loss, Optimizer and Metrics parameters for the model

In [None]:
from tensorflow.keras.optimizers import Adam

# Adam with a small learning rate and a decay term.
optimizer = Adam(learning_rate=1e-5, decay=1e-6)

# Categorical cross-entropy pairs with the one-hot labels and the softmax
# output; accuracy is tracked as the metric.
discourse_model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

### Fit the model
##### The number of epochs was set to 5 because the validation accuracy seems to deteriorate with more epochs

In [None]:
# Training settings: 5 epochs of 200 batches each, evaluated on the validation
# split after every epoch.
fit_settings = dict(
    steps_per_epoch=200,
    validation_data=val_dataset,
    epochs=5,
)
history = discourse_model.fit(train_dataset, **fit_settings)

### Encode the test dataset and Predict

In [None]:
# Build the test input from the test frame's raw columns: the discourse type,
# the tokenizer's separator token, then the discourse text. Building it from
# the raw columns (rather than prefixing whatever df_test['text'] already
# holds) guarantees exactly one prefix and makes this cell safe to re-run.
# The tokenizer loaded earlier in the notebook is reused.
# NOTE(review): assumes test.csv has a `discourse_text` column like train.csv — confirm.
df_test['text'] = df_test['discourse_type'] + tokenizer.sep_token + df_test['discourse_text']

# Integer buffers, one row per test sample and 256 token positions, matching
# the int32 inputs declared for the model.
X_test_input_ids = np.zeros((len(df_test), 256), dtype=np.int32)
X_test_attn_masks = np.zeros((len(df_test), 256), dtype=np.int32)
X_test_input_ids, X_test_attn_masks = encode_data(df_test, X_test_input_ids, X_test_attn_masks, tokenizer)

# Inputs are passed in the order the model declares them: ids, then masks.
# Each row of the result holds the three class probabilities of one sample.
pred_labels = discourse_model.predict([X_test_input_ids, X_test_attn_masks])

### Generate the Sample Submission file

In [None]:
# Template of the submission file; its probability columns are overwritten below.
sample_submission_path = '/kaggle/input/feedback-prize-effectiveness/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)
sample_submission.head()

In [None]:
sample_submission['discourse_id'] = df_test['discourse_id']

# Column k of the predictions is the probability of the class encoded as k
# (Ineffective = 0, Adequate = 1, Effective = 2).
class_columns = ('Ineffective', 'Adequate', 'Effective')
for class_idx, class_name in enumerate(class_columns):
    sample_submission[class_name] = pred_labels[:, class_idx]

sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame that was written to submission.csv.
sample_submission