In [None]:
# Install the extra packages this notebook imports (quiet mode, versions unpinned).
!pip install -q pandarallel
!pip install -q spacy 
!pip install -q spacy_cld
!pip install -q pyspellchecker
# Download the spaCy model imported below as xx_ent_wiki_sm; stdout is discarded.
!python -m spacy download xx_ent_wiki_sm > /dev/null

In [None]:
import os
import gc
import time
import random

import numpy as np
import pandas as pd

import spacy
from spacy_cld import LanguageDetector
import xx_ent_wiki_sm

from spellchecker import SpellChecker

import matplotlib.pyplot as plt
%matplotlib inline

import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()

import re
import nltk

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, StratifiedKFold

from wordcloud import WordCloud, STOPWORDS
from sklearn.metrics import accuracy_score, roc_auc_score
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras import layers
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tokenizers import BertWordPieceTokenizer
from colorama import Fore, Back, Style, init
import plotly.graph_objects as go

from tensorflow.keras.layers import (Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU,
                          Embedding, Flatten, Dropout, Add, concatenate, MaxPooling1D,
                         GlobalAveragePooling1D,  GlobalMaxPooling1D, GlobalMaxPool1D,
                        SpatialDropout1D)

from tensorflow.keras import (initializers, regularizers, constraints, 
                              optimizers, layers, callbacks)
import seaborn as sns
sns.set(style="darkgrid")

# TPU configuration

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: treat that as "no TPU" and fall back below.
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow releases expose this as tf.distribute.TPUStrategy;
    # confirm which spelling the installed version prefers.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is used below to scale the global batch size.
print("REPLICAS: ", strategy.num_replicas_in_sync)

# Configuration

In [None]:
# Let tf.data choose the prefetch buffer size automatically.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# NOTE(review): GCS_DS_PATH is not referenced by any other visible cell.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
EPOCHS     = 50                                  # epochs for the first fit; the second fit uses EPOCHS*2
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # global batch: 16 examples per replica
MAX_LEN    = 512                                 # token length every text is padded/truncated to
MODEL      = 'jplu/tf-xlm-roberta-large'         # checkpoint id used for both tokenizer and encoder

# Dataset

In [None]:
# Read the competition's train and test tables, then report (rows, columns) for each.
dataset_tr = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/train.csv')
dataset_te = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/test.csv')
print(dataset_tr.shape, dataset_te.shape, sep='\n')

**Concatenate the essay text to the train dataset**

In [None]:
def _read_train_essay(essay_id):
    """Return the full text of one training essay.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the text has been read (the previous ``open(...).read()`` left it open).
    """
    with open(f'/kaggle/input/feedback-prize-effectiveness/train/{essay_id}.txt') as essay_file:
        return essay_file.read()


# Rows that share an essay_id get the same text, so read each essay file once
# and map the result onto the rows instead of re-opening the file per row.
essay_texts = {essay_id: _read_train_essay(essay_id) for essay_id in dataset_tr['essay_id'].unique()}
dataset_tr['text'] = dataset_tr['essay_id'].map(essay_texts)
dataset_tr.head(2)

**Map the labels to numerical categories**

In [None]:
# Integer class ids follow the position in this tuple: Ineffective=0, Adequate=1, Effective=2.
effectiveness_map    = {name: class_id for class_id, name in enumerate(("Ineffective", "Adequate", "Effective"))}
dataset_tr["target"] = dataset_tr["discourse_effectiveness"].map(effectiveness_map)

# Tokenizer

In [None]:
# Load the tokenizer that matches the MODEL checkpoint chosen in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a collection of texts into a fixed-width array of input ids.

    Args:
        texts: Iterable of strings (a list or a pandas Series both work; it is
            materialised into a list before being handed to the tokenizer).
        tokenizer: Object exposing ``batch_encode_plus`` (e.g. a Hugging Face
            tokenizer).
        maxlen: Number of tokens every row is padded or truncated to.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the tokenizer
        output, one row per text.
    """
    enc_di = tokenizer.batch_encode_plus(
        list(texts),
        # The keyword is singular; the previous plural spelling is not the
        # name used alongside ``padding``/``truncation`` in encode_data.
        return_attention_mask = False,
        return_token_type_ids = False,
        # Same padding/truncation arguments as encode_data. Truncation is
        # required so texts longer than maxlen do not produce ragged rows.
        padding               = 'max_length',
        truncation            = True,
        max_length            = maxlen,
    )

    return np.array(enc_di['input_ids'])

def encode_data(df, ids, masks, tokenizer, maxlen=512):
    """Tokenize ``df['text']`` row by row into pre-allocated arrays.

    Args:
        df: DataFrame with a ``'text'`` column.
        ids: 2-D array with one row per text and ``maxlen`` columns; filled
            in place with the input ids.
        masks: Array of the same layout as ``ids``; filled in place with the
            attention mask.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        maxlen: Number of tokens each text is padded or truncated to.

    Returns:
        The same ``(ids, masks)`` objects that were passed in, now filled.
    """
    # enumerate() has no length, so pass the total explicitly; otherwise the
    # progress bar cannot show a percentage or an ETA.
    for i, text in tqdm(enumerate(df['text']), total=len(df)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=maxlen, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Each output is written into row i of the caller's buffers.
        ids[i, :]   = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
# Sanity check: number of rows that will be tokenized.
dataset_tr['text'].shape

In [None]:
# x_train = encode_data(dataset_tr['text'], tokenizer, maxlen=MAX_LEN)
# x_train.shape

# Pre-allocate integer buffers. Token ids and attention masks are integers and
# the model's Input layer is declared int32, whereas np.zeros defaults to
# float64 (twice the memory and a dtype mismatch with the model input).
X_input_ids  = np.zeros((len(dataset_tr), MAX_LEN), dtype=np.int32)
X_attn_masks = np.zeros((len(dataset_tr), MAX_LEN), dtype=np.int32)

# encode_data fills both buffers in place and returns them.
X_input_ids, X_attn_masks = encode_data(dataset_tr, X_input_ids, X_attn_masks, tokenizer, maxlen=MAX_LEN)

# Prepare labels

In [None]:
# One-hot encode the integer targets: row k of the 3x3 identity matrix is the
# one-hot vector for class k, so indexing it by the targets yields (n_rows, 3).
labels = np.eye(3)[dataset_tr['target'].values]

print(labels.shape)

In [None]:
# def DatasetMapFunction(input_ids, attn_masks, labels):
#     return {
#         \
#         'input_ids': input_ids,
#         'attention_mask': attn_masks
#     }, labels

# dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
# dataset = dataset.map(DatasetMapFunction)     # converting to required format for tensorflow dataset
# dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

# Split dataset

In [None]:
# p = 0.8
# # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.
# train_size    = int((len(dataset_tr)//16)*p)
# train_dataset = dataset.take(train_size)
# val_dataset   = dataset.skip(train_size)

# Hold out 10% of the encoded rows; random_state fixes the split for reproducibility.
# NOTE(review): X_attn_masks is not split here and is not used by any later visible cell.
X_train, X_test, y_train, y_test = train_test_split(X_input_ids, labels, test_size = 0.10, random_state = 0)

In [None]:
# Training pipeline: repeats indefinitely, so model.fit must be given steps_per_epoch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Held-out pipeline: a single pass in fixed order, cached after batching.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test, y_test))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# test_dataset = (
#     tf.data.Dataset
#     .from_tensor_slices(x_test)
#     .batch(BATCH_SIZE)
# )

# Model

In [None]:
def build_model(transformer, max_len=512):
    """Build and compile a 3-class classifier on top of a transformer encoder.

    Args:
        transformer: Encoder that is called on the token ids and whose first
            output is indexed as ``[batch, position, hidden]``.
        max_len: Length of the token-id sequences fed to the model.

    Returns:
        A compiled Keras ``Model`` that maps ``input_word_ids`` of shape
        ``(max_len,)`` to a softmax over the three classes.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Derive the attention mask from the ids so padded positions are not
    # attended to. The model keeps its single input, so callers are unchanged.
    # If the encoder does not expose a pad token id, fall back to calling it
    # on the ids alone.
    pad_token_id = getattr(getattr(transformer, 'config', None), 'pad_token_id', None)
    if pad_token_id is None:
        sequence_output = transformer(input_word_ids)[0]
    else:
        attention_mask = tf.keras.layers.Lambda(
            lambda token_ids: tf.cast(tf.not_equal(token_ids, pad_token_id), tf.int32),
            name="attention_mask",
        )(input_word_ids)
        sequence_output = transformer(input_word_ids, attention_mask=attention_mask)[0]

    # Position 0 holds the first (classification) token:
    # [all sentences, token (0 for CLS), hidden units output].
    cls_token = sequence_output[:, 0, :]
    out = Dense(3, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # The head is a 3-way softmax trained on one-hot labels, so the matching
    # loss is categorical cross-entropy. With binary cross-entropy Keras also
    # resolves 'accuracy' to binary accuracy, which overstates performance.
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
# Load the pretrained encoder and build the classifier inside the distribution
# strategy's scope; the summary is printed once the scope has been left.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model             = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# input_ids  = tf.keras.layers.Input(shape=(512,), name='input_ids',      dtype='int32')
# attn_masks = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

# Train Model

In [None]:
import time
# train_dataset repeats forever, so one "epoch" is defined as this many full batches.
n_steps = X_train.shape[0] // BATCH_SIZE


# Wall-clock timing of the whole fit call.
start_time    =  time.time()

train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

training_time = time.time() - start_time
print('Training time: ', training_time, ' sec')

In [None]:
# model.save('XLM_Roberta')
# model.save('XLM_Roberta.h5')

In [None]:
# Accuracy per epoch for the first fit: training curve first, held-out curve second.
fig, ax = plt.subplots()
for metric_name in ('accuracy', 'val_accuracy'):
    ax.plot(train_history.history[metric_name])
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Second fit: continue training the same model on the held-out split only.
# NOTE(review): this overwrites n_steps, trains on the data that served as
# validation in the first fit, and passes no validation_data of its own.
n_steps = X_test.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),  # repeat() because steps_per_epoch drives the epoch length
    steps_per_epoch=n_steps,
    epochs=EPOCHS*2
)

In [None]:
# Accuracy per epoch for the second fit (only a training curve exists for it).
fig, ax = plt.subplots()
ax.plot(train_history_2.history['accuracy'])
ax.set_title('Model accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train'], loc='upper left')
plt.show()

In [None]:
# Convert both Keras history dicts to DataFrames (one row per epoch, one column per metric).
hist_df1 = pd.DataFrame(train_history.history)
hist_df2 = pd.DataFrame(train_history_2.history) 

# Persist the first fit's history as CSV in the working directory.
hist_file1 = 'train_history.csv'
with open(hist_file1, mode='w') as f:
    hist_df1.to_csv(f)

# Persist the second fit's history (the run on the held-out split).
hist_file2 = 'valid_history.csv'
with open(hist_file2, mode='w') as f:
    hist_df2.to_csv(f)

# Predictions

In [None]:
# Sanity check: number of test rows to predict.
dataset_te['discourse_type'].shape

In [None]:
def _read_test_essay(essay_id):
    """Return the full text of one test essay, closing the file afterwards."""
    with open(f'/kaggle/input/feedback-prize-effectiveness/test/{essay_id}.txt') as essay_file:
        return essay_file.read()


# The model was trained on the full essay text of each row, so inference must
# be fed the same kind of input. The previous code tokenized the short
# `discourse_type` label instead, which does not match the training input.
test_essay_texts = {essay_id: _read_test_essay(essay_id) for essay_id in dataset_te['essay_id'].unique()}
test_frame = dataset_te.assign(text=dataset_te['essay_id'].map(test_essay_texts))

# Integer buffers sized by MAX_LEN (not a hard-coded 512) to match the int32 model input.
X_test_input_ids  = np.zeros((len(test_frame), MAX_LEN), dtype=np.int32)
X_test_attn_masks = np.zeros((len(test_frame), MAX_LEN), dtype=np.int32)

# Reuse the training-time encoder so train and test are tokenized identically.
X_test_input_ids, X_test_attn_masks = encode_data(
    test_frame, X_test_input_ids, X_test_attn_masks, tokenizer, maxlen=MAX_LEN
)

# One row per test example, with one probability column per class.
pred_labels = model.predict(X_test_input_ids)

# Submission

In [None]:
# Start from the provided template so the output has the expected columns.
sample_submission = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/sample_submission.csv')

# NOTE(review): the id assignment aligns on the DataFrame index and the probability
# columns on position; this assumes the template has the same rows as dataset_te — confirm.
sample_submission['discourse_id'] = dataset_te['discourse_id']
# Column k of pred_labels is class k of effectiveness_map (0=Ineffective, 1=Adequate, 2=Effective).
sample_submission['Ineffective']  = pred_labels[:,0]
sample_submission['Adequate']     = pred_labels[:,1]
sample_submission['Effective']    = pred_labels[:,2]
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission table for a final visual check.
sample_submission