In [None]:
# A dependency of the preprocessing for BERT inputs
# !pip install "tensorflow-text"
# !pip install tensorflow_hub
# !pip install tf-models-official==2.7.0

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

In [None]:
import numpy as np
import pandas as pd
import re
import string

from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout ,GlobalAveragePooling1D

from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the competition's train / test frames and the sample submission in one pass.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/feedback-prize-effectiveness/train.csv',
        '/kaggle/input/feedback-prize-effectiveness/test.csv',
        '/kaggle/input/feedback-prize-effectiveness/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the sample submission to see the expected output layout.
sample_submission.head()

In [None]:
# Shapes of the train and test frames, displayed together as a tuple.
train.shape, test.shape

In [None]:
# Summary of the training frame (columns, non-null counts, dtypes).
train.info()

In [None]:
# Summary of the test frame (columns, non-null counts, dtypes).
test.info()

In [None]:
# Frequency of each discourse type and of each effectiveness label,
# printed with a blank line between the two tables.
type_counts = train['discourse_type'].value_counts()
effectiveness_counts = train['discourse_effectiveness'].value_counts()
print(type_counts, effectiveness_counts, sep='\n\n')

In [None]:
# One-hot encode the target a single time instead of rebuilding the dummy frame
# for every label column. An explicit float32 dtype keeps the targets numeric
# regardless of the pandas version's default dummy dtype (uint8 or bool).
effectiveness_dummies = pd.get_dummies(train['discourse_effectiveness'], dtype='float32')
for label_name in ('Adequate', 'Effective', 'Ineffective'):
    # Column assignment (rather than join/concat) keeps this cell safe to re-run.
    train[label_name] = effectiveness_dummies[label_name]

In [None]:
# Re-inspect the training frame now that the one-hot label columns were added.
train.head()

In [None]:
# Show one example discourse text (row label 10).
train.loc[10, 'discourse_text']

In [None]:
# Character length of that same example text.
len(train.loc[10, 'discourse_text'])

In [None]:
def custom_standardization(input_data):
    """Lower-case raw text and strip noise from it using TensorFlow string ops.

    Written to be usable as the ``standardize`` callable of a
    ``TextVectorization`` layer. Steps, in order:

    1. lower-case the text;
    2. drop ``[...]`` bracketed spans (non-greedy, within a line);
    3. remove ASCII punctuation;
    4. replace newlines with a single space so the words on either side of a
       line break are not glued together;
    5. drop every token that contains a digit.

    Args:
        input_data: string tensor (or value convertible to one) of raw text.

    Returns:
        String tensor with the cleaned text.
    """
    # Character class matching any ASCII punctuation mark.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    # `cleaned` (not `text`) so the imported `tensorflow_text as text` module is
    # not shadowed inside this function.
    cleaned = tf.strings.lower(input_data)

    # Raw strings keep the regex escapes intact and avoid Python's
    # invalid-escape-sequence warnings.
    # removing square brackets
    cleaned = tf.strings.regex_replace(cleaned, r'\[.*?\]', '')

    # removing punctuation
    cleaned = tf.strings.regex_replace(cleaned, punctuation_pattern, '')
    cleaned = tf.strings.regex_replace(cleaned, r'\n', ' ')

    # remove words containing numbers
    # (no second punctuation pass is needed: nothing after the first pass can
    # reintroduce punctuation)
    return tf.strings.regex_replace(cleaned, r'\w*\d\w*', '')

In [None]:
# max_features = 10000 # no of word in vocab
# sequence_length = 500

In [None]:
# vectorize_layer = TextVectorization(
#     standardize=custom_standardization,
#     max_tokens=max_features,
#     output_mode='int',
#     output_sequence_length=sequence_length
# )

In [None]:
# print(np.array(vectorize_layer.get_vocabulary()))
# print(len(np.array(vectorize_layer.get_vocabulary())))

In [None]:
# vectorize_layer.adapt(train['discourse_text'].values)
# vectorize_text = vectorize_layer(train['discourse_text'].values)
# vectorize_text

In [None]:
# Identifier / metadata columns and the raw label column are dropped; the text
# and the one-hot label columns stay in `train`.
columns_to_br_reomve = ['discourse_id','essay_id','discourse_type','discourse_effectiveness']
train = train.drop(columns=columns_to_br_reomve)

In [None]:
# Check which columns remain after the drop.
train.head()

In [None]:
# Targets: every column except the raw text (presumably just the three one-hot
# label columns at this point -- verify against train.columns).
y = train.drop(columns=['discourse_text'])
# Features: what is left once the three label columns are removed.
X = train.drop(columns=['Adequate', 'Effective', 'Ineffective'])

In [None]:
# Display the feature frame.
# NOTE(review): consider `X.head()` here to keep the cell output small.
X

In [None]:
# TF Hub handles for the uncased English BERT preprocessing model and the
# encoder it is paired with; both are consumed by build_classifier_model().
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/1"
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

# Single sentence used to smoke-test the freshly built classifier.
text_test = ['this is such an amazing movie!']

# The scratch wiring that used to live here called `preprocessor(text_input)`
# while the definition of `preprocessor` was commented out, so the cell raised
# NameError on a fresh kernel. The preprocessing -> encoder graph is built
# inside build_classifier_model() instead.


In [None]:
def build_classifier_model():
    """Assemble the BERT text classifier as a Keras functional model.

    Raw strings go through the TF Hub preprocessing layer and the trainable
    BERT encoder. The encoder's ``pooled_output`` is passed through dropout
    (rate 0.1) and a 3-unit linear layer, so the model emits unnormalised
    logits.

    Returns:
        tf.keras.Model mapping a batch of strings to 3 logits per example.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Both hub layers are loaded from the module-level handle constants.
    bert_preprocess = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

    pooled = bert_encoder(bert_preprocess(raw_text))['pooled_output']
    regularised = tf.keras.layers.Dropout(0.1)(pooled)
    # activation=None: the head outputs logits, not probabilities.
    logits = tf.keras.layers.Dense(3, activation=None, name='classifier')(regularised)
    return tf.keras.Model(raw_text, logits)

In [None]:
# Build the (untrained) classifier and run the sample sentence through it as a
# smoke test of the preprocessing -> encoder -> head wiring.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
# The head scores three mutually exclusive effectiveness labels, so softmax
# (not an independent per-unit sigmoid) turns the logits into a distribution.
print(tf.nn.softmax(bert_raw_result, axis=-1))

In [None]:
# Each example carries exactly one of the three one-hot labels, so this is a
# single-label multi-class problem: use softmax cross-entropy on the logits.
# Binary cross-entropy would score the three outputs as independent yes/no
# tasks, and BinaryAccuracy is inflated on one-hot targets.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.CategoricalAccuracy()

In [None]:
# The BERT encoder is trainable, so every weight is fine-tuned. The string
# shortcut optimizer='adam' selects Keras' default learning rate (1e-3), which
# is far above the 3e-5 this notebook had earmarked for fine-tuning, so the
# rate is set explicitly.
#
# Alternative: `optimization.create_optimizer(init_lr=..., num_train_steps=...,
# num_warmup_steps=..., optimizer_type='adamw')` gives AdamW with warmup/decay.
# It needs num_train_steps = steps_per_epoch * epochs, where steps_per_epoch
# comes from the number of training batches.
init_lr = 3e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=init_lr)

classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
epochs = 8
# Feed exactly what the model declares: a 1-D array of strings (the input has
# shape=()) and float32 one-hot targets. This avoids relying on Keras to squeeze
# an (N, 1) DataFrame or to cast uint8/bool dummy columns implicitly.
history = classifier_model.fit(
    X['discourse_text'].to_numpy(),
    y.to_numpy(dtype='float32'),
    epochs=epochs,
)

In [None]:
# The classifier head has no activation, so predict() returns logits; softmax
# converts them to per-class probabilities for the submission.
test_logits = classifier_model.predict(test['discourse_text'].to_numpy())
pred = tf.nn.softmax(test_logits, axis=-1).numpy()
# Output unit i corresponds to column i of the training targets `y`, so the
# probabilities are labelled with that order. A hard-coded submission-order list
# here would attach the wrong class name to every column.
prediction = pd.DataFrame(pred, columns=list(y.columns))


In [None]:
# Attach the ids and put the columns into the submission layout.
# NOTE(review): the ids come from sample_submission and are matched to the
# predictions by row index -- this presumes it lists rows in the same order as
# `test`; confirm.
titles = ['discourse_id','Ineffective', 'Adequate', 'Effective']
prediction = (
    prediction
    .assign(discourse_id=sample_submission["discourse_id"])
    .reindex(columns=titles)
)

In [None]:
# Wrap the predictions as the submission frame and write it without the index.
submission = pd.DataFrame(data=prediction)
submission.to_csv(path_or_buf='submission.csv', index=False)