In [None]:
import numpy as np
import pandas as pd 
from transformers import BertTokenizer


import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D


# plots and images
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image


#sklearn processing
from sklearn.model_selection import KFold

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the three competition CSV files in one pass; the unpacking order
# matches the order of the paths below.
sample_submission_raw, train_data_raw, test_data = map(
    pd.read_csv,
    (
        '/kaggle/input/feedback-prize-english-language-learning/sample_submission.csv',
        '/kaggle/input/feedback-prize-english-language-learning/train.csv',
        '/kaggle/input/feedback-prize-english-language-learning/test.csv',
    ),
)

In [None]:
# Split the raw training data into 5 folds and persist each train/test pair
# to /kaggle/working as train_partition_<k>.csv / test_partition_<k>.csv.
# NOTE(review): KFold is used with its defaults here (no shuffle argument is
# passed), so folds follow the row order of train.csv.
kf = KFold(n_splits=5)
kfolds_data = kf.split(train_data_raw)

# enumerate(..., start=1) numbers the partitions 1..5 without a manual counter.
for partition, (train_index, test_index) in enumerate(kfolds_data, start=1):
    # Positional selection keeps every column's original dtype; going through
    # `.values` would coerce the whole frame to a single object-dtype array.
    # The index is reset so the saved rows are numbered from 0, as before.
    train = train_data_raw.iloc[train_index].reset_index(drop=True)
    test = train_data_raw.iloc[test_index].reset_index(drop=True)

    # The index is deliberately written out as the first CSV column; the cell
    # that reloads a partition strips that column again.
    train.to_csv(f'/kaggle/working/train_partition_{partition}.csv')
    test.to_csv(f'/kaggle/working/test_partition_{partition}.csv')

In [None]:
# Work with fold 1 only. The partition files were saved with the DataFrame
# index as their first column, so `.iloc[:, 1:]` drops that column and keeps
# just the original data columns.
train = pd.read_csv('/kaggle/working/train_partition_1.csv').iloc[:,1:]
test = pd.read_csv('/kaggle/working/test_partition_1.csv').iloc[:,1:]

In [None]:
# Pretrained cased BERT word-piece tokenizer; only used to map text to ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# First training essay, kept around for ad-hoc inspection.
essay = train.loc[0, 'full_text']

# Length of the longest training essay, used later as the padding length.
# NOTE(review): `len` is applied to the raw text, so this is a character
# count rather than a token count -- presumably a loose upper bound on the
# tokenised length; confirm this is intended.
longest_essay_length = train['full_text'].map(len).max()

def tokenize(essays, max_length=None):
    """Encode each essay into a fixed-length list of token ids.

    Args:
        essays: Iterable of essay strings to encode.
        max_length: Length every encoded sequence is padded or truncated to.
            When omitted, the notebook-level ``longest_essay_length`` is used.

    Returns:
        A list containing one list of token ids per essay, in input order.
    """
    if max_length is None:
        max_length = longest_essay_length
    return [
        tokenizer.encode(
            text,
            padding='max_length',
            max_length=max_length,
            truncation=True,
            # If True, [CLS] and [SEP] are added to every encoded essay.
            add_special_tokens=False,
        )
        # Iterate over the argument, not the global training frame, so the
        # function can also encode validation/test essays.
        for text in essays
    ]

# Tokenise the training essays and pack the padded id lists into one tensor
# (one row per essay).
encoded_essays = tf.convert_to_tensor(tokenize(train['full_text']))

# Standardise the cohesion score (subtract the mean, divide by the standard
# deviation) and use it as the regression target.
cohesion = train['cohesion']
cohesion_normalized = (cohesion - cohesion.mean()) / cohesion.std()
encoded_score = tf.convert_to_tensor(cohesion_normalized)

In [None]:
# Display the standardised cohesion target as a quick sanity check.
cohesion_normalized

In [None]:
# Embedding table size and width.
# NOTE(review): the "+ 2" headroom above the tokenizer vocabulary is not
# explained anywhere in this notebook -- confirm it is still needed.
vocab_size_train = tokenizer.vocab_size + 2
embedding_dim = 512 * 3

# (filters, kernel_size) of each convolutional stage, in the order applied.
conv_stages = [(64, 3), (32, 4), (16, 4)]

model = tf.keras.Sequential()

# Input layer: token-id embeddings. The weights are randomly initialised and
# learned during training, like any other layer.
model.add(Embedding(
    vocab_size_train,
    embedding_dim,
    name="embedding"))

# Each stage is Conv1D -> Dropout -> MaxPooling1D, with progressively fewer
# filters.
for filters, kernel_size in conv_stages:
    model.add(Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        padding='same',
        activation='relu'))
    model.add(tf.keras.layers.Dropout(rate=0.5))
    model.add(MaxPooling1D())

# Collapse the sequence axis by averaging before the output layer.
model.add(GlobalAveragePooling1D())

# Output layer: a single linear unit, because the target is a continuous
# (standardised) score rather than a class label.
model.add(Dense(1, activation=None))

# print model summary
model.summary()

In [None]:
# Regression set-up: optimise mean squared error with Adam and report RMSE.
# The metric name below is the key under which Keras stores it in the
# training history (with a "val_" prefix for the validation values).
rmse_metric = tf.keras.metrics.RootMeanSquaredError(name='root_mean_squared_error')

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='mse',
    metrics=[rmse_metric],
)

In [None]:
# Train for 30 epochs in mini-batches of 64 essays, holding out 20% of the
# examples for validation; `hist` keeps the per-epoch metrics for plotting.
hist = model.fit(
    encoded_essays,
    encoded_score,
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Predict on the training essays themselves. The model was fitted on the
# standardised cohesion target, so these outputs are in the same
# standardised units, not raw cohesion scores.
model.predict(encoded_essays)

In [None]:
# Display the standardised target tensor the model was trained against.
encoded_score

In [None]:
# Per-epoch metrics recorded by model.fit.
history = hist.history

# One panel per tracked quantity: (history key, panel title). Each panel
# overlays the training curve and the matching "val_"-prefixed validation
# curve. The figure uses a 1x3 grid of which the first two slots are filled.
fig = plt.figure(figsize=(16, 4))
panels = [('loss', 'Loss'), ('root_mean_squared_error', 'RMSE')]

for slot, (metric, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 3, slot)
    ax.plot(history[metric], lw=2, color='darkgoldenrod')
    ax.plot(history[f'val_{metric}'], lw=2, color='indianred')
    ax.legend(['Train', 'Validation'], fontsize=10)
    ax.set_xlabel('Epochs', size=10)
    ax.set_title(title)