# overview

For the Kaggle competition [CommonLit Readability Prize](https://www.kaggle.com/c/commonlitreadabilityprize).

    Enable (uncomment) the training section to train a model.
    Enable (uncomment) the submission section to predict on the test set.

    This notebook ranks quite low on the leaderboard -_-!, but I hope its structure is still helpful to you.
    The only thing you need to change is the get_model function.
    You can then train your own model using the preprocessing, K-fold, and inference code provided here.
    Thank you!

# 1. load data

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

In [None]:
# Read the competition's train and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/commonlitreadabilityprize/train.csv',
        '../input/commonlitreadabilityprize/test.csv',
    )
)

In [None]:
# Preview the first rows of the training data.
df_train.head()

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Print column dtypes and non-null counts for the training data.
df_train.info()

In [None]:
# Print column dtypes and non-null counts for the test data.
df_test.info()

In [None]:
# Plot the distribution of the number of whitespace-separated words per
# training excerpt.
words = df_train['excerpt'].str.split().apply(len)
plt.figure(figsize=(10, 5))
# The histogram needs a label: legend() with no labelled artists only emits
# a warning and draws nothing.
plt.hist(words, alpha=0.8, bins=15, label='train excerpts')
plt.legend(loc='best')
plt.xlabel('Count of words')
plt.ylabel('Count')
plt.title('Count of words in excerpt')
plt.show()

# 2. preprocess data

In [None]:
import re

In [None]:
def clean_text(txt):
    """Normalise an excerpt for modelling.

    Steps: replace every character that is not an ASCII letter with a space,
    lowercase, tokenize with NLTK, drop English stopwords, lemmatize each
    remaining token with WordNet, and join the tokens with single spaces.

    Args:
        txt: Raw text string.

    Returns:
        The cleaned text as a single space-separated string.

    NOTE(review): relies on the NLTK tokenizer, stopword and WordNet data
    being available in the environment -- confirm they are downloaded.
    """
    txt = re.sub(r"[^a-zA-Z]", " ", txt)
    txt = txt.lower()

    # Build the stopword set once per call; constructing it inside the
    # comprehension's filter would repeat that work for every token.
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in nltk.word_tokenize(txt) if word not in stop_words]

    lemma = nltk.WordNetLemmatizer()
    return " ".join(lemma.lemmatize(word) for word in tokens)

# df_train['excerpt'] = df_train['excerpt'].apply(lambda x: clean_text(x))
# df_test['excerpt'] = df_test['excerpt'].apply(lambda x: clean_text(x))

In [None]:
# Preview the training data again. The clean_text application to the
# DataFrame is commented out in this notebook, so clean_text has not been
# applied to the 'excerpt' column at this point.
df_train.head()

# 3. Tokenization

# 4. Train model

In [None]:
from sklearn.model_selection import KFold
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, RobertaTokenizer, TFRobertaForSequenceClassification

In [None]:
# Training hyper-parameters shared by the model definition and the training loop.
MAXLEN = 512  # token sequence length: model input shape and tokenizer padding/truncation length
BATCH_SIZE = 3  # batch size passed to model.fit
EPOCHS = 20  # number of epochs passed to model.fit
LR = 1e-5  # learning rate given to the Adam optimizer
N_SPLITS = 5  # number of K-fold splits

def get_model(bert_model):
    """Wrap a pretrained transformer in a compiled Keras regression model.

    Args:
        bert_model: A model callable with ``input_ids`` and ``attention_mask``
            keyword arguments whose result can be indexed with ``[0]``
            (presumably the head's logits for the Hugging Face
            ``TF*ForSequenceClassification`` classes used in this notebook --
            confirm against the transformers documentation).

    Returns:
        A ``tf.keras`` model that takes ``input_ids`` and ``attention_mask``
        inputs of length ``MAXLEN``, ends in a single-unit Dense layer, and is
        compiled with mean-squared-error loss, Adam at learning rate ``LR``,
        and a root-mean-squared-error metric.
    """
    input_ids = tf.keras.layers.Input(shape=(MAXLEN,), dtype='int32', name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(MAXLEN,), dtype='int32', name='attention_mask')
#     token_type_ids = tf.keras.layers.Input(shape=(MAXLEN, ), dtype='int32', name='token_type_ids')

    features = bert_model(input_ids=input_ids, attention_mask=attention_mask)[0]

    # Linear output: a regression head has to be able to emit negative values.
    # A ReLU here would clamp every prediction to >= 0 and stop gradients
    # whenever the unit is inactive.
    # NOTE(review): assumes the readability targets include negative values --
    # confirm against train.csv.
    outputs = tf.keras.layers.Dense(
        1,
        use_bias=True,
        activation='linear',
        kernel_regularizer=tf.keras.regularizers.l2(0.001),
    )(features)

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=outputs)
    model.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

In [None]:
# Fold indices to train in this run. Only fold 0 is trained by default;
# extend the tuple (e.g. tuple(range(N_SPLITS))) to train the other folds.
TRAIN_FOLDS = (0,)

train = df_train['excerpt']
targets = df_train['target']

kfold = KFold(N_SPLITS, shuffle=True, random_state=2021)
for fold, (train_idx, val_idx) in enumerate(kfold.split(train)):
    if fold not in TRAIN_FOLDS:
        continue

    # get data
    # KFold yields positional indices, so select rows by position (.iloc)
    # rather than by index label.
    X_train = train.iloc[train_idx]
    X_val = train.iloc[val_idx]
    y_train = tf.constant(targets.iloc[train_idx].to_numpy(), dtype=tf.float32)
    y_val = tf.constant(targets.iloc[val_idx].to_numpy(), dtype=tf.float32)

    # process data
    X_train = [clean_text(x) for x in X_train]
    X_val = [clean_text(x) for x in X_val]

    # get model (a fresh tokenizer and pretrained backbone for each fold)
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    bert_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    model = get_model(bert_model)
    # model.load_weights('../input/commonlit-readability-model/checkpoint/variables/variables')
    model.summary()

    # tokenize data: pad/truncate every excerpt to MAXLEN and keep only the
    # two tensors the model declares as inputs
    X_train = tokenizer(X_train, padding="max_length", max_length=MAXLEN, return_tensors='tf', truncation=True)
    X_val = tokenizer(X_val, padding="max_length", max_length=MAXLEN, return_tensors='tf', truncation=True)
    X_train = {"input_ids": X_train['input_ids'], "attention_mask": X_train['attention_mask']}
    X_val = {"input_ids": X_val['input_ids'], "attention_mask": X_val['attention_mask']}

    # train model
    # Pass a flat list of callbacks; save_best_only keeps only the best model
    # according to the callback's monitored quantity (Keras default).
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            f'Fold{fold}/checkpoint', save_weights_only=False, save_best_only=True
        )
    ]
    model.fit(
        X_train,
        y_train,
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        callbacks=callbacks,
    )

    # save model: tokenizer and backbone in Hugging Face format, alongside
    # the Keras checkpoint written during training
    tokenizer.save_pretrained(f'Fold{fold}/tokenizer/')
    bert_model.save_pretrained(f'Fold{fold}/distil_bert/')

# 5. Save model

# 6. Submission

In [None]:
# test = df_test['excerpt']
# X_test = [clean_text(x) for x in test]
# X_test[0]

In [None]:
# result = 0
# for i in range(N_SPLITS):
#     # get model
#     tokenizer = RobertaTokenizer.from_pretrained(f'../input/commonlit-readability-model/robert-large/Fold{i}/tokenizer/')
#     bert_model = TFRobertaForSequenceClassification.from_pretrained(f'../input/commonlit-readability-model/robert-large/Fold{i}/distil_bert')

#     model = get_model(bert_model)
#     if i != 1: model.load_weights(f'../input/commonlit-readability-model/robert-large/Fold{i}/checkpoint/variables/variables')
#     else: model.load_weights(f'../input/commonlit-readability-model/robert-large/Fold{i}/checkpoint/variables')
#     # tokenize data
#     X_test_token = tokenizer(X_test, padding="max_length", max_length=MAXLEN, return_tensors='tf', truncation=True)
#     X_test_token = {"input_ids": X_test_token['input_ids'], "attention_mask": X_test_token['attention_mask']}
    
#     # predict
#     result += model.predict(X_test_token)

# # result /= N_SPLITS

In [None]:
# # result = 0
# for i in range(N_SPLITS):
#     # get model
#     tokenizer = DistilBertTokenizer.from_pretrained(f'../input/commonlit-readability-model/distil/Fold{i}/tokenizer/')
#     bert_model = TFDistilBertForSequenceClassification.from_pretrained(f'../input/commonlit-readability-model/distil/Fold{i}/distil_bert')

#     model = get_model(bert_model)
#     model.load_weights(f'../input/commonlit-readability-model/distil/Fold{i}/checkpoint/variables/variables')
    
#     # tokenize data
#     X_test_token = tokenizer(X_test, padding="max_length", max_length=MAXLEN, return_tensors='tf', truncation=True)
#     X_test_token = {"input_ids": X_test_token['input_ids'], "attention_mask": X_test_token['attention_mask']}
    
#     # predict
#     result += model.predict(X_test_token)

# result /= (N_SPLITS*2)

In [None]:
# submission_df = pd.DataFrame({'id': df_test.id, 'target': 0})
# submission_df.target = result

# submission_file = 'submission.csv'
# submission_df.to_csv(submission_file, index=False)

# submission_df