In [None]:
# Turn off Jedi-based tab completion in the IPython completer.
%config Completer.use_jedi = False

In [None]:
#  Installing ktrain
!pip install ktrain

# Importing Modules

In [None]:
# Standard imports
import os
import pprint
import json
import pandas as pd
import numpy as np
from IPython.display import display

# For plotting
import plotly.express as px
import plotly.graph_objects as go

# For Evaluation and model selection 
from sklearn.model_selection import *
from sklearn.metrics import *

# For model building
import tensorflow as tf
import ktrain
from ktrain import text

# Config


In [None]:
# Locations of the competition input files, relative to the working directory.
params = {
    'train_csv': "../input/commonlitreadabilityprize/train.csv",
    'test_csv': "../input/commonlitreadabilityprize/test.csv",
    'sample_sub': "../input/commonlitreadabilityprize/sample_submission.csv",
}

# Loading dataset

In [None]:
# Read the train and test CSVs named in `params` into DataFrames.
train_df, test_df = (
    pd.read_csv(params[csv_key]) for csv_key in ('train_csv', 'test_csv')
)

In [None]:
train_df.isnull().sum()

In [None]:
test_df.isnull().sum()

In [None]:
train_df.head()

In [None]:
test_df

In [None]:
# Checking whether we have any duplicates in test and train:
# compare the row count of each frame with its number of distinct ids.
n_train_rows = len(train_df)
n_train_unique = train_df['id'].nunique(dropna=False)
print(f"Number of ids in train : {n_train_rows}")
print(f"Number of unique ids in train : {n_train_unique}")

n_test_rows = len(test_df)
n_test_unique = test_df['id'].nunique(dropna=False)
print(f"Number of ids in test : {n_test_rows}")
print(f"Number of unique ids in test : {n_test_unique}")

In [None]:
# Ids that occur in both the train and the test frame, if any.
train_ids = set(train_df['id'].values)
test_ids = set(test_df['id'].values)

common_ids = train_ids & test_ids
if common_ids:
    print(f"Common ids in train and test : {common_ids}")
else:
    print("No intersection")

# Distribution of labels

In [None]:
# Histogram of the `target` column with a centred title.
fig = px.histogram(train_df, x='target')
fig.update_layout(title={'text': "Distribution of targets", 'x': 0.5})
fig.show()

In [None]:
# Mean and standard deviation of the training labels.
target_values = train_df['target']
print(f"Mean of my labels : {np.mean(target_values)}")
print(f"Std of my labels : {np.std(target_values)}")

# Creating training and validation datasets

In [None]:
# Longest and shortest excerpt, measured in characters.
# The defaults reproduce the sentinel values used when there are no excerpts.
excerpt_lengths = [len(excerpt) for excerpt in train_df.excerpt.values]
max_ = max(excerpt_lengths, default=0)
min_ = min(excerpt_lengths, default=1e9)

max_, min_

In [None]:
def create_folds(data, target="target", num_splits = 5, seed = None):
    """Assign a stratified fold index to every row of a regression dataset.

    The continuous target is cut into equal-width bins (bin count from
    Sturges' rule) and the bin ids serve as the stratification label for
    ``StratifiedKFold``.

    Args:
        data: DataFrame holding the ``target`` column. It is not modified.
        target: Name of the continuous target column.
        num_splits: Number of folds to create.
        seed: Optional ``random_state`` for the row shuffle. ``None`` (the
            default) keeps the previous non-deterministic shuffle; pass an
            int to get the same folds on every run.

    Returns:
        A shuffled copy of ``data`` with a fresh ``0..n-1`` index and an extra
        ``kfold`` column holding the validation-fold index of each row.
    """
    # sample() returns a new frame, so the "kfold" column is only ever written
    # to the copy and never leaks into the caller's DataFrame.
    data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
    data["kfold"] = -1

    # Applying Sturges' rule to calculate the no. of bins for target
    num_bins = int(1 + np.log2(len(data)))
    bins = pd.cut(data[target], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits)
    # The index was reset above, so the positional indices yielded by split()
    # coincide with the row labels used by .loc.
    for fold, (_, val_idx) in enumerate(kf.split(X=data, y=bins.values)):
        data.loc[val_idx, "kfold"] = fold

    return data

# Split the training frame into 4 stratified folds and show the size of each.
train_df = create_folds(data=train_df, target='target', num_splits=4)
train_df['kfold'].value_counts()

In [None]:
# Splitting into train and val

# Splitting into train and val: fold 3 is held out, every other fold trains.
is_val_row = train_df.kfold == 3
train_set = train_df.loc[~is_val_row]
val_set = train_df.loc[is_val_row]

len(train_set), len(val_set)

## Splitting into features

In [None]:
# Excerpt text is the model input; the `target` column is the regression label.
X_train, y_train = train_set.excerpt.values, train_set.target.values
X_val, y_val = val_set.excerpt.values, val_set.target.values

In [None]:
# Display the training targets as a quick sanity check.
y_train

In [None]:
# Preprocess both splits with ktrain; the call yields the training data, the
# validation data and the preprocessor object reused by the model and predictor.
preprocess_options = {
    'ngram_range': 3,
    'maxlen': 512,
    'max_features': 35000,
    'preprocess_mode': 'bert',
}
trn, val, preproc = text.texts_from_array(
    x_train=X_train, y_train=y_train,
    x_test=X_val, y_test=y_val,
    **preprocess_options
)

# Let's see which regression models are available

In [None]:
# Print the text-regression model names that ktrain offers.
text.print_text_regression_models()

Let's start with the well-known BERT model

In [None]:
# BERT regression model built from the preprocessed training data.
model = text.text_regression_model('bert', train_data=trn, preproc=preproc)

# Setting our learner: wraps the model with its train/validation data.
learner_options = {
    'train_data': trn,
    'val_data': val,
    'batch_size': 6,
}
learner = ktrain.get_learner(model, **learner_options)

In [None]:
# Estimating the optimizer learning rate with the learner's LR finder;
# the result is plotted by the `lr_plot()` call that follows.

learner.lr_find()

In [None]:
# Plot the outcome of the learning-rate search to pick a rate for training.
learner.lr_plot()

# Training and Inspecting Model

In [None]:
# Train with ktrain's one-cycle policy: learning rate 1e-4 for 10 epochs.
MAX_LR = 1e-4
NUM_EPOCHS = 10
learner.fit_onecycle(MAX_LR, NUM_EPOCHS)

In [None]:
# Inspect the 3 examples on which the trained model has the largest loss.
learner.view_top_losses(n=3, preproc = preproc)

# Visualizing results

In [None]:
# Training history recorded during `fit_onecycle`.
hist = learner.history.history
train_loss = hist['loss']
val_loss = hist['val_loss']

# Keras records the MAE metric as 'mean_absolute_error' or 'mae' depending on
# the Keras/TensorFlow version, so use whichever key is actually present.
mae_key = 'mean_absolute_error' if 'mean_absolute_error' in hist else 'mae'
train_mae = hist[mae_key]
val_mae = hist[f'val_{mae_key}']

# 1-based epoch numbers for the x axis of the training curves.
epochs = list(range(1, len(train_loss) + 1))

# Learning-rate schedule and the matching iteration counter from the history.
lr = hist['lr']
iters = hist['iterations']


In [None]:
# One line+marker trace per recorded series, all plotted against the epoch number.
fig = go.Figure()
curves = {
    'train_loss': train_loss,
    'val_loss': val_loss,
    'train_mae': train_mae,
    'val_mae': val_mae,
}
for curve_name, curve_values in curves.items():
    fig.add_trace(
        go.Scatter(x=epochs, y=curve_values, mode='lines+markers', name=curve_name)
    )

fig.update_layout(
    title={'text': "Training Results", 'x': .5},
    xaxis_title="EPOCHS",
    yaxis_title="Values",
)
fig.show()

In [None]:
# Scatter plot of the recorded learning rate against the iteration counter.
lr_trace = go.Scatter(x=iters, y=lr, mode='markers', name='Learning Rate')
fig = go.Figure(data=[lr_trace])

fig.update_layout(
    title={'text': "Learning rate", 'x': .5},
    xaxis_title="Iterations",
    yaxis_title="Learning rates",
)
fig.show()

# Let's build our predictor and evaluate our model

In [None]:
# Bundle the trained model with its preprocessor so predictions can be made
# directly from raw text.
predictor = ktrain.get_predictor(learner.model, preproc)

In [None]:
# Score the whole validation fold with one batched call rather than invoking
# predict() (preprocessing + forward pass) once per excerpt.
# The result is flattened to a 1-D array with one prediction per excerpt.
val_preds = np.asarray(predictor.predict(list(X_val))).ravel()

In [None]:
# Model performance: root-mean-squared error on the validation fold.
# scikit-learn expects (y_true, y_pred). The square root is taken explicitly
# because the `squared=False` shortcut is deprecated in newer scikit-learn
# releases and removed in later ones.
val_preds_flat = np.asarray(val_preds, dtype=float).ravel()
rmse = float(np.sqrt(mean_squared_error(y_val, val_preds_flat)))
print(f"Model Score : {round(rmse,3)}")

# Saving model

In [None]:
# Create the export folder idempotently (a bare `mkdir` reports an error when
# the folder already exists, e.g. on a re-run), then save the predictor in it.
os.makedirs("./model_BERT_token_BERT_CLRP", exist_ok=True)
predictor.save("./model_BERT_token_BERT_CLRP/model")

In [None]:
!pip install kaggle

In [None]:
!cp ../input/kaggle-token/kaggle_token.json ./
!mv ./kaggle_token.json ./kaggle.json

In [None]:
!ls -l ../../root
!cp ./kaggle.json ../../root/
!ls ../../root

In [None]:
!mkdir ../../root/.kaggle
!mv ../../root/kaggle.json ../../root/.kaggle/kaggle.json

!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets init -p ./model_BERT_token_BERT_CLRP

In [None]:
# Show the dataset metadata file before it is edited.
!cat ./model_BERT_token_BERT_CLRP/dataset-metadata.json

In [None]:
import json
with open("./model_BERT_token_BERT_CLRP/dataset-metadata.json", 'r+') as file_:
    meta_data = json.load(file_)
    meta_data['title'] = 'model_BERT_token_BERT_CLRP'
    meta_data['id'] = 'hotsonhonet/ModelsCLRP'
    file_.seek(0)        
    json.dump(meta_data, file_, indent=4)
    file_.truncate()
    
print(meta_data['title'], meta_data['id'])

!cat ./model_BERT_token_BERT_CLRP/dataset-metadata.json

In [None]:
# Move the saved model files up from the `model/` subfolder into the dataset
# folder itself, next to dataset-metadata.json.
!mv ./model_BERT_token_BERT_CLRP/model/* ./model_BERT_token_BERT_CLRP

In [None]:
# Create the Kaggle dataset from the export folder, using the metadata file in it.
!kaggle datasets create -p ./model_BERT_token_BERT_CLRP 