## CommonLit | BERT + tuned SVR

- Encoder: Pre-trained BERT model (bert-base-uncased)
- Decoder: SVR regression model (hyperparameters tuned using 5-fold CV)

In [None]:
import keras
import pandas as pd
from tqdm import tqdm
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.svm import SVR
import numpy as np

### Load data

In [None]:
# Read the competition's train and test splits from the ../input directory.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
# Last expression of the cell: displays the first rows of the training frame.
train.head()

### Prepare train data (tokenize)

In [None]:
# load the tokenizer
# model_path points at a local copy of bert-base-uncased; the same path is
# reused further down to load the encoder weights.
model_path = "../input/huggingface-bert/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
# Tokenize each training excerpt separately, so every entry of train_data is
# the encoding of exactly one excerpt, returned as TensorFlow tensors.
# padding="max_length" is used without an explicit max_length -- presumably
# this pads to the model's maximum input length; truncation=True cuts longer
# excerpts.
train_data = [
    tokenizer(excerpt, padding="max_length", truncation=True, return_tensors="tf")
    for excerpt in tqdm(train['excerpt'].tolist())
]

### Encoder (train data)

In [None]:
# load the model
# TensorFlow BERT encoder, loaded from the same local path as the tokenizer.
# In the cells visible here it is only called for forward passes (no training).
BERTmodel = TFBertModel.from_pretrained(model_path)

In [None]:
# Embed the training excerpts.
# Each element of train_data encodes a single excerpt, so
# last_hidden_state[0][0] picks the hidden vector of the first token of that
# one sequence (the [CLS] position for BERT tokenizers).
# The vector is converted to a NumPy array immediately: scikit-learn works on
# NumPy data, and a list of TensorFlow tensors would otherwise have to be
# coerced tensor by tensor when it is passed to GridSearchCV / SVR.
train_sent_embedding = [
    BERTmodel(x).last_hidden_state[0][0].numpy() for x in tqdm(train_data)
]

### Decoder pre-requisite: Hyperparameter tuning

- Use SVR regression.
- Tune the hyperparameters with 5-fold cross-validation using `GridSearchCV`.

In [None]:
# import
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

# Search space: every combination of the values below is evaluated.
parameters = [{
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'gamma': [1e-4, 1e-3, 1e-2],
    'C': [1, 10, 100],
    'epsilon': [10, 1, 0.1, 0.01, 0.001],
}]

# Candidates are scored by mean squared error. greater_is_better=False makes
# the scorer report the negated MSE, so "higher is better" still holds for
# the grid search.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Exhaustive 5-fold cross-validated search over the grid, with verbose
# progress output.
svr_gs = GridSearchCV(
    estimator=SVR(),
    param_grid=parameters,
    scoring=scorer,
    cv=5,
    n_jobs=None,
    verbose=10,
)
svr_gs.fit(train_sent_embedding, train['target'])

In [None]:
# Report the cross-validated score of every hyperparameter combination.
print("Grid scores on training set:")
means = svr_gs.cv_results_['mean_test_score']
stds = svr_gs.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, svr_gs.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# One row per combination. The scorer reports negated MSE, so abs() turns the
# mean score back into a loss where smaller is better.
parameter_result = pd.DataFrame([
    {'mean': abs(fold_mean), 'std': fold_std, **grid_point}
    for fold_mean, fold_std, grid_point in zip(
        means, stds, svr_gs.cv_results_['params']
    )
])

# Order by ascending loss and keep the first row as a plain dict.
parameter_result = parameter_result.sort_values(by=['mean'])
best_settings = parameter_result.head(1).to_dict(orient='records')[0]

### Decoder 

- Create the SVR regression model with the best parameters identified in the hyperparameter tuning above

In [None]:
# Build the final regressor from the winning grid-search settings.
# best_settings also carries the 'mean' and 'std' columns, so only the four
# SVR hyperparameters are forwarded.
model = SVR(**{
    hyperparameter: best_settings[hyperparameter]
    for hyperparameter in ('C', 'epsilon', 'gamma', 'kernel')
})
# Fit on the complete training set (all embeddings, all targets).
model.fit(train_sent_embedding, train['target'])

### Test

In [None]:
# Tokenize the test excerpts one by one with the same tokenizer arguments
# that were used for the training excerpts.
test_data = [
    tokenizer(excerpt, padding="max_length", truncation=True, return_tensors="tf")
    for excerpt in tqdm(test['excerpt'].tolist())
]

In [None]:
# Embed the test excerpts (same procedure as for the training excerpts).
# Each element of test_data encodes a single excerpt, so
# last_hidden_state[0][0] picks the hidden vector of the first token of that
# one sequence (the [CLS] position for BERT tokenizers).
# Converting to NumPy here keeps plain arrays in the list instead of
# TensorFlow tensors, which is what the scikit-learn model consumes.
test_sent_embedding = [
    BERTmodel(x).last_hidden_state[0][0].numpy() for x in tqdm(test_data)
]

In [None]:
# perform predictions
# Stack the per-excerpt embedding vectors into one 2-D array (one row per
# test excerpt) and let the fitted SVR predict a target for each row.
y_pred = model.predict(np.array(test_sent_embedding))

### Submission

In [None]:
# Start the submission frame from the test ids only; .copy() gives an
# independent frame so a column can be added without touching `test`.
sub = test[['id']].copy()

In [None]:
# Attach the predictions (y_pred follows the row order of `test`) and write
# the submission file without the DataFrame index column.
sub['target'] = y_pred
sub.to_csv("submission.csv", index=False)