In [None]:
# https://www.kaggle.com/datafan07/eda-simple-bayesian-ridge-with-sentence-embeddings/notebook#Meta-Features
# https://www.kaggle.com/vigneshbaskaran/commonlit-pytorch-vs-sklearn-regression?scriptVersionId=63518123

# Objective
The objective of this notebook is to set up a strong, quick-to-build transformer baseline using the Sentence Transformers library and scikit-learn. The planned steps are:
1. Use the Sentence Transformers library and scikit-learn regression modules
2. Replace Sentence Transformers with the native Hugging Face module (no specific reason to do this; I just want to learn)
3. Replace the scikit-learn regression with a PyTorch regression
4. Fine-tune the transformer model
5. Replace RMSE with a ranking loss

In [None]:
%%capture
# Install sentence-transformers from the local directory below (a path, not a
# package name, so pip installs from disk). %%capture hides pip's output.
! pip install  '../input/commonlit-data-download/sentence-transformers'

In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge

In [None]:
# Seed passed as `random_state` to train_test_split so the train/validation
# split is the same on every run.
RANDOM_STATE = 41

In [None]:
# Root directory holding the competition CSV files; the train and test
# file locations are derived from it.
COMPETITION_DATA_PATH = Path('../input/commonlitreadabilityprize')
TRAIN_DATA_PATH = COMPETITION_DATA_PATH.joinpath('train.csv')
TEST_DATA_PATH = COMPETITION_DATA_PATH.joinpath('test.csv')

In [None]:
# Read the competition CSVs.
full_train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

# Hold out 10% of the training rows for validation; the split is seeded
# with RANDOM_STATE so it is reproducible.
train_data, valid_data = train_test_split(
    full_train_data,
    test_size=0.1,
    random_state=RANDOM_STATE,
)

# Report the size of each split.
print(f'Length of training data: {len(train_data)}')
print(f'Length of validation data: {len(valid_data)}')
print(f'Length of test data: {len(test_data)}')

# Sentence transformer feature extraction

In [None]:
# Load the sentence-transformer model from a local directory. create_features
# uses this module-level `model` to encode text.
model = SentenceTransformer('../input/commonlit-data-download/paraphrase-distilroberta-base-v2')

def create_features(text_excerpts):
    """Encode text excerpts with the module-level sentence-transformer ``model``.

    Args:
        text_excerpts: The texts to embed. Callers in this notebook pass a
            list of strings.

    Returns:
        The result of ``model.encode(text_excerpts)``, used downstream as the
        regression feature matrix.
    """
    return model.encode(text_excerpts)

def create_targets(targets):
    """Convert an array of target values into a float32 column vector.

    Args:
        targets: NumPy array of target values (any shape; it is flattened
            into a single column).

    Returns:
        A new ``np.float32`` array of shape ``(n, 1)`` holding the same values.
    """
    as_float32 = targets.astype(np.float32)
    return as_float32.reshape(-1, 1)

In [None]:
# Embed the excerpts of each split to get the feature matrices.
train_excerpts = train_data['excerpt'].tolist()
valid_excerpts = valid_data['excerpt'].tolist()
X_train = create_features(train_excerpts)
X_valid = create_features(valid_excerpts)

# Targets as float32 column vectors, one row per excerpt.
y_train = create_targets(train_data['target'].to_numpy())
y_valid = create_targets(valid_data['target'].to_numpy())

# Sklearn Regression

In [None]:
# Fit a ridge regression with scikit-learn's default hyperparameters on the
# sentence embeddings, then predict the held-out validation split.
regressor = Ridge().fit(X_train, y_train)
y_valid_pred = regressor.predict(X_valid)

# mean_squared_error returns the *mean squared* error, so take the square root
# to report the RMSE that the message below announces. np.sqrt is used rather
# than a scikit-learn keyword/helper so this works on any scikit-learn version.
# Arguments follow scikit-learn's (y_true, y_pred) order.
error = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(f'Root mean squared error: {error:.3f}')

# Make submission

In [None]:
# Embed the test excerpts and predict with the fitted regressor.
test_excerpts = test_data['excerpt'].tolist()
X_test = create_features(test_excerpts)
test_data['target'] = regressor.predict(X_test)

# Write only the id and predicted target columns, without the index.
submission = test_data[['id', 'target']]
submission.to_csv('submission.csv', index=False)