In [None]:
import os
import string
import re
import pickle
import gzip
import nltk
import torch
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

# Environment flags for the Hugging Face stack, set before any model is loaded:
# tokenizer parallelism is switched on and the datasets/transformers libraries
# are told to work offline.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "true",
    "HF_DATASETS_OFFLINE": '1',
    "TRANSFORMERS_OFFLINE": '1',
})


In [None]:
# Install sentence-transformers from a local directory under /kaggle/input
# (a filesystem path, not a package index name).
!pip install /kaggle/input/sentence-transformers/sentence-transformers-master

In [None]:
from sentence_transformers import SentenceTransformer

# Number of cross-validation folds used by the stacking helpers below.
FOLDS = 5
# Sentence-transformer encoders to use as feature extractors.
# Maps model identifier -> (short name, embedding width):
#   * the short name selects the local cache folder and prefixes the
#     generated feature columns (`<short name>_<k>`);
#   * the width is the number of feature columns created per encoder.
STRANSFORMERS = {
    'sentence-transformers/paraphrase-mpnet-base-v2': ('mpnet', 768),
    'sentence-transformers/bert-base-wikipedia-sections-mean-tokens': ('wikipedia', 768)
}

In [None]:
def pickle_load(filename, gzipping=True):
    """Load a pickled object from disk.

    Parameters
    ----------
    filename : str or path-like
        Location of the pickle file.
    gzipping : bool, default True
        When true the file is read through ``gzip.GzipFile``; otherwise it is
        opened as a plain binary file.

    Returns
    -------
    object
        Whatever object was stored in the file.

    Notes
    -----
    SECURITY: unpickling can execute arbitrary code. Only call this on files
    that come from a trusted source.
    """
    open_f = gzip.GzipFile if gzipping else open
    with open_f(filename, 'rb') as f:
        # Return directly instead of binding the result to a local called
        # `object`, which would shadow the builtin of the same name.
        return pickle.load(f)

def get_encode(df, encoder, name):
    """Embed the ``excerpt`` column of ``df`` with a sentence-transformer model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column holding the texts to embed.
    encoder : str
        Model identifier handed to ``SentenceTransformer``.
    name : str
        Short model name; selects the cache directory
        ``/kaggle/input/huggingface-v3/hf_{name}/hf_{name}/``.

    Returns
    -------
    numpy.ndarray
        The encoder output converted to an array, one entry per row of ``df``
        in row order.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")
    model = SentenceTransformer(
        encoder,
        cache_folder=f'/kaggle/input/huggingface-v3/hf_{name}/hf_{name}/'
    )
    model.to(device)
    model.eval()
    # Pass a plain list rather than the Series: the encoder indexes its input
    # with integer positions, and on a label-indexed Series that only works
    # through pandas' deprecated positional fallback (and would silently pick
    # the wrong rows on a non-default integer index).
    texts = df['excerpt'].tolist()
    return np.array(model.encode(texts))

def get_embeddings(df, emb=None):
    """Build a frame of sentence-embedding features for ``df``.

    For every encoder registered in ``STRANSFORMERS`` (or only the one whose
    short name equals ``emb`` when ``emb`` is given) the result contains:

    * a column named after the encoder's short name holding each row's full
      embedding vector, and
    * one scalar column ``<short name>_<k>`` per embedding dimension.

    The returned frame shares ``df``'s index.
    """
    ret = pd.DataFrame(index=df.index)

    for model_id, (short_name, width) in STRANSFORMERS.items():
        selected = (not emb) or short_name == emb
        if not selected:
            continue

        # Keep the raw vectors in a single object column first ...
        ret[short_name] = list(get_encode(df, model_id, short_name))

        # ... then spread them out into one numeric column per dimension.
        component_names = [f'{short_name}_{k}' for k in range(width)]
        components = pd.DataFrame(
            ret[short_name].tolist(),
            columns=component_names,
            index=ret.index)
        ret = pd.concat([ret, components], axis=1, copy=False, sort=False)

    return ret

def create_folds(X, n_splits, seed):
    """Return stratified K-fold splits for a continuous ``target`` column.

    The target is discretised into equal-width bins (bin count chosen with
    Sturges' rule) and those bins serve as the stratification labels.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``target`` column.
    n_splits : int
        Number of folds.
    seed : int
        Random state for the shuffled splitter.

    Returns
    -------
    generator
        The ``(train_idx, test_idx)`` pairs produced by
        ``StratifiedKFold.split``.
    """
    target_frame = X[["target"]].copy()

    # Sturges' rule: 1 + log2(n) bins, rounded down.
    bin_count = int(np.floor(1 + np.log2(len(target_frame))))
    target_frame["bins"] = pd.cut(
        target_frame["target"], bins=bin_count, labels=False)

    splitter = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=seed)
    strata = target_frame["bins"].values
    return splitter.split(X=target_frame, y=strata)

def get_oof_svr(n_folds, x_train, y, x_test, seeds):
    """Fit MinMax-scaled SVR models per seed and fold; return stacked predictions.

    Parameters
    ----------
    n_folds : int
        Number of stratified folds per seed.
    x_train : pandas.DataFrame
        Training features; must contain an ``excerpt`` text column, which is
        embedded and then dropped from the model inputs.
    y : pandas.Series
        Training target aligned with ``x_train``. It is concatenated to
        ``x_train`` for ``create_folds``, which reads a ``target`` column, so
        the series is expected to be named ``target``.
    x_test : pandas.DataFrame
        Test features with the same columns as ``x_train``.
    seeds : sequence of int
        One full K-fold run is performed per seed.

    Returns
    -------
    oof_train : numpy.ndarray, shape (n_train,)
        Out-of-fold predictions for the training rows, averaged over seeds.
    oof_test : numpy.ndarray, shape (n_test,)
        Test predictions averaged over folds and then over seeds.
    models : dict
        Fitted pipelines keyed by ``(seed, fold_index)``.
    """
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]

    oof_train = np.zeros((len(seeds), ntrain))
    oof_test_skf = np.empty((len(seeds), n_folds, ntest))

    # The embeddings are computed row by row from the text alone, so they do
    # not depend on the seed or the fold. Compute them once up front instead
    # of reloading the encoders and re-embedding train and test inside every
    # seed x fold iteration. (Batch composition may change the floats by a
    # negligible amount compared with per-fold encoding.)
    train_feats = pd.concat(
        [x_train, get_embeddings(x_train)],
        axis=1, copy=False, sort=False)
    test_feats = pd.concat(
        [x_test, get_embeddings(x_test)],
        axis=1, copy=False, sort=False)

    # Model inputs: everything except the raw text and the object columns
    # that hold the un-expanded embedding vectors.
    excluded = ['excerpt'] + [spec[0] for spec in STRANSFORMERS.values()]
    columns = [col for col in train_feats.columns if col not in excluded]
    x_test_model = test_feats[columns]

    # create_folds only needs the target column; build its input once.
    fold_source = pd.concat([x_train, y], axis=1, copy=False, sort=False)

    models = {}
    for iseed, seed in enumerate(seeds):
        splits = create_folds(fold_source, n_splits=n_folds, seed=seed)
        for i, (tr_i, t_i) in enumerate(splits):
            print(f'\nSeed {seed}, Fold {i}')
            x_tr = train_feats.iloc[tr_i][columns]
            x_te = train_feats.iloc[t_i][columns]
            # The fold indices are positions, so select positionally; plain
            # y[tr_i] would go through label-based lookup on the id index.
            y_tr = y.iloc[tr_i]

            svr_pipeline = Pipeline([
                ('norm', MinMaxScaler()),
                ('classifier', SVR(C=10, kernel='rbf', gamma='auto'))
            ])
            svr_pipeline.fit(x_tr, y_tr)

            oof_train[iseed, t_i] = svr_pipeline.predict(x_te)
            oof_test_skf[iseed, i, :] = svr_pipeline.predict(x_test_model)
            models[(seed, i)] = svr_pipeline

    # Average test predictions over folds, then over seeds; average the
    # out-of-fold train predictions over seeds.
    oof_test = oof_test_skf.mean(axis=1).mean(axis=0)
    oof_train = oof_train.mean(axis=0)

    return oof_train, oof_test, models


In [None]:
# Read the competition train/test tables, using the `id` column as the index.
train_src, test_src = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        "/kaggle/input/commonlitreadabilityprize/train.csv",
        "/kaggle/input/commonlitreadabilityprize/test.csv",
    )
)

In [None]:
# Stack train on top of test so the license encoder sees every category
# that occurs in either table.
df_full = pd.concat([train_src, test_src], axis=0, sort=False, copy=True)

# Missing licenses become the literal string 'nan' before being mapped to
# integer codes.
le_license = LabelEncoder()
df_full['license'] = le_license.fit_transform(df_full['license'].fillna('nan'))

In [None]:
# Pre-trained models loaded from a gzip-compressed pickle. They are later
# looked up as `models[(seed, fold_index)]` and queried with `predict_proba`.
# NOTE(review): unpickling executes code from the file - keep this input trusted.
cb_models = pickle_load('../input/catboost-classifier-for-simple-normal-wiki-texts/models.zpkl')

def get_oof_classifier(
        models, n_folds, x_test, text_features, seeds):
    """Average positive-class probabilities over every (seed, fold) model.

    Parameters
    ----------
    models : mapping
        Fitted classifiers keyed by ``(seed, fold_index)``.
    n_folds : int
        Number of folds stored per seed.
    x_test : pandas.DataFrame
        Rows to score; wrapped in a single ``Pool``.
    text_features : list
        Names of the text columns, forwarded to ``Pool``.
    seeds : sequence
        Seeds whose models should be used.

    Returns
    -------
    numpy.ndarray, shape (n_rows,)
        Column 1 of ``predict_proba``, averaged over folds and then seeds.
    """
    pool = Pool(data=x_test, text_features=text_features)
    n_rows = x_test.shape[0]

    # One slot per (seed, fold) model and scored row.
    per_model = np.empty((len(seeds), n_folds, n_rows))
    for seed_pos, seed in enumerate(seeds):
        for fold in range(n_folds):
            probabilities = models[(seed, fold)].predict_proba(pool)
            per_model[seed_pos, fold, :] = probabilities[:, 1]

    averaged = np.zeros((n_rows))
    averaged[:] = per_model.mean(axis=1).mean(axis=0)
    return averaged

In [None]:
# Split the combined frame back into independent train/test copies,
# selecting rows by the ids of the original source tables.
train, test = (
    df_full.loc[source.index].copy()
    for source in (train_src, test_src)
)

In [None]:
# Score every excerpt with the pre-trained classifiers and store the averaged
# positive-class probability as the `wiki_sn` feature. The classifiers are
# given the excerpt under the column name `text`.
for frame in (train, test):
    frame['wiki_sn'] = get_oof_classifier(
        models=cb_models,
        n_folds=FOLDS,
        x_test=frame[['excerpt']].rename(columns={'excerpt': 'text'}),
        text_features=['text'],
        seeds=[0, 42, 888]
    )

In [None]:
# Run the seed-averaged SVR stack on the license code, the excerpt text
# (turned into embeddings inside the helper) and the wiki_sn score.
svr_inputs = ['license', 'excerpt', 'wiki_sn']
svr_oof, svr_test, models_svr = get_oof_svr(
    n_folds=FOLDS,
    x_train=train[svr_inputs],
    y=train['target'],
    x_test=test[svr_inputs],
    seeds=[0, 42, 888],
)
train['svr'] = svr_oof
test['svr'] = svr_test


In [None]:
# RMSE of the out-of-fold SVR predictions against the true targets.
# The root is taken explicitly: the `squared=False` shortcut was deprecated
# and later removed from scikit-learn, whereas this form works on every
# version. Arguments follow the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(train['target'], train['svr']))

In [None]:
# Write the test predictions as `id,target`. `rename` returns a new Series,
# so the `svr` column object held by `test` keeps its own name instead of
# being relabelled in place.
submission_s = test['svr'].rename('target')
submission_s.to_csv("submission.csv", header=True, index=True)