In [None]:
import sys
# Make the modules stored under ../input/fm-modules importable.
# NOTE(review): presumably this is where deepctr_torch (imported further down) lives — confirm.
sys.path.append('../input/fm-modules')

In [None]:
import os
import gc
import sys
import time
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedKFold
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from nltk import word_tokenize,sent_tokenize
import math

In [None]:
# Location of the competition CSV files.
data_dir = "../input/commonlitreadabilityprize/"

train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

# Untouched copy of the test frame; `test` itself is modified by later cells.
test_copy = test.copy()

# Regression target as a plain NumPy array.
target = train.loc[:, 'target'].to_numpy()

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the original spelling was a typo and had
    # no effect). It is read at interpreter start-up, so it only influences
    # child processes launched after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; seeds every visible GPU rather than just one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between
    # runs, which defeats the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False


class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    This is a dataset, not a network layer, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module`` (the previous base
    class was also never initialised via ``super().__init__()``).

    Args:
        df: DataFrame with an ``excerpt`` column holding the raw texts.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, return_tensors='pt', max_length=..., padding='max_length', truncation=True)``.
        max_len: Length every excerpt is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

In [None]:
def get_embeddings(df, path, plot_losses=True, verbose=True, batch_size=None, max_len=None):
    """Embed every excerpt in ``df`` with a pretrained transformer.

    Loads the model and tokenizer stored at ``path``, runs all excerpts
    through the model in inference mode and collects, for each excerpt, the
    vector at sequence position 0 of the model's first output.

    Args:
        df: DataFrame with an ``excerpt`` column (consumed by ``CLRPDataset``).
        path: Directory / identifier passed to ``from_pretrained``.
        plot_losses: Unused; kept so existing calls keep working.
        verbose: Unused; kept so existing calls keep working.
        batch_size: Inference batch size. Defaults to ``config['batch_size']``.
        max_len: Token length per excerpt. Defaults to ``config['max_len']``.

    Returns:
        ``np.ndarray`` with one row per excerpt, in the order of ``df``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    # Fall back to the notebook-level `config` dict only when the caller did
    # not pass explicit values, so the function is usable on its own as well.
    if batch_size is None:
        batch_size = config["batch_size"]
    if max_len is None:
        max_len = config['max_len']

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df, tokenizer, max_len)
    dl = DataLoader(ds,
                    batch_size=batch_size,
                    shuffle=False,  # keep row order aligned with `df`
                    num_workers=4,
                    pin_memory=True,
                    drop_last=False)

    embeddings = []
    with torch.no_grad():
        # Iterate the loader directly so tqdm can read its length and show
        # a total / ETA (wrapping enumerate() hid that information).
        for inputs in tqdm(dl):
            # Each item is tokenized with return_tensors='pt', so collated
            # tensors presumably arrive as (batch, 1, max_len); flatten the
            # extra axis to get (batch, max_len).
            inputs = {key: val.reshape(val.shape[0], -1).to(device) for key, val in inputs.items()}
            outputs = model(**inputs)
            # First model output, sequence position 0. No detach() needed:
            # gradients are already disabled by no_grad().
            embeddings.extend(outputs[0][:, 0].cpu().numpy())
    return np.array(embeddings)

In [None]:
# Notebook-wide settings read by get_embeddings() and seed_everything().
config = dict(batch_size=128, max_len=256, seed=42)
seed_everything(seed=config['seed'])

# For each fine-tuned encoder, embed the train frame first, then the test frame.
train_embeddings_minilm, test_embeddings_minilm = [
    get_embeddings(frame, '../input/minilm-finetuned') for frame in (train, test)
]
train_embeddings_roberta, test_embeddings_roberta = [
    get_embeddings(frame, '../input/roberta-finetuned') for frame in (train, test)
]

In [None]:
# One feature column per MiniLM embedding dimension, named f1, f2, ...
# The width is taken from the embedding array instead of a hard-coded 384,
# so the cell keeps working if a model with a different hidden size is used.
minilm_columns = [f'f{idx}' for idx in range(1, train_embeddings_minilm.shape[1] + 1)]
minilm_train = pd.DataFrame(train_embeddings_minilm, columns=minilm_columns)
minilm_test = pd.DataFrame(test_embeddings_minilm, columns=minilm_columns)

In [None]:
# RoBERTa feature names continue the numbering right after the MiniLM
# columns (f385, f386, ... for a 384-wide MiniLM). Both the offset and the
# width are derived from the data rather than hard-coded as 384 / 768.
roberta_offset = len(minilm_columns)
roberta_columns = [
    f'f{roberta_offset + idx}'
    for idx in range(1, train_embeddings_roberta.shape[1] + 1)
]
roberta_train = pd.DataFrame(train_embeddings_roberta, columns=roberta_columns)
roberta_test = pd.DataFrame(test_embeddings_roberta, columns=roberta_columns)

In [None]:
# Prepend the embedding columns to the raw frames.
# Resulting column order: RoBERTa features, MiniLM features, original columns.
train = pd.concat([roberta_train, minilm_train, train], axis=1)
test = pd.concat([roberta_test, minilm_test, test], axis=1)

In [None]:
import deepctr_torch
from sklearn.model_selection import train_test_split
from deepctr_torch.models import DeepFM
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# Categorical column fed to the model as a sparse (embedded) feature.
sparse_features = ['license']
# All embedding dimensions are dense features: MiniLM first, then RoBERTa.
dense_features = [*minilm_columns, *roberta_columns]

In [None]:
# Fit the encoder on the training licenses only and reuse that mapping for
# the test set. Re-fitting on test (as before) assigns ids independently, so
# the same license could receive different integer codes in train and test.
# Note: transform() raises ValueError if test contains a license that never
# occurred in train, which is preferable to silently mis-encoding it.
le = LabelEncoder()
train["license"] = le.fit_transform(train["license"])
test["license"] = le.transform(test["license"])

In [None]:
# Columns that are not model inputs and exist in both frames.
_non_feature_cols = ['id', 'url_legal', 'excerpt']

# `standard_error` is dropped from train only (the test drop never listed it).
train.drop(columns=_non_feature_cols + ['standard_error'], inplace=True)
test.drop(columns=_non_feature_cols, inplace=True)

In [None]:
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [None]:
# Sparse features: vocabulary size is the number of distinct values in train.
sparse_feature_columns = [
    SparseFeat(name, train[name].nunique()) for name in sparse_features
]
# Dense features: each embedding dimension is a 1-dimensional input.
dense_feature_columns = [DenseFeat(name, 1) for name in dense_features]

fixlen_feature_columns = sparse_feature_columns + dense_feature_columns

In [None]:
# The linear part and the DNN part of the model share the same feature columns.
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Hold out 20% of the training rows for validation (fixed split seed).
train_, valid_ = train_test_split(train, test_size=0.2, random_state=2020)

# The model takes a dict mapping feature name -> column, one dict per frame.
train_model_input, valid_model_input, test_model_input = (
    {name: frame[name] for name in feature_names}
    for frame in (train_, valid_, test)
)

In [None]:
# Use the first GPU when requested and available, otherwise stay on CPU.
use_cuda = True
device = 'cuda:0' if (use_cuda and torch.cuda.is_available()) else 'cpu'
if device != 'cpu':
    print('cuda ready...')

In [None]:
# Build the NFM regressor on the shared feature columns and train it with
# Adam on mean squared error.
model = NFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    device=device,
)
model.compile("adam", "mse", metrics=['mse'])

# fit() additionally sets aside 5% of the rows it is given for validation.
fit_params = dict(batch_size=20, epochs=100, verbose=2, validation_split=0.05)
history = model.fit(train_model_input, train_['target'].values, **fit_params)

In [None]:
# Score the model on the held-out validation split. The label now says
# "valid" because this number is computed on `valid_`, not on the test set.
pred_ans = model.predict(valid_model_input, batch_size=20)
valid_mse = mean_squared_error(valid_['target'].values, pred_ans)
print("valid MSE", round(valid_mse, 4))

In [None]:
# Predict targets for the test rows with the fitted model.
preds = model.predict(test_model_input, batch_size=20)

In [None]:
# Attach predictions to the untouched test copy, which still has the `id` column.
test_copy['prediction'] = preds

# The submission file has exactly two columns: id and target.
submission = (
    test_copy[['id', 'prediction']]
    .rename(columns={'prediction': 'target'})
)
submission.to_csv('submission.csv', index=False)
submission.head()