In [None]:
import sys

# Extend the module search path with the directory that presumably holds the
# offline AutoKeras package imported later as `ak` (TODO confirm contents).
# Guarded so that re-running this cell does not append duplicate entries.
if '../input/autokeras-snigdha' not in sys.path:
    sys.path.append('../input/autokeras-snigdha')

In [None]:
import os
import gc
import sys
import time
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedKFold
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
#Models
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from lightgbm import LGBMRegressor
import autokeras as ak
from sklearn.metrics import mean_squared_error

In [None]:
# Directory holding the input CSV files (trailing slash kept on purpose).
data_dir = '../input/commonlitreadabilityprize/'

# Read the three tables in a fixed order: train, test, sample submission.
train, test, sample_submission = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Values of the 'target' column as a NumPy array (used as the regression labels).
target = train['target'].to_numpy()

In [None]:
def _join_text_fields(df, columns=('excerpt', 'url_legal', 'license')):
    """Join the given text columns of ``df`` row by row with single spaces.

    Missing values (NaN/None) are skipped instead of being stringified, so a
    row without ``url_legal``/``license`` yields just the excerpt rather than
    ``"<excerpt> nan nan"``.
    NOTE(review): assumes the metadata columns can contain missing values —
    confirm against the CSV files.

    Args:
        df: DataFrame containing every column listed in ``columns``.
        columns: Column names to concatenate, in order.

    Returns:
        list[str]: One joined string per row, in row order.
    """
    rows = zip(*(df[col] for col in columns))
    return [' '.join(str(value) for value in row if pd.notna(value)) for row in rows]


# Text fed to the tokenizer: excerpt plus any available source metadata.
train['feature'] = _join_text_fields(train)
test['feature'] = _join_text_fields(test)

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only takes
    # effect in Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats determinism, so it has to be switched off.
    torch.backends.cudnn.benchmark = False


class CLRPDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one entry of ``df['feature']`` per item.

    Derives from ``torch.utils.data.Dataset`` (not ``nn.Module``): the class
    holds no parameters and is only ever consumed by a ``DataLoader``.

    Args:
        df: DataFrame with a ``'feature'`` column containing the input text.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.excerpt = df['feature'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output for the text at position ``idx``."""
        # The tokenizer result is returned unchanged; with return_tensors='pt'
        # a Hugging Face tokenizer presumably yields tensors with a leading
        # dimension of 1 per item, which the consumer has to flatten.
        return self.tokenizer(self.excerpt[idx],
                              return_tensors='pt',
                              max_length=self.max_len,
                              padding='max_length',
                              truncation=True)

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.excerpt)
    

In [None]:
def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Embed every row of ``df['feature']`` with a pretrained transformer.

    Loads the model and tokenizer from ``path``, runs inference without
    gradients, and keeps, for each input, the vector at sequence position 0
    of the model's first output (for Hugging Face base models this is
    presumably the last hidden state of the first token — TODO confirm).

    Reads ``config['max_len']`` and ``config['batch_size']`` from the
    module-level ``config`` dict, which must be defined before calling.

    Args:
        df: DataFrame with a ``'feature'`` text column (see ``CLRPDataset``).
        path: Directory or identifier accepted by ``from_pretrained``.
        plot_losses: Unused; accepted for backward compatibility.
        verbose: When true (default), print the device in use and show a
            progress bar.

    Returns:
        np.ndarray: One embedding row per input row, in input order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if verbose:
        print(f"{device} is used")

    model = AutoModel.from_pretrained(path)
    tokenizer = AutoTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    ds = CLRPDataset(df, tokenizer, config['max_len'])
    dl = DataLoader(ds,
                    batch_size=config["batch_size"],
                    shuffle=False,  # keep embeddings aligned with the rows of df
                    num_workers=4,
                    # Pinned host memory only helps host-to-GPU copies.
                    pin_memory=(device.type == "cuda"),
                    drop_last=False)

    embeddings = []
    with torch.no_grad():
        # Iterating the loader directly lets tqdm read its length and show a total.
        for inputs in tqdm(dl, disable=not verbose):
            # Collapse everything after the batch axis into one axis so each
            # tensor is 2-D before it is handed to the model.
            inputs = {key: val.reshape(val.shape[0], -1).to(device)
                      for key, val in inputs.items()}
            outputs = model(**inputs)
            embeddings.extend(outputs[0][:, 0].cpu().numpy())
    return np.array(embeddings)

In [None]:
# Settings read by get_embeddings (batch size, tokenizer length) plus the global seed.
config = dict(batch_size=128, max_len=256, seed=42)
seed_everything(seed=config['seed'])

# Embed train first, then test, with each backbone (RoBERTa, then MiniLM).
train_embeddings_roberta, test_embeddings_roberta = (
    get_embeddings(frame, '../input/roberta-base') for frame in (train, test)
)
train_embeddings_minilm, test_embeddings_minilm = (
    get_embeddings(frame, '../input/minilm-base') for frame in (train, test)
)

In [None]:
# Place the RoBERTa and MiniLM embeddings side by side so every row carries
# the features of both models.
train_embeddings = np.hstack([train_embeddings_roberta, train_embeddings_minilm])
test_embeddings = np.hstack([test_embeddings_roberta, test_embeddings_minilm])

In [None]:
# Architecture search over the stacked embeddings: 10 trials, 20 epochs each
# fit call; overwrite=True is passed so any earlier search project is replaced.
auto_reg = ak.StructuredDataRegressor(
    max_trials=10,
    overwrite=True,
)
auto_reg.fit(
    train_embeddings,
    target,
    epochs=20,
)

# Predictions for the test rows, consumed when building the submission.
preds_ak = auto_reg.predict(test_embeddings)

In [None]:
# The regressor presumably returns predictions with shape (n_rows, 1); flatten
# explicitly so the column assignment never depends on how pandas treats a
# 2-D array (a wrong number of predictions then fails with a length error).
test['prediction'] = np.ravel(preds_ak)

# Two-column submission file: row id and predicted target.
submission = pd.DataFrame({
    'id': test['id'],
    'target': test['prediction'],
})
submission.to_csv('submission.csv', index=False)
submission.head()