In [1]:
#!pip install -U transformers
#!pip install -U datasets
#!pip install optuna
import os
import sys
HOME = os.path.abspath('..')
sys.path.append(HOME)
os.chdir(HOME)
import pandas as pd
#!pip install transformers
from transformers import RobertaConfig, RobertaModel,RobertaForSequenceClassification, Trainer,AutoModelForSequenceClassification, EarlyStoppingCallback 
from transformers import AutoTokenizer
from transformers.models.roberta import RobertaPreTrainedModel
import torch
from torch import nn
from transformers import TrainingArguments
import glob
import optuna

In [2]:
MODEL_NAME =  "distilbert-base-uncased" #"roberta-base" 
TARGET_COL = 'averageRating'
MODEL_FOLDER = 'only_text_features'
FINAL_MODEL_NAME = f"{MODEL_NAME}-{TARGET_COL}"
CATEGORIES_AS_TEXT = False
NUMERIC_AS_TEXT = False
DATE_AS_TEXT = False
text_input_col = 'text_input'
COLAB = False
DEBUG = False

if COLAB == True:
  if not os.path.exists('data'):
    os.mkdir('data')
  if not os.path.exists('data/processed'):
    os.mkdir('data/processed')

  drive.mount('/content/gdrive/')
  for filename in glob.glob(os.path.join('gdrive/MyDrive/atdl', '*.*')):
      shutil.copy(filename, 'data/processed')

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def process_text_data(data_:pd.DataFrame,text_col,padding ="max_length", truncation = True, na_filler = ""):

    '''
    
    '''
    data = data_.copy()
    data[text_col] = data[text_col].fillna(na_filler)
    encodings = tokenizer(data[text_col].tolist(), padding=padding, truncation=truncation)
    return encodings
    

def columns_to_single_text(df,cols_to_transform,new_col_name = 'text_input',sep = tokenizer.sep_token,nan_replacement = tokenizer.unk_token ):

  '''
  
  Creates a new column called new_col_name with with all columns in cols_to_transform concatenated into a single text
  '''
  df[new_col_name] = df[cols_to_transform].astype(str).replace('nan',nan_replacement).agg(f' {sep} '.join, axis=1)


class NAFiller:

  def __init__(self,train):
    self.train = train

  def fit(self,column = 'Budget',groupby=['top_genre','top_country']):
    self.mapping = self.train.groupby(groupby)[column].median().reset_index()
    self.mapping = self.mapping.rename(columns={column:'na_filler'})
    self.median = self.train[column].median()
    self.column=column


  def transform(self,test,round = False):
    self.na_filler = test.merge(self.mapping,how='left')['na_filler']
    self.na_filler = self.na_filler.fillna(self.median)

    test[self.column] = test[self.column].reset_index(drop=True).fillna(self.na_filler).values

    if round:
      test[self.column].round().astype(int)
      


  def fit_transform(self,test,column = 'Budget',groupby=['top_genre','top_country']):
    self.fit(column,groupby)
    self.transform()
    self.column=column
        

def create_dataset_split(split,text_cols,text_input_col,TARGET_COL):

  #If all columns in text_cols are combined into a single text. A n
  columns_to_single_text(split,text_cols)

  #Get split encodings
  split_encodings = process_text_data(split,text_input_col)

  #get labels
  split_labels = split[TARGET_COL].tolist()

  #Create dataset objects
  split_dataset = IMDbDataset(split_encodings, split_labels)

  return split_dataset

In [5]:
all_cols =  ['Budget',
             'averageRating',
             'cast',
             'countries',
             'director',
             'genres',
             'imdb_id',
             'languages',
             'overview',
             'production companies',
             'release_date',
             'revenue_worldwide_BOM',
             'runtimeMinutes',
             'title']

categoric_cols = ['cast',
                  'countries',
                  'director',
                  'genres',
                  'languages',
                  'production companies']

text_cols = ['title','overview']                  
date_cols = ['release_date']
numeric_cols = ['Budget','runtimeMinutes']

if CATEGORIES_AS_TEXT:
  text_cols+=categoric_cols

if NUMERIC_AS_TEXT:
  text_cols+=numeric_cols

if DATE_AS_TEXT:
  text_cols+=date_cols


train_ids = pd.read_csv('data/processed/train.csv',usecols=['imdb_id'])['imdb_id'].tolist()
val_ids = pd.read_csv('data/processed/val.csv',usecols=['imdb_id'])['imdb_id'].tolist()
test_ids = pd.read_csv('data/processed/test.csv',usecols=['imdb_id'])['imdb_id'].tolist()
df = pd.read_csv('data/processed/df.csv',usecols = all_cols,parse_dates=['release_date']).sample(frac=1) #shuffle


df[categoric_cols] = df[categoric_cols].apply(lambda x: x.str.replace('|',', '),axis=0) #Change pipe to comma, its more meaningful



In [6]:
#Additional auxilary columns
df['top_genre'] = df['genres'].apply(lambda x: x.split(', ')[0])
df['top_country'] = df['countries'].apply(lambda x: x.split(', ')[0] if isinstance(x,str) else x)
df['year'] = df['release_date'].dt.year

In [7]:
#Create splits
if DEBUG:
    train = df[df['imdb_id'].isin(train_ids)].sample(frac=0.2)
    val = df[df['imdb_id'].isin(val_ids)].sample(frac=0.2)
    test = df[df['imdb_id'].isin(test_ids)]
else:
    train = df[df['imdb_id'].isin(train_ids)]
    val = df[df['imdb_id'].isin(val_ids)]
    test = df[df['imdb_id'].isin(test_ids)]


#Fill na in some columns with statistics
naf = NAFiller(train)
naf.fit(column = 'Budget',groupby=['top_genre','top_country'])
naf.transform(train,round=True)
naf.transform(val,round=True)
naf.transform(test,round=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
'''
#If all columns in text_cols are combined into a single text. A n
columns_to_single_text(train,text_cols)
columns_to_single_text(val,text_cols)
columns_to_single_text(test,text_cols)


#Get train encodings
train_encodings = process_text_data(train,text_input_col)
val_encodings = process_text_data(val,text_input_col)
test_encodings = process_text_data(test,text_input_col)

#get labels
train_labels = train[TARGET_COL].tolist()
val_labels = val[TARGET_COL].tolist()
test_labels = test[TARGET_COL].tolist()

#Create dataset objects
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
'''


train_dataset=create_dataset_split(train,text_cols,text_input_col,TARGET_COL)
val_dataset=create_dataset_split(val,text_cols,text_input_col,TARGET_COL)
test_dataset=create_dataset_split(test,text_cols,text_input_col,TARGET_COL)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
def get_model():
    return AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                         problem_type='regression',
                                                         num_labels=1
def get_model_by_name(model_name):
    return AutoModelForSequenceClassification.from_pretrained(model_name,
                                                         problem_type='regression',
                                                         num_labels=1
                                                        )                                                )

from itertools import product
import numpy as np
from pprint import pprint

epochs = 10
num_evals = 20
patience = 2 if DEBUG else 30
callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
eval_steps = 50 if DEBUG else 100

hparams = {'batch_size' : [8,16,32],
           'learning_rate' : [1e-5, 2e-5, 3e-5,5e-5],
           'weight_decay' : [0.1,0.01],
           'repeats': range(1)}

combs = list(product(*[range(len(i)) for i in list(hparams.values())]))
scores = np.zeros([len(i) for i in list(hparams.values())])
best_score = float('inf')


trials_df = []

for idx,comb_indexes in enumerate(combs):
    comb_values = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    
    
    print('training with following hparams:')
    pprint(comb_values)
    
    training_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",
                                      per_device_train_batch_size = comb_values['batch_size'],
                                      learning_rate=comb_values['learning_rate'],
                                      weight_decay=comb_values['weight_decay'],
                                      seed = 42,
                                      fp16=True,
                                      per_device_eval_batch_size = 16,
                                      warmup_ratio=0.06,
                                      num_train_epochs = epochs,
                                      evaluation_strategy = "steps",
                                      save_strategy = "steps",
                                      load_best_model_at_end=True,
                                      eval_steps = eval_steps,
                                      save_steps = eval_steps,
                                      save_total_limit = 1,
                                      log_level = 'error',
                                      disable_tqdm = True
                                      
                                    )

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks = callbacks
    )
    
    trainer.train()

    score = trainer.evaluate()['eval_loss']
    scores[tuple(comb_indexes)] = score
    
    comb_values['score'] = score
    trials_df.append(comb_values)
    
    if score<best_score:
        print(f'got a better model, with score {np.round(score,4)} saving...')
        best_score = score
        trainer.save_model(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}')
        print('saved')
        

trials_df = pd.DataFrame(rows)
trials_df.to_csv(f'models/{MODEL_FOLDER}/hparams_trials.csv',index=False)
    
'''
    
training_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",
                                  seed = 42,
                                  fp16=True,
                                  per_device_eval_batch_size = 16,
                                  warmup_ratio=0.06,
                                  num_train_epochs = epochs,
                                  evaluation_strategy = "steps",
                                  save_strategy = "steps",
                                  load_best_model_at_end=True,
                                  eval_steps = eval_steps,
                                  save_steps = eval_steps,
                                  save_total_limit = 3
                                 )

trainer = Trainer(
    model_init=get_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks = callbacks
)


#Grid search
search_space = {'learning_rate' : [1e-5, 2e-5, 3e-5, 5e-5],
                'weight_decay' : [0.1,0.01],
                'per_device_train_batch_size' : [32,16,8]
               }




def hyperparameter_space(trial):

    return {
        "learning_rate": trial.suggest_categorical("learning_rate",search_space["learning_rate"]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", search_space["per_device_train_batch_size"]),
        "weight_decay": trial.suggest_categorical("weight_decay", search_space["weight_decay"])
    }


best_run = trainer.hyperparameter_search(hp_space=hyperparameter_space,
                                         n_trials=None,
                                         sampler = optuna.samplers.GridSampler(search_space),
                                         study_name = 'imdb_rating_finetune',
                                         direction='minimize',
                                         pruner=optuna.pruners.PatientPruner(None,patience=patience)
                                        )
'''


training with following hparams:
{'batch_size': 8, 'learning_rate': 1e-05, 'repeats': 0, 'weight_decay': 0.1}




{'eval_loss': 34.42953109741211, 'eval_runtime': 9.5585, 'eval_samples_per_second': 183.502, 'eval_steps_per_second': 11.508, 'epoch': 0.1}
{'eval_loss': 14.13512897491455, 'eval_runtime': 9.7196, 'eval_samples_per_second': 180.461, 'eval_steps_per_second': 11.317, 'epoch': 0.2}
{'eval_loss': 2.718254566192627, 'eval_runtime': 9.8907, 'eval_samples_per_second': 177.338, 'eval_steps_per_second': 11.122, 'epoch': 0.29}
{'eval_loss': 1.279853343963623, 'eval_runtime': 9.9863, 'eval_samples_per_second': 175.641, 'eval_steps_per_second': 11.015, 'epoch': 0.39}
{'loss': 15.0569, 'learning_rate': 8.032520325203253e-06, 'epoch': 0.49}
{'eval_loss': 1.317325234413147, 'eval_runtime': 10.1072, 'eval_samples_per_second': 173.54, 'eval_steps_per_second': 10.883, 'epoch': 0.49}
{'eval_loss': 1.1985527276992798, 'eval_runtime': 10.3331, 'eval_samples_per_second': 169.745, 'eval_steps_per_second': 10.645, 'epoch': 0.59}
{'eval_loss': 1.2244387865066528, 'eval_runtime': 10.3961, 'eval_samples_per_seco



{'eval_loss': 34.429317474365234, 'eval_runtime': 10.4166, 'eval_samples_per_second': 168.385, 'eval_steps_per_second': 10.56, 'epoch': 0.1}
{'eval_loss': 14.136425971984863, 'eval_runtime': 10.4336, 'eval_samples_per_second': 168.11, 'eval_steps_per_second': 10.543, 'epoch': 0.2}
{'eval_loss': 2.715629816055298, 'eval_runtime': 10.2708, 'eval_samples_per_second': 170.775, 'eval_steps_per_second': 10.71, 'epoch': 0.29}
{'eval_loss': 1.2795828580856323, 'eval_runtime': 10.3315, 'eval_samples_per_second': 169.772, 'eval_steps_per_second': 10.647, 'epoch': 0.39}
{'loss': 15.0474, 'learning_rate': 8.048780487804879e-06, 'epoch': 0.49}
{'eval_loss': 1.3251065015792847, 'eval_runtime': 10.3194, 'eval_samples_per_second': 169.971, 'eval_steps_per_second': 10.66, 'epoch': 0.49}
{'eval_loss': 1.1514538526535034, 'eval_runtime': 10.3115, 'eval_samples_per_second': 170.101, 'eval_steps_per_second': 10.668, 'epoch': 0.59}
{'eval_loss': 1.2293764352798462, 'eval_runtime': 10.3607, 'eval_samples_per



{'eval_loss': 27.398120880126953, 'eval_runtime': 10.3944, 'eval_samples_per_second': 168.745, 'eval_steps_per_second': 10.583, 'epoch': 0.1}
{'eval_loss': 4.283977508544922, 'eval_runtime': 10.3481, 'eval_samples_per_second': 169.499, 'eval_steps_per_second': 10.63, 'epoch': 0.2}
{'eval_loss': 1.3154034614562988, 'eval_runtime': 10.257, 'eval_samples_per_second': 171.004, 'eval_steps_per_second': 10.724, 'epoch': 0.29}
{'loss': 0.8505, 'learning_rate': 1.7132467532467535e-05, 'epoch': 1.95}
{'eval_loss': 1.0268230438232422, 'eval_runtime': 10.4, 'eval_samples_per_second': 168.654, 'eval_steps_per_second': 10.577, 'epoch': 1.95}
{'eval_loss': 1.078611969947815, 'eval_runtime': 10.3851, 'eval_samples_per_second': 168.896, 'eval_steps_per_second': 10.592, 'epoch': 2.05}
{'eval_loss': 1.1466156244277954, 'eval_runtime': 10.3869, 'eval_samples_per_second': 168.867, 'eval_steps_per_second': 10.59, 'epoch': 2.15}
{'eval_loss': 1.2273989915847778, 'eval_runtime': 10.376, 'eval_samples_per_sec



{'eval_loss': 21.828712463378906, 'eval_runtime': 10.3553, 'eval_samples_per_second': 169.382, 'eval_steps_per_second': 10.623, 'epoch': 0.1}
{'eval_loss': 1.5665903091430664, 'eval_runtime': 10.3716, 'eval_samples_per_second': 169.116, 'eval_steps_per_second': 10.606, 'epoch': 0.2}
{'eval_loss': 1.3292450904846191, 'eval_runtime': 10.2708, 'eval_samples_per_second': 170.775, 'eval_steps_per_second': 10.71, 'epoch': 0.29}
{'eval_loss': 1.2428786754608154, 'eval_runtime': 10.2902, 'eval_samples_per_second': 170.454, 'eval_steps_per_second': 10.69, 'epoch': 0.39}
{'loss': 9.6397, 'learning_rate': 2.4146341463414638e-05, 'epoch': 0.49}
{'eval_loss': 1.0912050008773804, 'eval_runtime': 10.4443, 'eval_samples_per_second': 167.939, 'eval_steps_per_second': 10.532, 'epoch': 0.49}
{'eval_loss': 1.0605424642562866, 'eval_runtime': 10.3057, 'eval_samples_per_second': 170.197, 'eval_steps_per_second': 10.674, 'epoch': 0.59}
{'eval_loss': 1.1526719331741333, 'eval_runtime': 10.3075, 'eval_samples_



{'eval_loss': 21.827655792236328, 'eval_runtime': 10.4235, 'eval_samples_per_second': 168.273, 'eval_steps_per_second': 10.553, 'epoch': 0.1}
{'eval_loss': 1.5643246173858643, 'eval_runtime': 10.3435, 'eval_samples_per_second': 169.575, 'eval_steps_per_second': 10.635, 'epoch': 0.2}
{'eval_loss': 1.3284341096878052, 'eval_runtime': 10.2814, 'eval_samples_per_second': 170.599, 'eval_steps_per_second': 10.699, 'epoch': 0.29}
{'eval_loss': 1.2422744035720825, 'eval_runtime': 10.3535, 'eval_samples_per_second': 169.412, 'eval_steps_per_second': 10.624, 'epoch': 0.39}
{'loss': 9.6295, 'learning_rate': 2.4146341463414638e-05, 'epoch': 0.49}
{'eval_loss': 1.0930235385894775, 'eval_runtime': 10.3872, 'eval_samples_per_second': 168.861, 'eval_steps_per_second': 10.59, 'epoch': 0.49}
{'eval_loss': 1.0549342632293701, 'eval_runtime': 10.3306, 'eval_samples_per_second': 169.788, 'eval_steps_per_second': 10.648, 'epoch': 0.59}
{'eval_loss': 1.3047860860824585, 'eval_runtime': 10.3297, 'eval_samples



{'eval_loss': 13.43083667755127, 'eval_runtime': 10.3965, 'eval_samples_per_second': 168.711, 'eval_steps_per_second': 10.58, 'epoch': 0.1}
{'eval_loss': 1.2651842832565308, 'eval_runtime': 10.3195, 'eval_samples_per_second': 169.97, 'eval_steps_per_second': 10.659, 'epoch': 0.2}
{'eval_loss': 1.1786669492721558, 'eval_runtime': 10.3619, 'eval_samples_per_second': 169.273, 'eval_steps_per_second': 10.616, 'epoch': 0.29}
{'eval_loss': 1.1914293766021729, 'eval_runtime': 10.365, 'eval_samples_per_second': 169.224, 'eval_steps_per_second': 10.613, 'epoch': 0.39}
{'loss': 7.9447, 'learning_rate': 4.0243902439024395e-05, 'epoch': 0.49}
{'eval_loss': 1.067075490951538, 'eval_runtime': 10.3353, 'eval_samples_per_second': 169.71, 'eval_steps_per_second': 10.643, 'epoch': 0.49}
{'eval_loss': 1.0484468936920166, 'eval_runtime': 10.3961, 'eval_samples_per_second': 168.717, 'eval_steps_per_second': 10.581, 'epoch': 0.59}
{'eval_loss': 1.2526767253875732, 'eval_runtime': 10.3676, 'eval_samples_per_



{'eval_loss': 13.430280685424805, 'eval_runtime': 10.3947, 'eval_samples_per_second': 168.74, 'eval_steps_per_second': 10.582, 'epoch': 0.1}
{'eval_loss': 1.2668094635009766, 'eval_runtime': 10.3498, 'eval_samples_per_second': 169.472, 'eval_steps_per_second': 10.628, 'epoch': 0.2}
{'eval_loss': 1.2546107769012451, 'eval_runtime': 10.3813, 'eval_samples_per_second': 168.957, 'eval_steps_per_second': 10.596, 'epoch': 0.29}
{'eval_loss': 1.2232688665390015, 'eval_runtime': 10.3707, 'eval_samples_per_second': 169.131, 'eval_steps_per_second': 10.607, 'epoch': 0.39}
{'loss': 7.9334, 'learning_rate': 4.0243902439024395e-05, 'epoch': 0.49}
{'eval_loss': 1.1976581811904907, 'eval_runtime': 10.3577, 'eval_samples_per_second': 169.343, 'eval_steps_per_second': 10.62, 'epoch': 0.49}
{'eval_loss': 1.0484919548034668, 'eval_runtime': 10.4134, 'eval_samples_per_second': 168.437, 'eval_steps_per_second': 10.563, 'epoch': 0.59}
{'eval_loss': 1.1838971376419067, 'eval_runtime': 10.3441, 'eval_samples_



{'eval_loss': 8.94864273071289, 'eval_runtime': 10.4055, 'eval_samples_per_second': 168.565, 'eval_steps_per_second': 10.571, 'epoch': 0.2}
{'eval_loss': 1.3481932878494263, 'eval_runtime': 10.3303, 'eval_samples_per_second': 169.792, 'eval_steps_per_second': 10.648, 'epoch': 0.39}
{'eval_loss': 1.0982519388198853, 'eval_runtime': 10.3627, 'eval_samples_per_second': 169.26, 'eval_steps_per_second': 10.615, 'epoch': 0.59}
{'loss': 7.1162, 'learning_rate': 2.882793017456359e-05, 'epoch': 0.98}
{'eval_loss': 1.3390294313430786, 'eval_runtime': 10.3529, 'eval_samples_per_second': 169.422, 'eval_steps_per_second': 10.625, 'epoch': 0.98}
{'eval_loss': 1.171366572380066, 'eval_runtime': 10.3566, 'eval_samples_per_second': 169.36, 'eval_steps_per_second': 10.621, 'epoch': 2.34}
{'eval_loss': 1.2678394317626953, 'eval_runtime': 10.3904, 'eval_samples_per_second': 168.81, 'eval_steps_per_second': 10.587, 'epoch': 2.54}
{'eval_loss': 1.342484951019287, 'eval_runtime': 10.387, 'eval_samples_per_se



{'eval_loss': 3.1794581413269043, 'eval_runtime': 10.3368, 'eval_samples_per_second': 169.686, 'eval_steps_per_second': 10.642, 'epoch': 0.2}
{'eval_loss': 1.350046992301941, 'eval_runtime': 10.3424, 'eval_samples_per_second': 169.593, 'eval_steps_per_second': 10.636, 'epoch': 0.39}
{'eval_loss': 1.1334184408187866, 'eval_runtime': 10.341, 'eval_samples_per_second': 169.616, 'eval_steps_per_second': 10.637, 'epoch': 0.59}
{'eval_loss': 1.1296584606170654, 'eval_runtime': 10.3679, 'eval_samples_per_second': 169.175, 'eval_steps_per_second': 10.61, 'epoch': 0.78}
{'loss': 5.9536, 'learning_rate': 4.8046550290939315e-05, 'epoch': 0.98}
{'eval_loss': 1.2229756116867065, 'eval_runtime': 10.3112, 'eval_samples_per_second': 170.105, 'eval_steps_per_second': 10.668, 'epoch': 0.98}
{'eval_loss': 1.0272977352142334, 'eval_runtime': 10.4209, 'eval_samples_per_second': 168.316, 'eval_steps_per_second': 10.556, 'epoch': 1.17}
{'eval_loss': 1.0472018718719482, 'eval_runtime': 10.2817, 'eval_samples_



{'eval_loss': 3.179612874984741, 'eval_runtime': 10.3995, 'eval_samples_per_second': 168.662, 'eval_steps_per_second': 10.577, 'epoch': 0.2}
{'eval_loss': 1.3469157218933105, 'eval_runtime': 10.2953, 'eval_samples_per_second': 170.369, 'eval_steps_per_second': 10.684, 'epoch': 0.39}
{'eval_loss': 1.1151374578475952, 'eval_runtime': 10.3626, 'eval_samples_per_second': 169.262, 'eval_steps_per_second': 10.615, 'epoch': 0.59}
{'eval_loss': 1.1542720794677734, 'eval_runtime': 10.3921, 'eval_samples_per_second': 168.783, 'eval_steps_per_second': 10.585, 'epoch': 0.78}
{'loss': 5.949, 'learning_rate': 4.8046550290939315e-05, 'epoch': 0.98}
{'eval_loss': 1.4306890964508057, 'eval_runtime': 10.2977, 'eval_samples_per_second': 170.33, 'eval_steps_per_second': 10.682, 'epoch': 0.98}
{'eval_loss': 1.0497664213180542, 'eval_runtime': 10.4172, 'eval_samples_per_second': 168.376, 'eval_steps_per_second': 10.559, 'epoch': 1.17}
{'eval_loss': 1.0504658222198486, 'eval_runtime': 10.2757, 'eval_samples_



{'eval_loss': 14.407378196716309, 'eval_runtime': 10.3689, 'eval_samples_per_second': 169.16, 'eval_steps_per_second': 10.609, 'epoch': 0.39}
{'eval_loss': 1.2505860328674316, 'eval_runtime': 10.3782, 'eval_samples_per_second': 169.007, 'eval_steps_per_second': 10.599, 'epoch': 0.78}
{'eval_loss': 1.233694314956665, 'eval_runtime': 10.302, 'eval_samples_per_second': 170.257, 'eval_steps_per_second': 10.677, 'epoch': 1.17}
{'eval_loss': 1.0769129991531372, 'eval_runtime': 10.3129, 'eval_samples_per_second': 170.078, 'eval_steps_per_second': 10.666, 'epoch': 1.56}
{'loss': 8.0091, 'learning_rate': 8.5785536159601e-06, 'epoch': 1.95}
{'eval_loss': 1.237645149230957, 'eval_runtime': 10.388, 'eval_samples_per_second': 168.849, 'eval_steps_per_second': 10.589, 'epoch': 1.95}
{'eval_loss': 1.0663503408432007, 'eval_runtime': 10.2825, 'eval_samples_per_second': 170.581, 'eval_steps_per_second': 10.698, 'epoch': 2.34}
{'eval_loss': 1.2439851760864258, 'eval_runtime': 10.3436, 'eval_samples_per_



{'eval_loss': 14.407205581665039, 'eval_runtime': 10.3609, 'eval_samples_per_second': 169.291, 'eval_steps_per_second': 10.617, 'epoch': 0.39}
{'eval_loss': 1.2502493858337402, 'eval_runtime': 10.2565, 'eval_samples_per_second': 171.014, 'eval_steps_per_second': 10.725, 'epoch': 0.78}
{'eval_loss': 1.170969843864441, 'eval_runtime': 10.2804, 'eval_samples_per_second': 170.615, 'eval_steps_per_second': 10.7, 'epoch': 1.17}
{'eval_loss': 1.0742754936218262, 'eval_runtime': 10.2678, 'eval_samples_per_second': 170.826, 'eval_steps_per_second': 10.713, 'epoch': 1.56}
{'loss': 8.0086, 'learning_rate': 8.5785536159601e-06, 'epoch': 1.95}
{'eval_loss': 1.2970401048660278, 'eval_runtime': 10.3365, 'eval_samples_per_second': 169.69, 'eval_steps_per_second': 10.642, 'epoch': 1.95}
{'eval_loss': 1.0564281940460205, 'eval_runtime': 10.2957, 'eval_samples_per_second': 170.362, 'eval_steps_per_second': 10.684, 'epoch': 2.34}
{'eval_loss': 1.2640405893325806, 'eval_runtime': 10.3168, 'eval_samples_per



{'eval_loss': 4.694136619567871, 'eval_runtime': 10.2737, 'eval_samples_per_second': 170.727, 'eval_steps_per_second': 10.707, 'epoch': 0.39}
{'eval_loss': 1.1679534912109375, 'eval_runtime': 10.2532, 'eval_samples_per_second': 171.069, 'eval_steps_per_second': 10.728, 'epoch': 0.78}
{'eval_loss': 1.3943415880203247, 'eval_runtime': 10.3063, 'eval_samples_per_second': 170.187, 'eval_steps_per_second': 10.673, 'epoch': 1.17}
{'eval_loss': 1.0522716045379639, 'eval_runtime': 10.3922, 'eval_samples_per_second': 168.78, 'eval_steps_per_second': 10.585, 'epoch': 1.56}
{'loss': 6.2247, 'learning_rate': 1.71571072319202e-05, 'epoch': 1.95}
{'eval_loss': 1.20987069606781, 'eval_runtime': 10.4251, 'eval_samples_per_second': 168.248, 'eval_steps_per_second': 10.551, 'epoch': 1.95}
{'eval_loss': 1.018438458442688, 'eval_runtime': 10.2975, 'eval_samples_per_second': 170.333, 'eval_steps_per_second': 10.682, 'epoch': 2.34}
{'eval_loss': 1.1657812595367432, 'eval_runtime': 10.3246, 'eval_samples_per



{'eval_loss': 4.6908793449401855, 'eval_runtime': 10.2115, 'eval_samples_per_second': 171.768, 'eval_steps_per_second': 10.772, 'epoch': 0.39}
{'eval_loss': 1.1719419956207275, 'eval_runtime': 10.3554, 'eval_samples_per_second': 169.381, 'eval_steps_per_second': 10.623, 'epoch': 0.78}
{'eval_loss': 1.3908175230026245, 'eval_runtime': 10.3432, 'eval_samples_per_second': 169.579, 'eval_steps_per_second': 10.635, 'epoch': 1.17}
{'eval_loss': 1.0599018335342407, 'eval_runtime': 10.2742, 'eval_samples_per_second': 170.719, 'eval_steps_per_second': 10.706, 'epoch': 1.56}
{'loss': 6.2239, 'learning_rate': 1.71571072319202e-05, 'epoch': 1.95}
{'eval_loss': 1.219828486442566, 'eval_runtime': 10.4265, 'eval_samples_per_second': 168.225, 'eval_steps_per_second': 10.55, 'epoch': 1.95}
{'eval_loss': 1.0193086862564087, 'eval_runtime': 10.3915, 'eval_samples_per_second': 168.792, 'eval_steps_per_second': 10.586, 'epoch': 2.34}
{'eval_loss': 1.165439248085022, 'eval_runtime': 10.3629, 'eval_samples_p



{'eval_loss': 1.8042019605636597, 'eval_runtime': 10.2968, 'eval_samples_per_second': 170.344, 'eval_steps_per_second': 10.683, 'epoch': 0.39}
{'eval_loss': 1.1007497310638428, 'eval_runtime': 10.2885, 'eval_samples_per_second': 170.482, 'eval_steps_per_second': 10.692, 'epoch': 0.78}
{'eval_loss': 1.4607185125350952, 'eval_runtime': 10.3246, 'eval_samples_per_second': 169.886, 'eval_steps_per_second': 10.654, 'epoch': 1.17}
{'eval_loss': 1.098671317100525, 'eval_runtime': 10.3791, 'eval_samples_per_second': 168.994, 'eval_steps_per_second': 10.598, 'epoch': 1.56}
{'loss': 5.4119, 'learning_rate': 2.5735660847880298e-05, 'epoch': 1.95}
{'eval_loss': 1.1287124156951904, 'eval_runtime': 10.3298, 'eval_samples_per_second': 169.8, 'eval_steps_per_second': 10.649, 'epoch': 1.95}
{'eval_loss': 1.1414213180541992, 'eval_runtime': 10.3893, 'eval_samples_per_second': 168.828, 'eval_steps_per_second': 10.588, 'epoch': 2.34}
{'eval_loss': 1.1621812582015991, 'eval_runtime': 10.3709, 'eval_samples



{'eval_loss': 1.8031853437423706, 'eval_runtime': 10.3428, 'eval_samples_per_second': 169.586, 'eval_steps_per_second': 10.635, 'epoch': 0.39}
{'eval_loss': 1.182713508605957, 'eval_runtime': 10.3778, 'eval_samples_per_second': 169.014, 'eval_steps_per_second': 10.6, 'epoch': 0.78}
{'eval_loss': 1.5021103620529175, 'eval_runtime': 10.349, 'eval_samples_per_second': 169.485, 'eval_steps_per_second': 10.629, 'epoch': 1.17}
{'eval_loss': 1.0886316299438477, 'eval_runtime': 10.3854, 'eval_samples_per_second': 168.891, 'eval_steps_per_second': 10.592, 'epoch': 1.56}
{'loss': 5.4161, 'learning_rate': 2.5735660847880298e-05, 'epoch': 1.95}
{'eval_loss': 1.189293622970581, 'eval_runtime': 10.3208, 'eval_samples_per_second': 169.949, 'eval_steps_per_second': 10.658, 'epoch': 1.95}
{'eval_loss': 1.1291688680648804, 'eval_runtime': 10.36, 'eval_samples_per_second': 169.305, 'eval_steps_per_second': 10.618, 'epoch': 2.34}
{'eval_loss': 1.125766634941101, 'eval_runtime': 10.3891, 'eval_samples_per_



{'eval_loss': 1.4554216861724854, 'eval_runtime': 10.3072, 'eval_samples_per_second': 170.173, 'eval_steps_per_second': 10.672, 'epoch': 0.39}
{'eval_loss': 1.0725258588790894, 'eval_runtime': 10.3392, 'eval_samples_per_second': 169.646, 'eval_steps_per_second': 10.639, 'epoch': 0.78}
{'eval_loss': 1.3536043167114258, 'eval_runtime': 10.348, 'eval_samples_per_second': 169.502, 'eval_steps_per_second': 10.63, 'epoch': 1.17}
{'eval_loss': 1.1303107738494873, 'eval_runtime': 10.3907, 'eval_samples_per_second': 168.804, 'eval_steps_per_second': 10.586, 'epoch': 1.56}
{'loss': 4.5732, 'learning_rate': 4.28927680798005e-05, 'epoch': 1.95}
{'eval_loss': 1.1339911222457886, 'eval_runtime': 10.3417, 'eval_samples_per_second': 169.605, 'eval_steps_per_second': 10.637, 'epoch': 1.95}
{'eval_loss': 1.232366919517517, 'eval_runtime': 10.3811, 'eval_samples_per_second': 168.961, 'eval_steps_per_second': 10.596, 'epoch': 2.34}
{'eval_loss': 1.085791826248169, 'eval_runtime': 10.3723, 'eval_samples_pe



{'eval_loss': 1.4519883394241333, 'eval_runtime': 10.3287, 'eval_samples_per_second': 169.819, 'eval_steps_per_second': 10.65, 'epoch': 0.39}
{'eval_loss': 1.0716913938522339, 'eval_runtime': 10.3598, 'eval_samples_per_second': 169.308, 'eval_steps_per_second': 10.618, 'epoch': 0.78}
{'eval_loss': 1.2223597764968872, 'eval_runtime': 10.277, 'eval_samples_per_second': 170.672, 'eval_steps_per_second': 10.704, 'epoch': 1.17}
{'eval_loss': 1.1179624795913696, 'eval_runtime': 10.3573, 'eval_samples_per_second': 169.349, 'eval_steps_per_second': 10.621, 'epoch': 1.56}
{'loss': 4.5706, 'learning_rate': 4.28927680798005e-05, 'epoch': 1.95}
{'eval_loss': 1.1000239849090576, 'eval_runtime': 10.4041, 'eval_samples_per_second': 168.587, 'eval_steps_per_second': 10.573, 'epoch': 1.95}
{'eval_loss': 1.1460316181182861, 'eval_runtime': 10.3716, 'eval_samples_per_second': 169.116, 'eval_steps_per_second': 10.606, 'epoch': 2.34}
{'eval_loss': 1.1451269388198853, 'eval_runtime': 10.3632, 'eval_samples_

'\n    \ntraining_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",\n                                  seed = 42,\n                                  fp16=True,\n                                  per_device_eval_batch_size = 16,\n                                  warmup_ratio=0.06,\n                                  num_train_epochs = epochs,\n                                  evaluation_strategy = "steps",\n                                  save_strategy = "steps",\n                                  load_best_model_at_end=True,\n                                  eval_steps = eval_steps,\n                                  save_steps = eval_steps,\n                                  save_total_limit = 3\n                                 )\n\ntrainer = Trainer(\n    model_init=get_model,\n    args=training_args,\n    train_dataset=train_dataset,\n    eval_dataset=val_dataset,\n    callbacks = callbacks\n)\n\n\n#Grid search\nsearch_space = {\'learning_rate\' : [1e-5, 2e-5, 3

In [11]:
'''
rows=[]
for idx,comb_indexes in enumerate(combs):
    row = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    row['score']=scores[tuple(comb_indexes)]
    rows.append(row)
trials_df = pd.DataFrame(rows)
trials_df.to_csv(f'models/{MODEL_FOLDER}/hparams_trials.csv',index=False)
'''

In [41]:
#Test set performance
best_model = get_model_by_name(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}/')
trainer_best_model = Trainer(model=best_model)
test_stats_best_model = trainer_best_model.evaluate(test_dataset)
pd.DataFrame([test_stats_best_model]).to_csv(f'models/{MODEL_FOLDER}/test_stats_best_model.csv'
                                             ,index=False)

loading configuration file models/only_text_features/distilbert-base-uncased-averageRating/config.json
Model config DistilBertConfig {
  "_name_or_path": "models/only_text_features/distilbert-base-uncased-averageRating/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "regression",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file models/only_text_features/distilbert-base-uncased-averageRating/pytorch_model.bin
All model checkpoint weights were used when in