In [1]:
#!pip install -U transformers
#!pip install -U datasets
#!pip install optuna
import os
import sys
HOME = os.path.abspath('..')
sys.path.append(HOME)
os.chdir(HOME)
import pandas as pd
#!pip install transformers
from transformers import RobertaConfig, RobertaModel,RobertaForSequenceClassification, Trainer,AutoModelForSequenceClassification, EarlyStoppingCallback 
from transformers import AutoTokenizer
from transformers.models.roberta import RobertaPreTrainedModel
import torch
from torch import nn
from transformers import TrainingArguments
import glob
import optuna
from itertools import product
import numpy as np
from pprint import pprint

In [3]:
MODEL_NAME =  "distilbert-base-uncased" #"roberta-base" 
TARGET_COL = 'averageRating'
MODEL_FOLDER = 'everything_as_text'
text_input_col = 'text_input'
CATEGORIES_AS_TEXT = True
NUMERIC_AS_TEXT = True
DATE_AS_TEXT = True
ADJUST_INFLATION = False
USE_COLUMN_NAMES = False
COLAB = False
DEBUG = False

FINAL_MODEL_NAME = f"{MODEL_NAME}-{TARGET_COL}"
FINAL_MODEL_PATH = f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}'
TRIALS_DF_PATH = f'models/{MODEL_FOLDER}/hparams_trials.csv'


if ADJUST_INFLATION:
    FINAL_MODEL_NAME+='-inflation_adjusted'
    
if USE_COLUMN_NAMES:
    FINAL_MODEL_NAME+='-with_column_names'

    
if USE_COLUMN_NAMES:
    assert CATEGORIES_AS_TEXT|NUMERIC_AS_TEXT|DATE_AS_TEXT, "can't use column names as text if there are no columns to treat as text!"
    
print('Final model name: ',FINAL_MODEL_NAME)
print('Saving at: ',MODEL_FOLDER)



if COLAB == True:
  if not os.path.exists('data'):
    os.mkdir('data')
  if not os.path.exists('data/processed'):
    os.mkdir('data/processed')

  drive.mount('/content/gdrive/')
  for filename in glob.glob(os.path.join('gdrive/MyDrive/atdl', '*.*')):
      shutil.copy(filename, 'data/processed')
    


Final model name:  distilbert-base-uncased-averageRating
Saving at:  everything_as_text


In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def process_text_data(data_:pd.DataFrame,text_col,padding ="max_length", truncation = True, na_filler = ""):

    '''
    
    '''
    data = data_.copy()
    data[text_col] = data[text_col].fillna(na_filler)
    encodings = tokenizer(data[text_col].tolist(), padding=padding, truncation=truncation)
    return encodings
    

def columns_to_single_text(df,cols_to_transform,new_col_name = 'text_input',sep = tokenizer.sep_token,nan_replacement = tokenizer.unk_token ):

  '''
  
  Creates a new column called new_col_name with with all columns in cols_to_transform concatenated into a single text
  '''
  df[new_col_name] = df[cols_to_transform].astype(str).replace('nan',nan_replacement).agg(f' {sep} '.join, axis=1)


class NAFiller:

  def __init__(self,train):
    self.train = train

  def fit(self,column = 'Budget',groupby=['top_genre','top_country']):
    self.mapping = self.train.groupby(groupby)[column].median().reset_index()
    self.mapping = self.mapping.rename(columns={column:'na_filler'})
    self.median = self.train[column].median()
    self.column=column


  def transform(self,test,round = False):
    self.na_filler = test.merge(self.mapping,how='left')['na_filler']
    self.na_filler = self.na_filler.fillna(self.median)

    test[self.column] = test[self.column].reset_index(drop=True).fillna(self.na_filler).values

    if round:
      test[self.column] = test[self.column].round().astype(int)
      


  def fit_transform(self,test,column = 'Budget',groupby=['top_genre','top_country']):
    self.fit(column,groupby)
    self.transform()
    self.column=column
        

def create_dataset_split(split,text_cols,text_input_col,TARGET_COL):

  #If all columns in text_cols are combined into a single text. A n
  columns_to_single_text(split,text_cols)

  #Get split encodings
  split_encodings = process_text_data(split,text_input_col)

  #get labels
  split_labels = split[TARGET_COL].tolist()

  #Create dataset objects
  split_dataset = IMDbDataset(split_encodings, split_labels)

  return split_dataset


def get_model():
    return AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                         problem_type='regression',
                                                         num_labels=1)
def get_model_by_name(model_name):
    return AutoModelForSequenceClassification.from_pretrained(model_name,
                                                         problem_type='regression',
                                                         num_labels=1
                                                        )                                              


In [5]:
all_cols =  ['Budget',
             'averageRating',
             'cast',
             'countries',
             'director',
             'genres',
             'imdb_id',
             'languages',
             'overview',
             'production companies',
             'release_date',
             'revenue_worldwide_BOM',
             'runtimeMinutes',
             'title']

categoric_cols = ['cast',
                  'countries',
                  'director',
                  'genres',
                  'languages',
                  'production companies']

text_cols = ['title','overview']                  
date_cols = ['release_date']
numeric_cols = ['Budget','runtimeMinutes']

if CATEGORIES_AS_TEXT:
  text_cols+=categoric_cols

if NUMERIC_AS_TEXT:
  text_cols+=numeric_cols

if DATE_AS_TEXT:
  text_cols+=date_cols

train_ids = pd.read_csv('data/processed/train.csv',usecols=['imdb_id'])['imdb_id'].tolist()
val_ids = pd.read_csv('data/processed/val.csv',usecols=['imdb_id'])['imdb_id'].tolist()
test_ids = pd.read_csv('data/processed/test.csv',usecols=['imdb_id'])['imdb_id'].tolist()
df = pd.read_csv('data/processed/df.csv',usecols = all_cols,parse_dates=['release_date']).sample(frac=1,random_state=42) #shuffle


df[categoric_cols] = df[categoric_cols].apply(lambda x: x.str.replace('|',', '),axis=0) #Change pipe to comma, its more meaningful



In [6]:
#Additional auxilary columns
df['top_genre'] = df['genres'].apply(lambda x: x.split(', ')[0])
df['top_country'] = df['countries'].apply(lambda x: x.split(', ')[0] if isinstance(x,str) else x)
df['year'] = df['release_date'].dt.year

In [7]:
#Create splits
if DEBUG:
    train = df[df['imdb_id'].isin(train_ids)].sample(frac=0.2)
    val = df[df['imdb_id'].isin(val_ids)].sample(frac=0.2)
    test = df[df['imdb_id'].isin(test_ids)]
else:
    train = df[df['imdb_id'].isin(train_ids)]
    val = df[df['imdb_id'].isin(val_ids)]
    test = df[df['imdb_id'].isin(test_ids)]


#Fill na in some columns with statistics
naf = NAFiller(train)
naf.fit(column = 'Budget',groupby=['top_genre','top_country'])
naf.transform(train,round=True)
naf.transform(val,round=True)
naf.transform(test,round=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
train_dataset=create_dataset_split(train,text_cols,text_input_col,TARGET_COL)
val_dataset=create_dataset_split(val,text_cols,text_input_col,TARGET_COL)
test_dataset=create_dataset_split(test,text_cols,text_input_col,TARGET_COL)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
#Example of input to language model
train['text_input'].iloc[15]

"Dedication [SEP] A modern love story in which a misanthropic, emotionally complex author of a hit children's book is forced to team with a beautiful illustrator after his best friend and collaborator passes away. As Henry struggles with letting go of the ghosts of love and life, he discovers that sometimes you have to take a gamble at life to find love. [SEP] billy crudup, mandy moore, tom wilkinson [SEP] united states [SEP] justin theroux [SEP] comedy, drama, romance [SEP] english [SEP] first look international, hart-lunsford pictures, plum pictures [SEP] 13000000 [SEP] 95 [SEP] 2007-01-22"

In [154]:
import csv
class DictWriter:
    
    def __init__(self,file_path,field_names):
        self.field_names = field_names
        self.file_path = file_path
        self.create_file() #Crerate file if it doesnt exist.
        
    def create_file(self):
        if not os.path.exists(self.file_path):
            print('creating file')
            f = open(self.file_path, 'w')
            w = csv.DictWriter(f, field_names)
            w.writeheader()
            f.close()
        else:
            print('file already exist. Will append rows to it.')
            
    def add_rows(self,rows):  
        with open(self.file_path, 'a') as f:
            w = csv.DictWriter(f,self.field_names)
            for r in rows:    
                w.writerow(r)        

In [155]:
epochs = 15
num_evals = 20
patience = 2 if DEBUG else 30
callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
eval_steps = 50 if DEBUG else 100

'''
hparams = {'batch_size' : [8,16,32],
           'learning_rate' : [1e-5, 2e-5, 3e-5,5e-5],
           'weight_decay' : [0.1,0.01],
           'repeats': range(1)}

'''

hparams = {'batch_size' : [8],
           'learning_rate' : [1e-6,7e-6,9e-6],
           'weight_decay' : [0.1,0.01],
           'repeats': range(1)}

combs = list(product(*[range(len(i)) for i in list(hparams.values())]))
scores = np.zeros([len(i) for i in list(hparams.values())])


#trials_df_rows = []

field_names = list(hparams.keys()) + ['score']
dw = DictWriter(TRIALS_DF_PATH,field_names)

currernt_trials_df = pd.read_csv(TRIALS_DF_PATH) #This can be empty or not.
done_trials = currernt_trials_df.drop('score',axis=1).to_dict(orient='records') #empty list or not
best_score = min(float('inf'),currernt_trials_df['score'].min())

print(f'current best val score = {best_score}')

for idx,comb_indexes in enumerate(combs):
    comb_values = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    
    if comb_values not in done_trials: #Check if trial alrready exists. If it does, skip.
        print('training with following hparams:')
        pprint(comb_values)

        training_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",
                                          per_device_train_batch_size = comb_values['batch_size'],
                                          learning_rate=comb_values['learning_rate'],
                                          weight_decay=comb_values['weight_decay'],
                                          seed = 42,
                                          fp16=True,
                                          per_device_eval_batch_size = 16,
                                          warmup_ratio=0.06,
                                          num_train_epochs = epochs,
                                          evaluation_strategy = "steps",
                                          save_strategy = "steps",
                                          load_best_model_at_end=True,
                                          eval_steps = eval_steps,
                                          save_steps = eval_steps,
                                          save_total_limit = 1,
                                          log_level = 'error',
                                          disable_tqdm = True

                                        )

        trainer = Trainer(
            model_init=get_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            callbacks = callbacks
        )

        trainer.train()

        score = trainer.evaluate()['eval_loss']

        scores[tuple(comb_indexes)] = score #outdated

        comb_values['score'] = score

        dw.add_rows([comb_values]) #Append to dataframe

        #trials_df_rows.append(comb_values)

        if score<best_score:
            print(f'got a better model, with score {np.round(score,4)} saving...')
            best_score = score
            trainer.save_model(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}')
            print('saved')
    else:
        print('skipping trial because already exists')


#trials_df = pd.DataFrame(trials_df_rows)
#trials_df.to_csv(f'models/{MODEL_FOLDER}/hparams_trials.csv',index=False)


file already exist. Will append rows to it.
current best val score = 0.7135345339775085
training with following hparams:
{'batch_size': 8, 'learning_rate': 1e-06, 'repeats': 0, 'weight_decay': 0.1}




{'eval_loss': 38.72096633911133, 'eval_runtime': 10.8065, 'eval_samples_per_second': 162.309, 'eval_steps_per_second': 10.179, 'epoch': 0.1}
{'eval_loss': 37.53731918334961, 'eval_runtime': 11.1651, 'eval_samples_per_second': 157.097, 'eval_steps_per_second': 9.852, 'epoch': 0.2}
{'eval_loss': 32.71261215209961, 'eval_runtime': 10.5848, 'eval_samples_per_second': 165.71, 'eval_steps_per_second': 10.392, 'epoch': 0.29}
{'eval_loss': 22.777984619140625, 'eval_runtime': 10.9662, 'eval_samples_per_second': 159.945, 'eval_steps_per_second': 10.031, 'epoch': 0.39}
{'loss': 32.4803, 'learning_rate': 8.048780487804878e-07, 'epoch': 0.49}
{'eval_loss': 15.292277336120605, 'eval_runtime': 10.9967, 'eval_samples_per_second': 159.502, 'eval_steps_per_second': 10.003, 'epoch': 0.49}
{'eval_loss': 10.02090835571289, 'eval_runtime': 10.6976, 'eval_samples_per_second': 163.963, 'eval_steps_per_second': 10.283, 'epoch': 0.59}
{'eval_loss': 6.437679290771484, 'eval_runtime': 10.8441, 'eval_samples_per_s



{'eval_loss': 38.72096633911133, 'eval_runtime': 10.7418, 'eval_samples_per_second': 163.288, 'eval_steps_per_second': 10.24, 'epoch': 0.1}
{'eval_loss': 37.53728103637695, 'eval_runtime': 11.0061, 'eval_samples_per_second': 159.366, 'eval_steps_per_second': 9.994, 'epoch': 0.2}
{'eval_loss': 32.71232223510742, 'eval_runtime': 10.8957, 'eval_samples_per_second': 160.981, 'eval_steps_per_second': 10.096, 'epoch': 0.29}
{'eval_loss': 22.777441024780273, 'eval_runtime': 10.7702, 'eval_samples_per_second': 162.857, 'eval_steps_per_second': 10.213, 'epoch': 0.39}
{'loss': 32.48, 'learning_rate': 8.048780487804878e-07, 'epoch': 0.49}
{'eval_loss': 15.291258811950684, 'eval_runtime': 10.8591, 'eval_samples_per_second': 161.524, 'eval_steps_per_second': 10.13, 'epoch': 0.49}
{'eval_loss': 10.01960277557373, 'eval_runtime': 10.855, 'eval_samples_per_second': 161.585, 'eval_steps_per_second': 10.134, 'epoch': 0.59}
{'eval_loss': 6.435924530029297, 'eval_runtime': 10.8714, 'eval_samples_per_secon



{'eval_loss': 36.052886962890625, 'eval_runtime': 10.9168, 'eval_samples_per_second': 160.669, 'eval_steps_per_second': 10.076, 'epoch': 0.1}
{'eval_loss': 15.621565818786621, 'eval_runtime': 10.7057, 'eval_samples_per_second': 163.838, 'eval_steps_per_second': 10.275, 'epoch': 0.2}
{'eval_loss': 4.663555145263672, 'eval_runtime': 10.9539, 'eval_samples_per_second': 160.125, 'eval_steps_per_second': 10.042, 'epoch': 0.29}
{'eval_loss': 1.4152004718780518, 'eval_runtime': 10.8382, 'eval_samples_per_second': 161.835, 'eval_steps_per_second': 10.149, 'epoch': 0.39}
{'loss': 15.8791, 'learning_rate': 5.6341463414634145e-06, 'epoch': 0.49}
{'eval_loss': 1.2953592538833618, 'eval_runtime': 10.7347, 'eval_samples_per_second': 163.395, 'eval_steps_per_second': 10.247, 'epoch': 0.49}
{'eval_loss': 1.2685067653656006, 'eval_runtime': 10.8273, 'eval_samples_per_second': 161.998, 'eval_steps_per_second': 10.159, 'epoch': 0.59}
{'eval_loss': 1.075588583946228, 'eval_runtime': 10.8487, 'eval_samples



{'eval_loss': 36.05367660522461, 'eval_runtime': 10.8478, 'eval_samples_per_second': 161.692, 'eval_steps_per_second': 10.14, 'epoch': 0.1}
{'eval_loss': 15.6232271194458, 'eval_runtime': 10.9081, 'eval_samples_per_second': 160.799, 'eval_steps_per_second': 10.084, 'epoch': 0.2}
{'eval_loss': 4.6620330810546875, 'eval_runtime': 10.7858, 'eval_samples_per_second': 162.621, 'eval_steps_per_second': 10.199, 'epoch': 0.29}
{'eval_loss': 1.4145015478134155, 'eval_runtime': 10.7249, 'eval_samples_per_second': 163.544, 'eval_steps_per_second': 10.256, 'epoch': 0.39}
{'loss': 15.879, 'learning_rate': 5.6341463414634145e-06, 'epoch': 0.49}
{'eval_loss': 1.2921593189239502, 'eval_runtime': 10.8791, 'eval_samples_per_second': 161.226, 'eval_steps_per_second': 10.111, 'epoch': 0.49}
{'eval_loss': 1.265830397605896, 'eval_runtime': 10.8177, 'eval_samples_per_second': 162.142, 'eval_steps_per_second': 10.169, 'epoch': 0.59}
{'eval_loss': 1.068159580230713, 'eval_runtime': 10.6981, 'eval_samples_per_



{'eval_loss': 34.478641510009766, 'eval_runtime': 10.8316, 'eval_samples_per_second': 161.934, 'eval_steps_per_second': 10.155, 'epoch': 0.1}
{'eval_loss': 12.019881248474121, 'eval_runtime': 10.9415, 'eval_samples_per_second': 160.307, 'eval_steps_per_second': 10.053, 'epoch': 0.2}
{'eval_loss': 2.8149824142456055, 'eval_runtime': 10.7852, 'eval_samples_per_second': 162.63, 'eval_steps_per_second': 10.199, 'epoch': 0.29}
{'eval_loss': 1.2846981287002563, 'eval_runtime': 10.6784, 'eval_samples_per_second': 164.257, 'eval_steps_per_second': 10.301, 'epoch': 0.39}
{'loss': 14.2927, 'learning_rate': 7.243902439024391e-06, 'epoch': 0.49}
{'eval_loss': 1.2354741096496582, 'eval_runtime': 10.8664, 'eval_samples_per_second': 161.416, 'eval_steps_per_second': 10.123, 'epoch': 0.49}
{'eval_loss': 1.0888170003890991, 'eval_runtime': 10.8705, 'eval_samples_per_second': 161.355, 'eval_steps_per_second': 10.119, 'epoch': 0.59}
{'eval_loss': 1.0591228008270264, 'eval_runtime': 10.7495, 'eval_samples



{'eval_loss': 34.47977066040039, 'eval_runtime': 10.8843, 'eval_samples_per_second': 161.149, 'eval_steps_per_second': 10.106, 'epoch': 0.1}
{'eval_loss': 12.014822959899902, 'eval_runtime': 10.9138, 'eval_samples_per_second': 160.714, 'eval_steps_per_second': 10.079, 'epoch': 0.2}
{'eval_loss': 2.813307285308838, 'eval_runtime': 10.6968, 'eval_samples_per_second': 163.974, 'eval_steps_per_second': 10.283, 'epoch': 0.29}
{'eval_loss': 1.2845816612243652, 'eval_runtime': 10.8568, 'eval_samples_per_second': 161.558, 'eval_steps_per_second': 10.132, 'epoch': 0.39}
{'loss': 14.2916, 'learning_rate': 7.243902439024391e-06, 'epoch': 0.49}
{'eval_loss': 1.2358342409133911, 'eval_runtime': 10.8217, 'eval_samples_per_second': 162.081, 'eval_steps_per_second': 10.165, 'epoch': 0.49}
{'eval_loss': 1.088053822517395, 'eval_runtime': 10.7332, 'eval_samples_per_second': 163.418, 'eval_steps_per_second': 10.249, 'epoch': 0.59}
{'eval_loss': 1.0942330360412598, 'eval_runtime': 10.8934, 'eval_samples_p

In [11]:
'''

rows=[]
for idx,comb_indexes in enumerate(combs):
    row = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    row['score']=scores[tuple(comb_indexes)]
    rows.append(row)
trials_df = pd.DataFrame(rows)
trials_df.to_csv(f'models/{MODEL_FOLDER}/hparams_trials.csv',index=False)

'''

In [24]:
#Test set performance
best_model = get_model_by_name(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}/')
trainer_best_model = Trainer(model=best_model)
test_stats_best_model = trainer_best_model.evaluate(test_dataset)
pd.DataFrame([test_stats_best_model]).to_csv(f'models/{MODEL_FOLDER}/test_stats_best_model.csv'
                                             ,index=False)

***** Running Evaluation *****
  Num examples = 1754
  Batch size = 8


{'eval_loss': 0.626629650592804, 'eval_runtime': 33.3389, 'eval_samples_per_second': 52.611, 'eval_steps_per_second': 6.599}
