In [1]:
#!pip install -U transformers
#!pip install -U datasets
#!pip install optuna
import os
import sys
HOME = os.path.abspath('..')
sys.path.append(HOME)
os.chdir(HOME)
import pandas as pd
#!pip install transformers
from transformers import RobertaConfig, RobertaModel,RobertaForSequenceClassification, Trainer,AutoModelForSequenceClassification, EarlyStoppingCallback 
from transformers import AutoTokenizer
from transformers.models.roberta import RobertaPreTrainedModel
import torch
from torch import nn
from transformers import TrainingArguments
import glob
import optuna

In [2]:
MODEL_NAME =  "distilbert-base-uncased" #"roberta-base" 
TARGET_COL = 'averageRating'
MODEL_FOLDER = 'only_text_features'
FINAL_MODEL_NAME = f"{MODEL_NAME}-{TARGET_COL}"
CATEGORIES_AS_TEXT = False
NUMERIC_AS_TEXT = False
DATE_AS_TEXT = False
text_input_col = 'text_input'
COLAB = False
DEBUG = False

if COLAB == True:
  if not os.path.exists('data'):
    os.mkdir('data')
  if not os.path.exists('data/processed'):
    os.mkdir('data/processed')

  drive.mount('/content/gdrive/')
  for filename in glob.glob(os.path.join('gdrive/MyDrive/atdl', '*.*')):
      shutil.copy(filename, 'data/processed')

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def process_text_data(data_:pd.DataFrame,text_col,padding ="max_length", truncation = True, na_filler = ""):

    '''
    
    '''
    data = data_.copy()
    data[text_col] = data[text_col].fillna(na_filler)
    encodings = tokenizer(data[text_col].tolist(), padding=padding, truncation=truncation)
    return encodings
    

def columns_to_single_text(df,cols_to_transform,new_col_name = 'text_input',sep = tokenizer.sep_token,nan_replacement = tokenizer.unk_token ):

  '''
  
  Creates a new column called new_col_name with with all columns in cols_to_transform concatenated into a single text
  '''
  df[new_col_name] = df[cols_to_transform].astype(str).replace('nan',nan_replacement).agg(f' {sep} '.join, axis=1)


class NAFiller:

  def __init__(self,train):
    self.train = train

  def fit(self,column = 'Budget',groupby=['top_genre','top_country']):
    self.mapping = self.train.groupby(groupby)[column].median().reset_index()
    self.mapping = self.mapping.rename(columns={column:'na_filler'})
    self.median = self.train[column].median()
    self.column=column


  def transform(self,test,round = False):
    self.na_filler = test.merge(self.mapping,how='left')['na_filler']
    self.na_filler = self.na_filler.fillna(self.median)

    test[self.column] = test[self.column].reset_index(drop=True).fillna(self.na_filler).values

    if round:
      test[self.column].round().astype(int)
      


  def fit_transform(self,test,column = 'Budget',groupby=['top_genre','top_country']):
    self.fit(column,groupby)
    self.transform()
        self.column=column
        

def create_dataset_split(split,text_cols,text_input_col,TARGET_COL):

  #If all columns in text_cols are combined into a single text. A n
  columns_to_single_text(split,text_cols)

  #Get split encodings
  split_encodings = process_text_data(split,text_input_col)

  #get labels
  split_labels = split[TARGET_COL].tolist()

  #Create dataset objects
  split_dataset = IMDbDataset(split_encodings, split_labels)

  return split_dataset

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [4]:
all_cols =  ['Budget',
             'averageRating',
             'cast',
             'countries',
             'director',
             'genres',
             'imdb_id',
             'languages',
             'overview',
             'production companies',
             'release_date',
             'revenue_worldwide_BOM',
             'runtimeMinutes',
             'title']

categoric_cols = ['cast',
                  'countries',
                  'director',
                  'genres',
                  'languages',
                  'production companies']

text_cols = ['title','overview']                  
date_cols = ['release_date']
numeric_cols = ['Budget','runtimeMinutes']

if CATEGORIES_AS_TEXT:
  text_cols+=categoric_cols

if NUMERIC_AS_TEXT:
  text_cols+=numeric_cols

if DATE_AS_TEXT:
  text_cols+=date_cols


train_ids = pd.read_csv('data/processed/train.csv',usecols=['imdb_id'])['imdb_id'].tolist()
val_ids = pd.read_csv('data/processed/val.csv',usecols=['imdb_id'])['imdb_id'].tolist()
test_ids = pd.read_csv('data/processed/test.csv',usecols=['imdb_id'])['imdb_id'].tolist()
df = pd.read_csv('data/processed/df.csv',usecols = all_cols,parse_dates=['release_date']).sample(frac=1) #shuffle


df[categoric_cols] = df[categoric_cols].apply(lambda x: x.str.replace('|',', '),axis=0) #Change pipe to comma, its more meaningful



In [5]:
#Additional auxilary columns
df['top_genre'] = df['genres'].apply(lambda x: x.split(', ')[0])
df['top_country'] = df['countries'].apply(lambda x: x.split(', ')[0] if isinstance(x,str) else x)
df['year'] = df['release_date'].dt.year

In [6]:
#Create splits
if DEBUG:
    train = df[df['imdb_id'].isin(train_ids)].sample(frac=0.2)
    val = df[df['imdb_id'].isin(val_ids)].sample(frac=0.2)
    test = df[df['imdb_id'].isin(test_ids)]
else:
    train = df[df['imdb_id'].isin(train_ids)]
    val = df[df['imdb_id'].isin(val_ids)]
    test = df[df['imdb_id'].isin(test_ids)]


#Fill na in some columns with statistics
naf = NAFiller(train)
naf.fit(column = 'Budget',groupby=['top_genre','top_country'])
naf.transform(train,round=True)
naf.transform(val,round=True)
naf.transform(test,round=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
'''
#If all columns in text_cols are combined into a single text. A n
columns_to_single_text(train,text_cols)
columns_to_single_text(val,text_cols)
columns_to_single_text(test,text_cols)


#Get train encodings
train_encodings = process_text_data(train,text_input_col)
val_encodings = process_text_data(val,text_input_col)
test_encodings = process_text_data(test,text_input_col)

#get labels
train_labels = train[TARGET_COL].tolist()
val_labels = val[TARGET_COL].tolist()
test_labels = test[TARGET_COL].tolist()

#Create dataset objects
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
'''


train_dataset=create_dataset_split(train,text_cols,text_input_col,TARGET_COL)
val_dataset=create_dataset_split(val,text_cols,text_input_col,TARGET_COL)
test_dataset=create_dataset_split(test,text_cols,text_input_col,TARGET_COL)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
def get_model():
    return AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                         problem_type='regression',
                                                         num_labels=1
                                                        )

from itertools import product
import numpy as np
from pprint import pprint

epochs = 10
num_evals = 20
patience = 2 if DEBUG else 20
callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
eval_steps = 50 if DEBUG else 100

hparams = {'batch_size' : [8,16,32],
           'learning_rate' : [1e-5, 2e-5, 3e-5,5e-5],
           'weight_decay' : [0.1,0.01],
           'repeats': range(1)}

combs = list(product(*[range(len(i)) for i in list(hparams.values())]))
scores = np.zeros([len(i) for i in list(hparams.values())])
best_score = float('inf')

for idx,comb_indexes in enumerate(combs):
    comb_values = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    
    
    print('training with following hparams:')
    pprint(comb_values)
    
    training_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",
                                      per_device_train_batch_size = comb_values['batch_size'],
                                      learning_rate=comb_values['learning_rate'],
                                      weight_decay=comb_values['weight_decay'],
                                      seed = 42,
                                      fp16=True,
                                      per_device_eval_batch_size = 16,
                                      warmup_ratio=0.06,
                                      num_train_epochs = epochs,
                                      evaluation_strategy = "steps",
                                      save_strategy = "steps",
                                      load_best_model_at_end=True,
                                      eval_steps = eval_steps,
                                      save_steps = eval_steps,
                                      save_total_limit = 1,
                                      log_level = 'error',
                                      no_deprecation_warning = True
                                    )

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks = callbacks
    )
    
    trainer.train()

    score = trainer.evaluate()['eval_loss']
    scores[tuple(comb_indexes)] = score
    
    if score<best_score:
        print(f'got a better model, with score {np.round(score,4)} saving...')
        best_score = score
        trainer.save_model(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}')
        print('saved')
        
    

    
    
'''
    
training_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",
                                  seed = 42,
                                  fp16=True,
                                  per_device_eval_batch_size = 16,
                                  warmup_ratio=0.06,
                                  num_train_epochs = epochs,
                                  evaluation_strategy = "steps",
                                  save_strategy = "steps",
                                  load_best_model_at_end=True,
                                  eval_steps = eval_steps,
                                  save_steps = eval_steps,
                                  save_total_limit = 3
                                 )

trainer = Trainer(
    model_init=get_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks = callbacks
)


#Grid search
search_space = {'learning_rate' : [1e-5, 2e-5, 3e-5, 5e-5],
                'weight_decay' : [0.1,0.01],
                'per_device_train_batch_size' : [32,16,8]
               }




def hyperparameter_space(trial):

    return {
        "learning_rate": trial.suggest_categorical("learning_rate",search_space["learning_rate"]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", search_space["per_device_train_batch_size"]),
        "weight_decay": trial.suggest_categorical("weight_decay", search_space["weight_decay"])
    }


best_run = trainer.hyperparameter_search(hp_space=hyperparameter_space,
                                         n_trials=None,
                                         sampler = optuna.samplers.GridSampler(search_space),
                                         study_name = 'imdb_rating_finetune',
                                         direction='minimize',
                                         pruner=optuna.pruners.PatientPruner(None,patience=patience)
                                        )
'''


training with following hparams:
{'batch_size': 8, 'learning_rate': 1e-05, 'repeats': 0, 'weight_decay': 0.1}


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]



Step,Training Loss,Validation Loss
100,No log,34.39362
200,No log,14.099778
300,No log,2.717003
400,No log,1.279733
500,14.826600,1.379121
600,14.826600,1.234488
700,14.826600,1.221797
800,14.826600,1.058792
900,14.826600,1.078565
1000,1.018800,1.047519


got a better model, with score 1.0187 saving...
saved
training with following hparams:
{'batch_size': 8, 'learning_rate': 1e-05, 'repeats': 0, 'weight_decay': 0.01}




{'eval_loss': 34.393165588378906, 'eval_runtime': 10.4026, 'eval_samples_per_second': 168.612, 'eval_steps_per_second': 10.574, 'epoch': 0.1}
{'eval_loss': 14.09725570678711, 'eval_runtime': 10.4008, 'eval_samples_per_second': 168.642, 'eval_steps_per_second': 10.576, 'epoch': 0.2}
{'eval_loss': 2.713705062866211, 'eval_runtime': 10.3552, 'eval_samples_per_second': 169.383, 'eval_steps_per_second': 10.623, 'epoch': 0.29}
{'eval_loss': 1.2800365686416626, 'eval_runtime': 10.2906, 'eval_samples_per_second': 170.447, 'eval_steps_per_second': 10.689, 'epoch': 0.39}
{'loss': 14.8252, 'learning_rate': 8.048780487804879e-06, 'epoch': 0.49}
{'eval_loss': 1.3805975914001465, 'eval_runtime': 10.3122, 'eval_samples_per_second': 170.09, 'eval_steps_per_second': 10.667, 'epoch': 0.49}
{'eval_loss': 1.2341183423995972, 'eval_runtime': 10.3738, 'eval_samples_per_second': 169.079, 'eval_steps_per_second': 10.604, 'epoch': 0.59}
{'eval_loss': 1.2102855443954468, 'eval_runtime': 10.3516, 'eval_samples_p

In [None]:
hparams

In [None]:
score = trainer.evaluate()['eval_loss']

In [None]:
score

In [None]:
best_run

In [None]:
#Refit with optimal hparams
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()
trainer.save_model(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}')