In [1]:
#!pip install -U transformers
#!pip install -U datasets
#!pip install optuna
import os
import sys
HOME = os.path.abspath('..')
sys.path.append(HOME)
os.chdir(HOME)
import pandas as pd
#!pip install transformers
from transformers import RobertaConfig, RobertaModel,RobertaForSequenceClassification, Trainer,AutoModelForSequenceClassification, EarlyStoppingCallback 
from transformers import AutoTokenizer
from transformers.models.roberta import RobertaPreTrainedModel
import torch
from torch import nn
from transformers import TrainingArguments
import glob
import optuna
from itertools import product
import numpy as np
from pprint import pprint

In [95]:
MODEL_NAME =  "distilbert-base-uncased" #"roberta-base" 
TARGET_COL = 'averageRating'
MODEL_FOLDER = 'everything_as_text'
text_input_col = 'text_input'
CATEGORIES_AS_TEXT = True
NUMERIC_AS_TEXT = True
DATE_AS_TEXT = True
ADJUST_INFLATION = False
USE_COLUMN_NAMES = False
COLAB = False
DEBUG = False

if ADJUST_INFLATION:
    FINAL_MODEL_NAME+='-inflation_adjusted'
    
if USE_COLUMN_NAMES:
    FINAL_MODEL_NAME+='-with_column_names'

    
if USE_COLUMN_NAMES:
    assert CATEGORIES_AS_TEXT|NUMERIC_AS_TEXT|DATE_AS_TEXT, "can't use column names as text if there are no columns to treat as text!"
    
print('Final model name: ',FINAL_MODEL_NAME)
print('Saving at: ',MODEL_FOLDER)



if COLAB == True:
  if not os.path.exists('data'):
    os.mkdir('data')
  if not os.path.exists('data/processed'):
    os.mkdir('data/processed')

  drive.mount('/content/gdrive/')
  for filename in glob.glob(os.path.join('gdrive/MyDrive/atdl', '*.*')):
      shutil.copy(filename, 'data/processed')
    
FINAL_MODEL_NAME = f"{MODEL_NAME}-{TARGET_COL}"

Final model name:  distilbert-base-uncased-averageRating
Saving at:  everything_as_text


In [98]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def process_text_data(data_:pd.DataFrame,text_col,padding ="max_length", truncation = True, na_filler = ""):

    '''
    
    '''
    data = data_.copy()
    data[text_col] = data[text_col].fillna(na_filler)
    encodings = tokenizer(data[text_col].tolist(), padding=padding, truncation=truncation)
    return encodings
    

def columns_to_single_text(df,cols_to_transform,new_col_name = 'text_input',sep = tokenizer.sep_token,nan_replacement = tokenizer.unk_token ):

  '''
  
  Creates a new column called new_col_name with with all columns in cols_to_transform concatenated into a single text
  '''
  df[new_col_name] = df[cols_to_transform].astype(str).replace('nan',nan_replacement).agg(f' {sep} '.join, axis=1)


class NAFiller:

  def __init__(self,train):
    self.train = train

  def fit(self,column = 'Budget',groupby=['top_genre','top_country']):
    self.mapping = self.train.groupby(groupby)[column].median().reset_index()
    self.mapping = self.mapping.rename(columns={column:'na_filler'})
    self.median = self.train[column].median()
    self.column=column


  def transform(self,test,round = False):
    self.na_filler = test.merge(self.mapping,how='left')['na_filler']
    self.na_filler = self.na_filler.fillna(self.median)

    test[self.column] = test[self.column].reset_index(drop=True).fillna(self.na_filler).values

    if round:
      test[self.column] = test[self.column].round().astype(int)
      


  def fit_transform(self,test,column = 'Budget',groupby=['top_genre','top_country']):
    self.fit(column,groupby)
    self.transform()
    self.column=column
        

def create_dataset_split(split,text_cols,text_input_col,TARGET_COL):

  #If all columns in text_cols are combined into a single text. A n
  columns_to_single_text(split,text_cols)

  #Get split encodings
  split_encodings = process_text_data(split,text_input_col)

  #get labels
  split_labels = split[TARGET_COL].tolist()

  #Create dataset objects
  split_dataset = IMDbDataset(split_encodings, split_labels)

  return split_dataset


def get_model():
    return AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                         problem_type='regression',
                                                         num_labels=1)
def get_model_by_name(model_name):
    return AutoModelForSequenceClassification.from_pretrained(model_name,
                                                         problem_type='regression',
                                                         num_labels=1
                                                        )                                              


In [99]:
all_cols =  ['Budget',
             'averageRating',
             'cast',
             'countries',
             'director',
             'genres',
             'imdb_id',
             'languages',
             'overview',
             'production companies',
             'release_date',
             'revenue_worldwide_BOM',
             'runtimeMinutes',
             'title']

categoric_cols = ['cast',
                  'countries',
                  'director',
                  'genres',
                  'languages',
                  'production companies']

text_cols = ['title','overview']                  
date_cols = ['release_date']
numeric_cols = ['Budget','runtimeMinutes']

if CATEGORIES_AS_TEXT:
  text_cols+=categoric_cols

if NUMERIC_AS_TEXT:
  text_cols+=numeric_cols

if DATE_AS_TEXT:
  text_cols+=date_cols

train_ids = pd.read_csv('data/processed/train.csv',usecols=['imdb_id'])['imdb_id'].tolist()
val_ids = pd.read_csv('data/processed/val.csv',usecols=['imdb_id'])['imdb_id'].tolist()
test_ids = pd.read_csv('data/processed/test.csv',usecols=['imdb_id'])['imdb_id'].tolist()
df = pd.read_csv('data/processed/df.csv',usecols = all_cols,parse_dates=['release_date']).sample(frac=1,random_state=42) #shuffle


df[categoric_cols] = df[categoric_cols].apply(lambda x: x.str.replace('|',', '),axis=0) #Change pipe to comma, its more meaningful



In [100]:
#Additional auxilary columns
df['top_genre'] = df['genres'].apply(lambda x: x.split(', ')[0])
df['top_country'] = df['countries'].apply(lambda x: x.split(', ')[0] if isinstance(x,str) else x)
df['year'] = df['release_date'].dt.year

In [101]:
#Create splits
if DEBUG:
    train = df[df['imdb_id'].isin(train_ids)].sample(frac=0.2)
    val = df[df['imdb_id'].isin(val_ids)].sample(frac=0.2)
    test = df[df['imdb_id'].isin(test_ids)]
else:
    train = df[df['imdb_id'].isin(train_ids)]
    val = df[df['imdb_id'].isin(val_ids)]
    test = df[df['imdb_id'].isin(test_ids)]


#Fill na in some columns with statistics
naf = NAFiller(train)
naf.fit(column = 'Budget',groupby=['top_genre','top_country'])
naf.transform(train,round=True)
naf.transform(val,round=True)
naf.transform(test,round=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [102]:
train_dataset=create_dataset_split(train,text_cols,text_input_col,TARGET_COL)
val_dataset=create_dataset_split(val,text_cols,text_input_col,TARGET_COL)
test_dataset=create_dataset_split(test,text_cols,text_input_col,TARGET_COL)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [104]:
#Example of input to language model
train['text_input'].iloc[15]

"Dedication [SEP] A modern love story in which a misanthropic, emotionally complex author of a hit children's book is forced to team with a beautiful illustrator after his best friend and collaborator passes away. As Henry struggles with letting go of the ghosts of love and life, he discovers that sometimes you have to take a gamble at life to find love. [SEP] billy crudup, mandy moore, tom wilkinson [SEP] united states [SEP] justin theroux [SEP] comedy, drama, romance [SEP] english [SEP] first look international, hart-lunsford pictures, plum pictures [SEP] 13000000 [SEP] 95 [SEP] 2007-01-22"

In [107]:
epochs = 10
num_evals = 20
patience = 2 if DEBUG else 30
callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
eval_steps = 50 if DEBUG else 100

hparams = {'batch_size' : [8,16,32],
           'learning_rate' : [1e-5, 2e-5, 3e-5,5e-5],
           'weight_decay' : [0.1,0.01],
           'repeats': range(1)}

combs = list(product(*[range(len(i)) for i in list(hparams.values())]))
scores = np.zeros([len(i) for i in list(hparams.values())])
best_score = float('inf')


trials_df = []

for idx,comb_indexes in enumerate(combs):
    comb_values = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    
    
    print('training with following hparams:')
    pprint(comb_values)
    
    training_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",
                                      per_device_train_batch_size = comb_values['batch_size'],
                                      learning_rate=comb_values['learning_rate'],
                                      weight_decay=comb_values['weight_decay'],
                                      seed = 42,
                                      fp16=True,
                                      per_device_eval_batch_size = 16,
                                      warmup_ratio=0.06,
                                      num_train_epochs = epochs,
                                      evaluation_strategy = "steps",
                                      save_strategy = "steps",
                                      load_best_model_at_end=True,
                                      eval_steps = eval_steps,
                                      save_steps = eval_steps,
                                      save_total_limit = 1,
                                      log_level = 'error',
                                      disable_tqdm = True
                                      
                                    )

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks = callbacks
    )
    
    trainer.train()

    score = trainer.evaluate()['eval_loss']
    scores[tuple(comb_indexes)] = score #outdated
    
    comb_values['score'] = score
    trials_df.append(comb_values)
    
    if score<best_score:
        print(f'got a better model, with score {np.round(score,4)} saving...')
        best_score = score
        trainer.save_model(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}')
        print('saved')
        

trials_df = pd.DataFrame(rows)
trials_df.to_csv(f'models/{MODEL_FOLDER}/hparams_trials.csv',index=False)


training with following hparams:
{'batch_size': 8, 'learning_rate': 1e-05, 'repeats': 0, 'weight_decay': 0.1}
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current proce

ValueError: Mixed precision training with AMP or APEX (`--fp16` or `--bf16`) and half precision evaluation (`--fp16_full_eval` or `--bf16_full_eval`) can only be used on CUDA devices.

In [11]:
'''
rows=[]
for idx,comb_indexes in enumerate(combs):
    row = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    row['score']=scores[tuple(comb_indexes)]
    rows.append(row)
trials_df = pd.DataFrame(rows)
trials_df.to_csv(f'models/{MODEL_FOLDER}/hparams_trials.csv',index=False)
'''

In [41]:
#Test set performance
best_model = get_model_by_name(f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}/')
trainer_best_model = Trainer(model=best_model)
test_stats_best_model = trainer_best_model.evaluate(test_dataset)
pd.DataFrame([test_stats_best_model]).to_csv(f'models/{MODEL_FOLDER}/test_stats_best_model.csv'
                                             ,index=False)

loading configuration file models/only_text_features/distilbert-base-uncased-averageRating/config.json
Model config DistilBertConfig {
  "_name_or_path": "models/only_text_features/distilbert-base-uncased-averageRating/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "regression",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file models/only_text_features/distilbert-base-uncased-averageRating/pytorch_model.bin
All model checkpoint weights were used when in