In [1]:
import transformers
print(transformers.__version__)

4.18.0


In [4]:
# !pip install -U transformers
# !pip install -U datasets
# !pip install optuna
import os
import sys
HOME = os.path.abspath('..')
sys.path.append(HOME)
os.chdir(HOME)
import pandas as pd
#!pip install transformers
from transformers import RobertaConfig, RobertaModel,RobertaForSequenceClassification, Trainer,AutoModelForSequenceClassification, EarlyStoppingCallback 
from transformers import AutoTokenizer
from transformers.models.roberta import RobertaPreTrainedModel
import torch
from torch import nn
from transformers import TrainingArguments
import glob
import optuna
from itertools import product
import numpy as np
from pprint import pprint

In [5]:
MODEL_NAME =  "distilbert-base-uncased" #"roberta-base" 
TARGET_COL = 'averageRating'#'revenue_worldwide_BOM'#'averageRating'
MODEL_FOLDER = 'text_with_numerical_including_date_unnormalized'#'only_text_features'#'everything_as_text'
text_input_col = 'text_input'
CATEGORIES_AS_TEXT = True
NUMERIC_AS_TEXT = False
DATE_AS_TEXT = False
ADJUST_INFLATION = False
USE_COLUMN_NAMES = False
COLAB = False
DEBUG = False

FINAL_MODEL_NAME = f"{MODEL_NAME}-{TARGET_COL}"

if ADJUST_INFLATION:
    FINAL_MODEL_NAME+='-inflation_adjusted'
    
if USE_COLUMN_NAMES:
    FINAL_MODEL_NAME+='-with_column_names'
    

FINAL_MODEL_PATH = f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}'
TRIALS_DF_PATH = f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}_hparams_trials.csv'
TEST_PERFORMANCE_PATH = f'models/{MODEL_FOLDER}/{FINAL_MODEL_NAME}_test_stats_best_model.csv'
    
if USE_COLUMN_NAMES:
    assert CATEGORIES_AS_TEXT|NUMERIC_AS_TEXT|DATE_AS_TEXT, "can't use column names as text if there are no columns to treat as text!"
    
print('Final model name: ',FINAL_MODEL_NAME)
print('Saving at: ',MODEL_FOLDER)


if COLAB == True:
  if not os.path.exists('data'):
    os.mkdir('data')
  if not os.path.exists('data/processed'):
    os.mkdir('data/processed')

  drive.mount('/content/gdrive/')
  for filename in glob.glob(os.path.join('gdrive/MyDrive/atdl', '*.*')):
      shutil.copy(filename, 'data/processed')

Final model name:  distilbert-base-uncased-averageRating
Saving at:  text_with_numerical_including_date_unnormalized


In [6]:
text_cols = ['text_input']
numerical_cols = ['normalized_Budget', 'normalized_runtimeMinutes','year','month','date']

column_info_dict = {
    'text_cols': text_cols,
    'num_cols': numerical_cols,
}

In [7]:
class TorchTabularTextDataset(torch.utils.data.Dataset):
    """
    :obj:`TorchDataset` wrapper for text dataset with categorical features
    and numerical features
    Parameters:
        encodings (:class:`transformers.BatchEncoding`):
            The output from encode_plus() and batch_encode() methods (tokens, attention_masks, etc) of
            a transformers.PreTrainedTokenizer
        categorical_feats (:class:`numpy.ndarray`, of shape :obj:`(n_examples, categorical feat dim)`, `optional`, defaults to :obj:`None`):
            An array containing the preprocessed categorical features
        numerical_feats (:class:`numpy.ndarray`, of shape :obj:`(n_examples, numerical feat dim)`, `optional`, defaults to :obj:`None`):
            An array containing the preprocessed numerical features
        labels (:class: list` or `numpy.ndarray`, `optional`, defaults to :obj:`None`):
            The labels of the training examples
        class_weights (:class:`numpy.ndarray`, of shape (n_classes),  `optional`, defaults to :obj:`None`):
            Class weights used for cross entropy loss for classification
        df (:class:`pandas.DataFrame`, `optional`, defaults to :obj:`None`):
            Model configuration class with all the parameters of the model.
            This object must also have a tabular_config member variable that is a
            TabularConfig instance specifying the configs for TabularFeatCombiner
    """
    def __init__(self,
                 encodings,
                 categorical_feats,
                 numerical_feats,
                 labels=None,
                 df=None,
                 label_list=None,
                 class_weights=None
                 ):
        self.df = df
        self.encodings = encodings
        self.cat_feats = categorical_feats
        self.numerical_feats = numerical_feats
        self.labels = labels
        self.class_weights = class_weights
        self.label_list = label_list if label_list is not None else [i for i in range(len(np.unique(labels)))]

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]) if self.labels is not None  else None
        item['cat_feats'] = torch.tensor(self.cat_feats[idx]).float() \
            if self.cat_feats is not None else torch.zeros(0)
        item['numerical_feats'] = torch.tensor(self.numerical_feats[idx]).float()\
            if self.numerical_feats is not None else torch.zeros(0)
        return item

    def __len__(self):
        return len(self.labels)

    def get_labels(self):
        """returns the label names for classification"""
        return self.label_list

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def process_text_data(data_:pd.DataFrame,text_col,padding ="max_length", truncation = True, na_filler = ""):

    '''
    
    '''
    data = data_.copy()
    data[text_col] = data[text_col].fillna(na_filler)
    encodings = tokenizer(data[text_col].tolist(), padding=padding, truncation=truncation)
    return encodings
    

def columns_to_single_text(df,cols_to_transform,new_col_name = 'text_input',sep = tokenizer.sep_token,nan_replacement = tokenizer.unk_token ):

  '''
  
  Creates a new column called new_col_name with with all columns in cols_to_transform concatenated into a single text
  '''
  df[new_col_name] = df[cols_to_transform].astype(str).replace('nan',nan_replacement).agg(f' {sep} '.join, axis=1)


class NAFiller:

  def __init__(self,train):
    self.train = train

  def fit(self,column = 'Budget',groupby=['top_genre','top_country']):
    self.mapping = self.train.groupby(groupby)[column].median().reset_index()
    self.mapping = self.mapping.rename(columns={column:'na_filler'})
    self.median = self.train[column].median()
    self.column=column


  def transform(self,test,round = False):
    self.na_filler = test.merge(self.mapping,how='left')['na_filler']
    self.na_filler = self.na_filler.fillna(self.median)

    test[self.column] = test[self.column].reset_index(drop=True).fillna(self.na_filler).values

    if round:
      test[self.column] = test[self.column].round().astype(int)
      


  def fit_transform(self,test,column = 'Budget',groupby=['top_genre','top_country']):
    self.fit(column,groupby)
    self.transform()
    self.column=column
        

def create_dataset_split(split,text_cols,text_input_col,TARGET_COL):
    
  if TARGET_COL == 'revenue_worldwide_BOM':
    split[TARGET_COL] = np.log1p(split[TARGET_COL])
    print('log transforming target')
    

  #If all columns in text_cols are combined into a single text. A n
  columns_to_single_text(split,text_cols)

  #Get split encodings
  split_encodings = process_text_data(split,text_input_col)

  #get labels
  split_labels = split[TARGET_COL].tolist()

  split[numerical_cols] = split[numerical_cols].astype(float)
  numerical_feats = split[numerical_cols].values

  #Create dataset objects
  # split_dataset = IMDbDataset(split_encodings, split_labels)
  split_dataset = TorchTabularTextDataset(split_encodings,
                 None,
                 numerical_feats,
                 split_labels,
                 split,
                 None,
                 None)

  return split_dataset


def get_model():
    return AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                         problem_type='regression',
                                                         num_labels=1)
def get_model_by_name(model_name):
    return AutoModelForSequenceClassification.from_pretrained(model_name,
                                                         problem_type='regression',
                                                         num_labels=1
                                                        )  

In [9]:
all_cols =  ['Budget',
             'averageRating',
             'cast',
             'countries',
             'director',
             'genres',
             'imdb_id',
             'languages',
             'overview',
             'production companies',
             'release_date',
             'revenue_worldwide_BOM',
             'runtimeMinutes',
             'title']

categoric_cols = ['cast',
                  'countries',
                  'director',
                  'genres',
                  'languages',
                  'production companies']

text_cols = ['title','overview']                  
date_cols = ['release_date']
numeric_cols = ['normalized_Budget','normalized_runtimeMinutes','year','month','date']

if CATEGORIES_AS_TEXT:
  text_cols+=categoric_cols

if NUMERIC_AS_TEXT:
  text_cols+=numeric_cols

if DATE_AS_TEXT:
  text_cols+=date_cols

train_ids = pd.read_csv('notebooks/data/train_revised_advdlproject.csv',usecols=['imdb_id'])['imdb_id'].tolist()
val_ids = pd.read_csv('notebooks/data/val_revised_advdlproject.csv',usecols=['imdb_id'])['imdb_id'].tolist()
test_ids = pd.read_csv('notebooks/data/test_revised_advdlproject.csv',usecols=['imdb_id'])['imdb_id'].tolist()
df = pd.read_csv('notebooks/data/df_revised_advdlproject.csv',usecols = all_cols,parse_dates=['release_date']).sample(frac=1,random_state=42) #shuffle


df[categoric_cols] = df[categoric_cols].apply(lambda x: x.str.replace('|',', '),axis=0) #Change pipe to comma, its more meaningful



In [10]:
#Additional auxilary columns
df['top_genre'] = df['genres'].apply(lambda x: x.split(', ')[0])
df['top_country'] = df['countries'].apply(lambda x: x.split(', ')[0] if isinstance(x,str) else x)
df['year'] = df['release_date'].dt.year
df['month'] = df['release_date'].dt.month
df['date'] = df['release_date'].dt.day

In [11]:
#Create splits
if DEBUG:
    train = df[df['imdb_id'].isin(train_ids)].sample(frac=0.2)
    val = df[df['imdb_id'].isin(val_ids)].sample(frac=0.2)
    test = df[df['imdb_id'].isin(test_ids)]
else:
    train = df[df['imdb_id'].isin(train_ids)]
    val = df[df['imdb_id'].isin(val_ids)]
    test = df[df['imdb_id'].isin(test_ids)]


#Fill na in some columns with statistics
naf = NAFiller(train)
naf.fit(column = 'Budget',groupby=['top_genre','top_country','year'])
naf.transform(train,round=True)
naf.transform(val,round=True)
naf.transform(test,round=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
non_null_runtimeMinutes = train.loc[train['runtimeMinutes'] != '\\N']
median_val = non_null_runtimeMinutes['runtimeMinutes'].astype(int).median()

In [13]:
train['runtimeMinutes'].replace(r'\\N', str(int(median_val)), regex=True, inplace = True)
val['runtimeMinutes'].replace(r'\\N', str(int(median_val)), regex=True, inplace = True)
test['runtimeMinutes'].replace(r'\\N', str(int(median_val)), regex=True, inplace = True)
train['runtimeMinutes'] = train['runtimeMinutes'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
import numpy as np

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(np.expand_dims(train['runtimeMinutes'].to_numpy(),0).T)
train['normalized_runtimeMinutes'] = scaler.transform(np.expand_dims(train['runtimeMinutes'].to_numpy(),0).T)
val['normalized_runtimeMinutes'] = scaler.transform(np.expand_dims(val['runtimeMinutes'].to_numpy(),0).T)
test['normalized_runtimeMinutes'] = scaler.transform(np.expand_dims(test['runtimeMinutes'].to_numpy(),0).T)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
scaler = StandardScaler()
scaler.fit(np.expand_dims(train['Budget'].to_numpy(),0).T)
train['normalized_Budget'] = scaler.transform(np.expand_dims(train['Budget'].to_numpy(),0).T)
val['normalized_Budget'] = scaler.transform(np.expand_dims(val['Budget'].to_numpy(),0).T)
test['normalized_Budget'] = scaler.transform(np.expand_dims(test['Budget'].to_numpy(),0).T)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [17]:
# scaler = StandardScaler()
# scaler.fit(np.expand_dims(train['year'].to_numpy(),0).T)
# train['normalized_year'] = scaler.transform(np.expand_dims(train['year'].to_numpy(),0).T)
# val['normalized_year'] = scaler.transform(np.expand_dims(val['year'].to_numpy(),0).T)
# test['normalized_year'] = scaler.transform(np.expand_dims(test['year'].to_numpy(),0).T)

# scaler = StandardScaler()
# scaler.fit(np.expand_dims(train['month'].to_numpy(),0).T)
# train['normalized_month'] = scaler.transform(np.expand_dims(train['month'].to_numpy(),0).T)
# val['normalized_month'] = scaler.transform(np.expand_dims(val['month'].to_numpy(),0).T)
# test['normalized_month'] = scaler.transform(np.expand_dims(test['month'].to_numpy(),0).T)

# scaler = StandardScaler()
# scaler.fit(np.expand_dims(train['date'].to_numpy(),0).T)
# train['normalized_date'] = scaler.transform(np.expand_dims(train['date'].to_numpy(),0).T)
# val['normalized_date'] = scaler.transform(np.expand_dims(val['date'].to_numpy(),0).T)
# test['normalized_date'] = scaler.transform(np.expand_dims(test['date'].to_numpy(),0).T)

In [18]:
train_dataset=create_dataset_split(train,text_cols,text_input_col,TARGET_COL)
val_dataset=create_dataset_split(val,text_cols,text_input_col,TARGET_COL)
test_dataset=create_dataset_split(test,text_cols,text_input_col,TARGET_COL)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [19]:
num_labels = 1

In [20]:
from torch import nn
from transformers import DistilBertForSequenceClassification

In [21]:
class TabularConfig:
    r""" Config used for tabular combiner
    Args:
        mlp_division (int): how much to decrease each MLP dim for each additional layer
        combine_feat_method (str): The method to combine categorical and numerical features.
            See :obj:`TabularFeatCombiner` for details on the supported methods.
        mlp_dropout (float): dropout ratio used for MLP layers
        numerical_bn (bool): whether to use batchnorm on numerical features
        use_simple_classifier (bool): whether to use single layer or MLP as final classifier
        mlp_act (str): the activation function to use for finetuning layers
        gating_beta (float): the beta hyperparameters used for gating tabular data
            see the paper `Integrating Multimodal Information in Large Pretrained Transformers <https://www.aclweb.org/anthology/2020.acl-main.214.pdf>`_ for details
        numerical_feat_dim (int): the number of numerical features
        cat_feat_dim (int): the number of categorical features
    """
    def __init__(self,
                 num_labels,
                 mlp_division=4,
                 combine_feat_method='text_only',
                 mlp_dropout=0.1,
                 numerical_bn=True,
                 use_simple_classifier=True,
                 mlp_act='relu',
                 gating_beta=0.2,
                 numerical_feat_dim=0,
                 cat_feat_dim=0,
                 **kwargs
                 ):
        self.mlp_division = mlp_division
        self.combine_feat_method = combine_feat_method
        self.mlp_dropout = mlp_dropout
        self.numerical_bn = numerical_bn
        self.use_simple_classifier = use_simple_classifier
        self.mlp_act = mlp_act
        self.gating_beta = gating_beta
        self.numerical_feat_dim = numerical_feat_dim
        self.cat_feat_dim = cat_feat_dim
        self.num_labels = num_labels

In [22]:
import math
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss


class MLP(nn.Module):
    """mlp can specify number of hidden layers and hidden layer channels"""

    def __init__(self, input_dim, output_dim, act='relu', num_hidden_lyr=2,
                 dropout_prob=0.5, return_layer_outs=False,
                 hidden_channels=None, bn=False):
        super().__init__()
        self.out_dim = output_dim
        self.dropout = nn.Dropout(dropout_prob)
        self.return_layer_outs = return_layer_outs
        if not hidden_channels:
            hidden_channels = [input_dim for _ in range(num_hidden_lyr)]
        elif len(hidden_channels) != num_hidden_lyr:
            raise ValueError(
                "number of hidden layers should be the same as the lengh of hidden_channels")
        self.layer_channels = [input_dim] + hidden_channels + [output_dim]
        self.act_name = act
        self.activation = create_act(act)
        self.layers = nn.ModuleList(list(
            map(self.weight_init, [nn.Linear(self.layer_channels[i], self.layer_channels[i + 1])
                                   for i in range(len(self.layer_channels) - 2)])))
        final_layer = nn.Linear(self.layer_channels[-2], self.layer_channels[-1])
        self.weight_init(final_layer,   activation='linear')
        self.layers.append(final_layer)

        self.bn = bn
        if self.bn:
            self.bn = nn.ModuleList([torch.nn.BatchNorm1d(dim) for dim in self.layer_channels[1:-1]])

    def weight_init(self, m, activation=None):
        if activation is None:
            activation = self.act_name
        torch.nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain(activation))
        return m

    def forward(self, x):
        """
        :param x: the input features
        :return: tuple containing output of MLP,
                and list of inputs and outputs at every layer
        """
        layer_inputs = [x]
        for i, layer in enumerate(self.layers):
            input = layer_inputs[-1]
            if layer == self.layers[-1]:
                layer_inputs.append(layer(input))
            else:
                if self.bn:
                    output = self.activation(self.bn[i](layer(input)))
                else:
                    output = self.activation(layer(input))
                layer_inputs.append(self.dropout(output))

        # model.store_layer_output(self, layer_inputs[-1])
        if self.return_layer_outs:
            return layer_inputs[-1], layer_inputs
        else:
            return layer_inputs[-1]


def calc_mlp_dims(input_dim, division=2, output_dim=1):
    dim = input_dim
    dims = []
    while dim > output_dim:
        dim = dim // division
        dims.append(int(dim))
    dims = dims[:-1]
    return dims


def create_act(act, num_parameters=None):
    if act == 'relu':
        return nn.ReLU()
    elif act == 'prelu':
        return nn.PReLU(num_parameters)
    elif act == 'sigmoid':
        return nn.Sigmoid()
    elif act == 'tanh':
        return nn.Tanh()
    elif act == 'linear':
        class Identity(nn.Module):
            def forward(self, x):
                return x

        return Identity()
    else:
        raise ValueError('Unknown activation function {}'.format(act))


def glorot(tensor):
    stdv = math.sqrt(6.0 / (tensor.size(-2) + tensor.size(-1)))
    if tensor is not None:
        tensor.data.uniform_(-stdv, stdv)


def zeros(tensor):
    if tensor is not None:
        tensor.data.fill_(0)


def hf_loss_func(inputs, classifier, labels, num_labels, class_weights):
    logits = classifier(inputs)
    if type(logits) is tuple:
        logits, layer_outputs = logits[0], logits[1]
    else:  # simple classifier
        layer_outputs = [inputs, logits]
    if labels is not None:
        if num_labels == 1:
            #  We are doing regression
            loss_fct = MSELoss()
            labels = labels.float()
            loss = loss_fct(logits.view(-1), labels.view(-1))
        else:
            loss_fct = CrossEntropyLoss(weight=class_weights)
            labels = labels.long()
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
    else:
        return None, logits, layer_outputs

    return loss, logits, layer_outputs

In [23]:
import torch
from torch import nn
import torch.nn.functional as F

# from .layer_utils import calc_mlp_dims, create_act, glorot, zeros, MLP


class TabularFeatCombiner(nn.Module):
    r"""
        Combiner module for combining text features with categorical and numerical features
        The methods of combining, specified by :obj:`tabular_config.combine_feat_method` are shown below.
        :math:`\mathbf{m}` denotes the combined multimodal features,
        :math:`\mathbf{x}` denotes the output text features from the transformer,
        :math:`\mathbf{c}` denotes the categorical features, :math:`\mathbf{t}` denotes the numerical features,
        :math:`h_{\mathbf{\Theta}}` denotes a MLP parameterized by :math:`\Theta`, :math:`W` denotes a weight matrix,
        and :math:`b` denotes a scalar bias
        - **text_only**
            .. math::
                \mathbf{m} = \mathbf{x}
        - **concat**
            .. math::
                \mathbf{m} = \mathbf{x} \, \Vert \, \mathbf{c} \, \Vert \, \mathbf{n}
        - **mlp_on_categorical_then_concat**
            .. math::
                \mathbf{m} = \mathbf{x} \, \Vert \, h_{\mathbf{\Theta}}( \mathbf{c}) \, \Vert \, \mathbf{n}
        - **individual_mlps_on_cat_and_numerical_feats_then_concat**
            .. math::
                \mathbf{m} = \mathbf{x} \, \Vert \, h_{\mathbf{\Theta_c}}( \mathbf{c}) \, \Vert \, h_{\mathbf{\Theta_n}}(\mathbf{n})
        - **mlp_on_concatenated_cat_and_numerical_feats_then_concat**
            .. math::
                \mathbf{m} = \mathbf{x} \, \Vert \, h_{\mathbf{\Theta}}( \mathbf{c} \, \Vert \, \mathbf{n})
        - **attention_on_cat_and_numerical_feats** self attention on the text features
            .. math::
                \mathbf{m} = \alpha_{x,x}\mathbf{W}_x\mathbf{x} + \alpha_{x,c}\mathbf{W}_c\mathbf{c} + \alpha_{x,n}\mathbf{W}_n\mathbf{n}
          where :math:`\mathbf{W}_x` is of shape :obj:`(out_dim, text_feat_dim)`,
          :math:`\mathbf{W}_c` is of shape :obj:`(out_dim, cat_feat_dim)`,
          :math:`\mathbf{W}_n` is of shape :obj:`(out_dim, num_feat_dim)`, and the attention coefficients :math:`\alpha_{i,j}` are computed as
            .. math::
                \alpha_{i,j} =
                \frac{
                \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
                [\mathbf{W}_i\mathbf{x}_i \, \Vert \, \mathbf{W}_j\mathbf{x}_j]
                \right)\right)}
                {\sum_{k \in \{ x, c, n \}}
                \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top}
                [\mathbf{W}_i\mathbf{x}_i \, \Vert \, \mathbf{W}_k\mathbf{x}_k]
                \right)\right)}.
        - **gating_on_cat_and_num_feats_then_sum** sum of features gated by text features. Inspired by the gating mechanism introduced in `Integrating Multimodal Information in Large Pretrained Transformers <https://www.aclweb.org/anthology/2020.acl-main.214.pdf>`_
            .. math::
                \mathbf{m}= \mathbf{x} + \alpha\mathbf{h}
            .. math::
                \mathbf{h} = \mathbf{g_c} \odot (\mathbf{W}_c\mathbf{c}) + \mathbf{g_n} \odot (\mathbf{W}_n\mathbf{n}) + b_h
            .. math::
                \alpha = \mathrm{min}( \frac{\| \mathbf{x} \|_2}{\| \mathbf{h} \|_2}*\beta, 1)
          where :math:`\beta` is a hyperparamter, :math:`\mathbf{W}_c` is of shape :obj:`(out_dim, cat_feat_dim)`,
          :math:`\mathbf{W}_n` is of shape :obj:`(out_dim, num_feat_dim)`. and the gating vector :math:`\mathbf{g}_i` with activation function :math:`R` is defined as
            .. math::
                \mathbf{g}_i = R(\mathbf{W}_{gi}[\mathbf{i} \, \Vert \, \mathbf{x}]+ b_i)
          where :math:`\mathbf{W}_{gi}` is of shape :obj:`(out_dim, i_feat_dim + text_feat_dim)`
        - **weighted_feature_sum_on_transformer_cat_and_numerical_feats**
            .. math::
                \mathbf{m} = \mathbf{x} + \mathbf{W}_{c'} \odot \mathbf{W}_c \mathbf{c} + \mathbf{W}_{n'} \odot \mathbf{W}_n \mathbf{t}
       Parameters:
           tabular_config (:class:`~multimodal_config.TabularConfig`):
               Tabular model configuration class with all the parameters of the model.
       """
    def __init__(self, tabular_config):
        super().__init__()
        self.combine_feat_method = tabular_config.combine_feat_method
        self.cat_feat_dim = tabular_config.cat_feat_dim
        self.numerical_feat_dim = tabular_config.numerical_feat_dim
        self.num_labels = tabular_config.num_labels
        self.numerical_bn = tabular_config.numerical_bn
        self.mlp_act = tabular_config.mlp_act
        self.mlp_dropout = tabular_config.mlp_dropout
        self.mlp_division = tabular_config.mlp_division
        self.text_out_dim = tabular_config.text_feat_dim
        self.tabular_config = tabular_config

        if self.numerical_bn and self.numerical_feat_dim > 0:
            self.num_bn = nn.BatchNorm1d(self.numerical_feat_dim)
        else:
            self.num_bn = None

        if self.combine_feat_method == 'text_only':
            self.final_out_dim = self.text_out_dim
        elif self.combine_feat_method == 'concat':
            self.final_out_dim = self.text_out_dim + self.cat_feat_dim \
                           + self.numerical_feat_dim
        elif self.combine_feat_method == 'mlp_on_categorical_then_concat':
            assert self.cat_feat_dim != 0, 'dimension of cat feats should not be 0'
            # reduce dim of categorical features to same of num dim or text dim if necessary
            output_dim = min(self.text_out_dim,
                             max(self.numerical_feat_dim, self.cat_feat_dim // (self.mlp_division // 2)))
            dims = calc_mlp_dims(
                self.cat_feat_dim,
                self.mlp_division,
                output_dim
            )
            self.cat_mlp = MLP(
                self.cat_feat_dim,
                output_dim,
                act=self.mlp_act,
                num_hidden_lyr=len(dims),
                dropout_prob=self.mlp_dropout,
                hidden_channels=dims,
                return_layer_outs=False,
                bn=True
            )
            self.final_out_dim = self.text_out_dim + output_dim + self.numerical_feat_dim
        elif self.combine_feat_method == 'mlp_on_concatenated_cat_and_numerical_feats_then_concat':
            assert self.cat_feat_dim != 0, 'dimension of cat feats should not be 0'
            assert self.numerical_feat_dim != 0, 'dimension of numerical feats should not be 0'
            output_dim = min(self.numerical_feat_dim, self.cat_feat_dim, self.text_out_dim)
            in_dim = self.cat_feat_dim + self.numerical_feat_dim
            dims = calc_mlp_dims(
                in_dim,
                self.mlp_division,
                output_dim
            )
            self.cat_and_numerical_mlp = MLP(
                in_dim,
                output_dim,
                act=self.mlp_act,
                num_hidden_lyr=len(dims),
                dropout_prob=self.mlp_dropout,
                hidden_channels=dims,
                return_layer_outs=False,
                bn=True
            )
            self.final_out_dim = self.text_out_dim + output_dim
        elif self.combine_feat_method == 'individual_mlps_on_cat_and_numerical_feats_then_concat':
            output_dim_cat = 0
            if self.cat_feat_dim > 0:
                output_dim_cat = max(self.cat_feat_dim // (self.mlp_division // 2),
                                     self.numerical_feat_dim)
                dims = calc_mlp_dims(
                    self.cat_feat_dim,
                    self.mlp_division,
                    output_dim_cat)
                self.cat_mlp = MLP(
                    self.cat_feat_dim,
                    output_dim_cat,
                    act=self.mlp_act,
                    num_hidden_lyr=len(dims),
                    dropout_prob=self.mlp_dropout,
                    hidden_channels=dims,
                    return_layer_outs=False,
                    bn=True)

            output_dim_num = 0
            if self.numerical_feat_dim > 0:
                output_dim_num = self.numerical_feat_dim // (self.mlp_division // 2)
                self.num_mlp = MLP(
                    self.numerical_feat_dim,
                    output_dim_num,
                    act=self.mlp_act,
                    dropout_prob=self.mlp_dropout,
                    num_hidden_lyr=1,
                    return_layer_outs=False,
                    bn=True)
            self.final_out_dim = self.text_out_dim + output_dim_num + output_dim_cat
        elif self.combine_feat_method == 'weighted_feature_sum_on_transformer_cat_and_numerical_feats':
            assert self.cat_feat_dim + self.numerical_feat_dim != 0, 'should have some non text features'
            if self.cat_feat_dim > 0:
                output_dim_cat = self.text_out_dim
                if self.cat_feat_dim > self.text_out_dim:
                    dims = calc_mlp_dims(
                        self.cat_feat_dim,
                        division=self.mlp_division,
                        output_dim=output_dim_cat)
                    self.cat_layer = MLP(
                        self.cat_feat_dim,
                        output_dim_cat,
                        act=self.mlp_act,
                        num_hidden_lyr=len(dims),
                        dropout_prob=self.mlp_dropout,
                        hidden_channels=dims,
                        return_layer_outs=False,
                        bn=True)
                else:
                    self.cat_layer = nn.Linear(self.cat_feat_dim, output_dim_cat)
                self.dropout_cat = nn.Dropout(self.mlp_dropout)
                self.weight_cat = nn.Parameter(torch.rand(output_dim_cat))
            if self.numerical_feat_dim > 0:
                output_dim_num = self.text_out_dim
                if self.numerical_feat_dim > self.text_out_dim:
                    dims = calc_mlp_dims(
                        self.numerical_feat_dim,
                        division=self.mlp_division,
                        output_dim=output_dim_num)
                    self.num_layer = MLP(
                        self.numerical_feat_dim,
                        output_dim_num,
                        act=self.mlp_act,
                        num_hidden_lyr=len(dims),
                        dropout_prob=self.mlp_dropout,
                        hidden_channels=dims,
                        return_layer_outs=False,
                        bn=True)
                else:
                    self.num_layer = nn.Linear(self.numerical_feat_dim,
                                               output_dim_num)
                self.dropout_num = nn.Dropout(self.mlp_dropout)
                self.weight_num = nn.Parameter(torch.rand(output_dim_num))

            self.act_func = create_act(self.mlp_act)
            self.layer_norm = nn.LayerNorm(self.text_out_dim)
            self.final_dropout = nn.Dropout(tabular_config.hidden_dropout_prob)
            self.final_out_dim = self.text_out_dim

        elif self.combine_feat_method == 'attention_on_cat_and_numerical_feats':
            assert self.cat_feat_dim + self.numerical_feat_dim != 0, \
                'should have some non-text features for this method'

            output_dim = self.text_out_dim
            if self.cat_feat_dim > 0:
                if self.cat_feat_dim > self.text_out_dim:
                    output_dim_cat = self.text_out_dim
                    dims = calc_mlp_dims(
                        self.cat_feat_dim,
                        division=self.mlp_division,
                        output_dim=output_dim_cat)
                    self.cat_mlp = MLP(
                        self.cat_feat_dim,
                        output_dim_cat,
                        num_hidden_lyr=len(dims),
                        dropout_prob=self.mlp_dropout,
                        return_layer_outs=False,
                        hidden_channels=dims,
                        bn=True)
                else:
                    output_dim_cat = self.cat_feat_dim
                self.weight_cat = nn.Parameter(torch.rand((output_dim_cat,
                                                           output_dim)))
                self.bias_cat = nn.Parameter(torch.zeros(output_dim))

            if self.numerical_feat_dim > 0:
                if self.numerical_feat_dim > self.text_out_dim:
                    output_dim_num = self.text_out_dim
                    dims = calc_mlp_dims(
                        self.numerical_feat_dim,
                        division=self.mlp_division,
                        output_dim=output_dim_num)
                    self.cat_mlp = MLP(
                        self.numerical_feat_dim,
                        output_dim_num,
                        num_hidden_lyr=len(dims),
                        dropout_prob=self.mlp_dropout,
                        return_layer_outs=False,
                        hidden_channels=dims,
                        bn=True)
                else:
                    output_dim_num = self.numerical_feat_dim
                self.weight_num = nn.Parameter(torch.rand((output_dim_num,
                                                            output_dim)))
                self.bias_num = nn.Parameter(torch.zeros(output_dim))

            self.weight_transformer = nn.Parameter(torch.rand(self.text_out_dim,
                                                       output_dim))
            self.weight_a = nn.Parameter(torch.rand((1, output_dim + output_dim)))
            self.bias_transformer = nn.Parameter(torch.rand(output_dim))
            self.bias = nn.Parameter(torch.zeros(output_dim))
            self.negative_slope = 0.2
            self.final_out_dim = output_dim
            self.__reset_parameters()
        elif self.combine_feat_method == 'gating_on_cat_and_num_feats_then_sum':
            self.act_func = create_act(self.mlp_act)
            if self.cat_feat_dim > 0:
                if self.cat_feat_dim > self.text_out_dim:
                    dims = calc_mlp_dims(
                        self.numerical_feat_dim,
                        division=self.mlp_division,
                        output_dim=self.text_out_dim)
                    self.cat_layer = MLP(
                        self.cat_feat_dim,
                        self.text_out_dim,
                        act=self.mlp_act,
                        num_hidden_lyr=len(dims),
                        dropout_prob=self.mlp_dropout,
                        hidden_channels=dims,
                        return_layer_outs=False,
                        bn=True)
                self.g_cat_layer = nn.Linear(self.text_out_dim + min(self.text_out_dim, self.cat_feat_dim),
                                             self.text_out_dim)
                self.dropout_cat = nn.Dropout(self.mlp_dropout)
                self.h_cat_layer = nn.Linear(min(self.text_out_dim, self.cat_feat_dim), self.text_out_dim, bias=False)
            if self.numerical_feat_dim > 0:
                if self.numerical_feat_dim > self.text_out_dim:
                    dims = calc_mlp_dims(
                        self.numerical_feat_dim,
                        division=self.mlp_division,
                        output_dim=self.text_out_dim)
                    self.num_layer = MLP(
                        self.numerical_feat_dim,
                        self.text_out_dim,
                        act=self.mlp_act,
                        num_hidden_lyr=len(dims),
                        dropout_prob=self.mlp_dropout,
                        hidden_channels=dims,
                        return_layer_outs=False,
                        bn=True)
                self.g_num_layer = nn.Linear(min(self.numerical_feat_dim, self.text_out_dim) + self.text_out_dim,
                                             self.text_out_dim)
                self.dropout_num = nn.Dropout(self.mlp_dropout)
                self.h_num_layer = nn.Linear(min(self.text_out_dim, self.numerical_feat_dim),
                                             self.text_out_dim, bias=False)
            self.h_bias = nn.Parameter(torch.zeros(self.text_out_dim))
            self.layer_norm = nn.LayerNorm(self.text_out_dim)
            self.final_out_dim = self.text_out_dim
        else:
            raise ValueError(f'combine_feat_method {self.combine_feat_method} '
                             f'not implemented')

    def forward(self, text_feats, cat_feats=None, numerical_feats=None):
        """
        Args:
            text_feats (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, text_out_dim)`):
                The tensor of text features. This is assumed to be the output from a HuggingFace transformer model
            cat_feats (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, cat_feat_dim)`, `optional`, defaults to :obj:`None`)):
                The tensor of categorical features
            numerical_feats (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, numerical_feat_dim)`, `optional`, defaults to :obj:`None`):
                The tensor of numerical features
        Returns:
            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, final_out_dim)`:
                A tensor representing the combined features
        """
        if cat_feats is None:
            cat_feats = torch.zeros((text_feats.shape[0], 0)).to(text_feats.device)
        if numerical_feats is None:
            numerical_feats = torch.zeros((text_feats.shape[0], 0)).to(text_feats.device)

        if self.numerical_bn and self.numerical_feat_dim != 0:
            numerical_feats = self.num_bn(numerical_feats)

        if self.combine_feat_method == 'text_only':
            combined_feats = text_feats
        if self.combine_feat_method == 'concat':
            combined_feats = torch.cat((text_feats, cat_feats, numerical_feats),
                                       dim=1)
        elif self.combine_feat_method == 'mlp_on_categorical_then_concat':
            cat_feats = self.cat_mlp(cat_feats)
            combined_feats = torch.cat((text_feats, cat_feats, numerical_feats), dim=1)
        elif self.combine_feat_method == 'mlp_on_concatenated_cat_and_numerical_feats_then_concat':
            tabular_feats = torch.cat((cat_feats, numerical_feats), dim=1)
            tabular_feats = self.cat_and_numerical_mlp(tabular_feats)
            combined_feats = torch.cat((text_feats, tabular_feats), dim=1)
        elif self.combine_feat_method == 'individual_mlps_on_cat_and_numerical_feats_then_concat':
            if cat_feats.shape[1] != 0:
                cat_feats = self.cat_mlp(cat_feats)
            if numerical_feats.shape[1] != 0:
                numerical_feats = self.num_mlp(numerical_feats)
            combined_feats = torch.cat((text_feats, cat_feats, numerical_feats), dim=1)
        elif self.combine_feat_method == 'weighted_feature_sum_on_transformer_cat_and_numerical_feats':
            if cat_feats.shape[1] != 0:
                cat_feats = self.dropout_cat(self.cat_layer(cat_feats))
                cat_feats = self.weight_cat.expand_as(cat_feats) * cat_feats
            else:
                cat_feats = 0

            if numerical_feats.shape[1] != 0:
                numerical_feats = self.dropout_num(self.num_layer(numerical_feats))
                numerical_feats = self.weight_num.expand_as(numerical_feats) * numerical_feats
            else:
                numerical_feats = 0
            combined_feats = text_feats + cat_feats + numerical_feats
        elif self.combine_feat_method == 'attention_on_cat_and_numerical_feats':
            # attention keyed by transformer text features
            w_text = torch.mm(text_feats, self.weight_transformer)
            g_text = (torch.cat([w_text, w_text], dim=-1) * self.weight_a).sum(dim=1).unsqueeze(0).T

            if cat_feats.shape[1] != 0:
                if self.cat_feat_dim > self.text_out_dim:
                    cat_feats = self.cat_mlp(cat_feats)
                w_cat = torch.mm(cat_feats, self.weight_cat)
                g_cat = (torch.cat([w_text, w_cat], dim=-1) * self.weight_a).sum(dim=1).unsqueeze(0).T
            else:
                w_cat = None
                g_cat = torch.zeros(0, device=g_text.device)

            if numerical_feats.shape[1] != 0:
                if self.numerical_feat_dim > self.text_out_dim:
                    numerical_feats = self.num_mlp(numerical_feats)
                w_num = torch.mm(numerical_feats, self.weight_num)
                g_num = (torch.cat([w_text, w_cat], dim=-1) * self.weight_a).sum(dim=1).unsqueeze(0).T
            else:
                w_num = None
                g_num = torch.zeros(0, device=g_text.device)

            alpha = torch.cat([g_text, g_cat, g_num], dim=1)  # N by 3
            alpha = F.leaky_relu(alpha, 0.02)
            alpha = F.softmax(alpha, -1)
            stack_tensors = [tensor for tensor in [w_text, w_cat, w_num]
                             if tensor is not None]
            combined = torch.stack(stack_tensors, dim=1)  # N by 3 by final_out_dim
            outputs_w_attention = alpha[:, :, None] * combined
            combined_feats = outputs_w_attention.sum(dim=1)  # N by final_out_dim
        elif self.combine_feat_method == 'gating_on_cat_and_num_feats_then_sum':
            # assumes shifting of features relative to text features and that text features are the most important
            if cat_feats.shape[1] != 0:
                if self.cat_feat_dim > self.text_out_dim:
                    cat_feats = self.cat_layer(cat_feats)
                g_cat = self.dropout_cat(self.act_func(self.g_cat_layer(torch.cat([text_feats, cat_feats], dim=1))))
                g_mult_cat = g_cat * self.h_cat_layer(cat_feats)
            else:
                g_mult_cat = 0

            if numerical_feats.shape[1] != 0:
                if self.numerical_feat_dim > self.text_out_dim:
                    numerical_feats = self.num_layer(numerical_feats)
                g_num = self.dropout_num(self.act_func(self.g_num_layer(torch.cat([text_feats, numerical_feats], dim=1))))
                g_mult_num = g_num * self.h_num_layer(numerical_feats)
            else:
                g_mult_num = 0

            H = g_mult_cat + g_mult_num + self.h_bias
            norm = torch.norm(text_feats, dim=1) / torch.norm(H, dim=1)
            alpha = torch.clamp(norm * self.tabular_config.gating_beta, min=0, max=1)
            combined_feats = text_feats + alpha[:, None] * H

        return combined_feats

    def __reset_parameters(self):
        glorot(self.weight_a)
        if hasattr(self, 'weight_cat'):
            glorot(self.weight_cat)
            zeros(self.bias_cat)
        if hasattr(self, 'weight_num'):
            glorot(self.weight_num)
            zeros(self.bias_num)
        glorot(self.weight_transformer)
        zeros(self.bias_transformer)

In [24]:
class DistilBertWithTabular(DistilBertForSequenceClassification):
    """
    DistilBert Model transformer with a sequence classification/regression head as well as
    a TabularFeatCombiner module to combine categorical and numerical features
    with the Roberta pooled output
    Parameters:
        hf_model_config (:class:`~transformers.DistilBertConfig`):
            Model configuration class with all the parameters of the model.
            This object must also have a tabular_config member variable that is a
            :obj:`TabularConfig` instance specifying the configs for :obj:`TabularFeatCombiner`
    """
    def __init__(self, hf_model_config):
        super().__init__(hf_model_config)
        tabular_config = hf_model_config.tabular_config
        if type(tabular_config) is dict:  # when loading from saved model
            tabular_config = TabularConfig(**tabular_config)
        else:
            self.config.tabular_config = tabular_config.__dict__

        tabular_config.text_feat_dim = hf_model_config.hidden_size
        tabular_config.hidden_dropout_prob = hf_model_config.seq_classif_dropout
        self.tabular_combiner = TabularFeatCombiner(tabular_config)
        self.num_labels = tabular_config.num_labels
        combined_feat_dim = self.tabular_combiner.final_out_dim
        if tabular_config.use_simple_classifier:
            self.tabular_classifier = nn.Linear(combined_feat_dim,
                                                tabular_config.num_labels)
        else:
            dims = calc_mlp_dims(combined_feat_dim,
                                 division=tabular_config.mlp_division,
                                 output_dim=tabular_config.num_labels)
            self.tabular_classifier = MLP(combined_feat_dim,
                                          tabular_config.num_labels,
                                          num_hidden_lyr=len(dims),
                                          dropout_prob=tabular_config.mlp_dropout,
                                          hidden_channels=dims,
                                          bn=True)

    # @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        class_weights=None,
        cat_feats=None,
        numerical_feats=None
    ):
        r"""
        class_weights (:obj:`torch.FloatTensor` of shape :obj:`(tabular_config.num_labels,)`,`optional`, defaults to :obj:`None`):
            Class weights to be used for cross entropy loss function for classification task
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`tabular_config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`tabular_config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        cat_feats (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, tabular_config.cat_feat_dim)`,`optional`, defaults to :obj:`None`):
            Categorical features to be passed in to the TabularFeatCombiner
        numerical_feats (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, tabular_config.numerical_feat_dim)`,`optional`, defaults to :obj:`None`):
            Numerical features to be passed in to the TabularFeatCombiner
    Returns:
        :obj:`tuple` comprising various elements depending on configuration and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if tabular_config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, tabular_config.num_labels)`):
            Classification (or regression if tabular_config.num_labels==1) scores (before SoftMax).
        classifier_layer_outputs(:obj:`list` of :obj:`torch.FloatTensor`):
            The outputs of each layer of the final classification layers. The 0th index of this list is the
            combining module's output
        """

        distilbert_output = self.distilbert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        text_feats = self.dropout(pooled_output)
        combined_feats = self.tabular_combiner(text_feats,
                                               cat_feats,
                                               numerical_feats)
        loss, logits, classifier_layer_outputs = hf_loss_func(combined_feats,
                                                              self.tabular_classifier,
                                                              labels,
                                                              self.num_labels,
                                                              class_weights)
        return loss, logits, classifier_layer_outputs

In [25]:
from transformers import AutoConfig,DistilBertConfig

In [26]:
config = AutoConfig.from_pretrained(
        'distilbert-base-uncased'
    )
tabular_config = TabularConfig(num_labels=num_labels,
                               cat_feat_dim=0,
                               numerical_feat_dim=5,
                               combine_feat_method='concat',
                               column_info=column_info_dict,
                               task='regression')

config.tabular_config = tabular_config

In [27]:
model = DistilBertWithTabular.from_pretrained(
        'distilbert-base-uncased',
        config=config
    )

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertWithTabular: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertWithTabular were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['tabular_classifier.bias', 'classifier.bias', 'tabular_combiner.num_bn.weight', 'pre_classifier.bias', 'classif

In [28]:
import csv
class DictWriter:
    
    def __init__(self,file_path,field_names):
        self.field_names = field_names
        self.file_path = file_path
        self.create_file() #Crerate file if it doesnt exist.
        
    def create_file(self):
        if not os.path.exists(self.file_path):
            print('creating file')
            f = open(self.file_path, 'w')
            w = csv.DictWriter(f, field_names)
            w.writeheader()
            f.close()
        else:
            print('file already exist. Will append rows to it.')
            
    def add_rows(self,rows):  
        with open(self.file_path, 'a') as f:
            w = csv.DictWriter(f,self.field_names)
            for r in rows:    
                w.writerow(r)   

In [None]:
epochs = 15
num_evals = 20
patience = 2 if DEBUG else 30
callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)]
eval_steps = 50 if DEBUG else 100


hparams = {'batch_size' : [8,16,32],
           'learning_rate' : [1e-5, 2e-5, 3e-5,5e-5],
           'weight_decay' : [0.1,0.01],
           'repeats': range(1)}

combs = list(product(*[range(len(i)) for i in list(hparams.values())]))
scores = np.zeros([len(i) for i in list(hparams.values())])

#trials_df_rows = []

field_names = list(hparams.keys()) + ['score']
dw = DictWriter(TRIALS_DF_PATH,field_names)

currernt_trials_df = pd.read_csv(TRIALS_DF_PATH) #This can be empty or not.
done_trials = currernt_trials_df.drop('score',axis=1).to_dict(orient='records') #empty list or not
best_score = min(float('inf'),currernt_trials_df['score'].min())

print(f'current best val score = {best_score}')

for idx,comb_indexes in enumerate(combs):
    comb_values = {name:val[idx] for name,val,idx in zip(hparams.keys(),hparams.values(),comb_indexes)}
    
    if comb_values not in done_trials: #Check if trial alrready exists. If it does, skip.
        print('training with following hparams:')
        pprint(comb_values)

        training_args = TrainingArguments(output_dir=f"{MODEL_NAME}-{TARGET_COL}",
                                          per_device_train_batch_size = comb_values['batch_size'],
                                          learning_rate=comb_values['learning_rate'],
                                          weight_decay=comb_values['weight_decay'],
                                          seed = 42,
                                          fp16=True,
                                          per_device_eval_batch_size = 16,
                                          warmup_ratio=0.06,
                                          num_train_epochs = epochs,
                                          evaluation_strategy = "steps",
                                          save_strategy = "steps",
                                          load_best_model_at_end=True,
                                          eval_steps = eval_steps,
                                          save_steps = eval_steps,
                                          save_total_limit = 1,
                                          log_level = 'error',
                                          disable_tqdm = True

                                        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            callbacks = callbacks
        )
        


        trainer.train()

        score = trainer.evaluate()['eval_loss']

        scores[tuple(comb_indexes)] = score #outdated

        comb_values['score'] = score

        dw.add_rows([comb_values]) #Append to dataframe

        #trials_df_rows.append(comb_values)

        if score<best_score:
            print(f'got a better model, with score {np.round(score,4)} saving...')
            best_score = score
            trainer.save_model(FINAL_MODEL_PATH)
            
            print('saved')
    else:
        print('skipping trial because already exists')


#trials_df = pd.DataFrame(trials_df_rows)
#trials_df.to_csv(f'models/{MODEL_FOLDER}/hparams_trials.csv',index=False)



file already exist. Will append rows to it.
current best val score = inf
training with following hparams:
{'batch_size': 8, 'learning_rate': 1e-05, 'repeats': 0, 'weight_decay': 0.1}




{'eval_loss': 35.7305908203125, 'eval_runtime': 12.6996, 'eval_samples_per_second': 138.115, 'eval_steps_per_second': 8.662, 'epoch': 0.1}
{'eval_loss': 16.36104393005371, 'eval_runtime': 12.6109, 'eval_samples_per_second': 139.086, 'eval_steps_per_second': 8.723, 'epoch': 0.2}
{'eval_loss': 4.757318496704102, 'eval_runtime': 11.988, 'eval_samples_per_second': 146.313, 'eval_steps_per_second': 9.176, 'epoch': 0.29}
{'eval_loss': 1.4715099334716797, 'eval_runtime': 12.4636, 'eval_samples_per_second': 140.73, 'eval_steps_per_second': 8.826, 'epoch': 0.39}
{'loss': 16.3296, 'learning_rate': 5.368763557483731e-06, 'epoch': 0.49}
{'eval_loss': 1.278017282485962, 'eval_runtime': 12.3638, 'eval_samples_per_second': 141.866, 'eval_steps_per_second': 8.897, 'epoch': 0.49}
{'eval_loss': 1.3097105026245117, 'eval_runtime': 12.2344, 'eval_samples_per_second': 143.367, 'eval_steps_per_second': 8.991, 'epoch': 0.59}
{'eval_loss': 1.132379412651062, 'eval_runtime': 12.3722, 'eval_samples_per_second':



{'eval_loss': 0.9293404817581177, 'eval_runtime': 12.4695, 'eval_samples_per_second': 140.664, 'eval_steps_per_second': 8.822, 'epoch': 0.1}
{'eval_loss': 0.980067789554596, 'eval_runtime': 12.4263, 'eval_samples_per_second': 141.152, 'eval_steps_per_second': 8.852, 'epoch': 0.2}
{'eval_loss': 0.9236785769462585, 'eval_runtime': 12.4184, 'eval_samples_per_second': 141.242, 'eval_steps_per_second': 8.858, 'epoch': 0.29}
{'eval_loss': 1.0473518371582031, 'eval_runtime': 12.4343, 'eval_samples_per_second': 141.062, 'eval_steps_per_second': 8.847, 'epoch': 0.39}
{'loss': 0.5268, 'learning_rate': 5.3904555314533626e-06, 'epoch': 0.49}
{'eval_loss': 1.0493983030319214, 'eval_runtime': 12.4753, 'eval_samples_per_second': 140.598, 'eval_steps_per_second': 8.817, 'epoch': 0.49}
{'eval_loss': 0.9847174286842346, 'eval_runtime': 12.4555, 'eval_samples_per_second': 140.822, 'eval_steps_per_second': 8.831, 'epoch': 0.59}
{'eval_loss': 0.8181149363517761, 'eval_runtime': 12.4427, 'eval_samples_per_s



{'eval_loss': 0.9792144894599915, 'eval_runtime': 12.4322, 'eval_samples_per_second': 141.085, 'eval_steps_per_second': 8.848, 'epoch': 0.1}
{'eval_loss': 1.0678170919418335, 'eval_runtime': 12.3552, 'eval_samples_per_second': 141.965, 'eval_steps_per_second': 8.903, 'epoch': 0.2}
{'eval_loss': 0.9185199737548828, 'eval_runtime': 12.3616, 'eval_samples_per_second': 141.891, 'eval_steps_per_second': 8.899, 'epoch': 0.29}
{'eval_loss': 1.005479097366333, 'eval_runtime': 12.4945, 'eval_samples_per_second': 140.382, 'eval_steps_per_second': 8.804, 'epoch': 0.39}
{'loss': 0.4301, 'learning_rate': 1.0780911062906725e-05, 'epoch': 0.49}
{'eval_loss': 1.1869325637817383, 'eval_runtime': 12.4263, 'eval_samples_per_second': 141.152, 'eval_steps_per_second': 8.852, 'epoch': 0.49}
{'eval_loss': 1.061139702796936, 'eval_runtime': 12.4218, 'eval_samples_per_second': 141.203, 'eval_steps_per_second': 8.855, 'epoch': 0.59}
{'eval_loss': 0.9507214426994324, 'eval_runtime': 12.4289, 'eval_samples_per_se

In [None]:
import torch
print(torch.__version__)

In [3]:
torch.cuda.is_available()

True

In [45]:
#Test set performance
best_model = DistilBertWithTabular.from_pretrained(FINAL_MODEL_PATH)
trainer_best_model = Trainer(model=best_model)

predictions = trainer_best_model.predict(test_dataset)

loading configuration file models/text_with_numerical_including_date/distilbert-base-uncased-averageRating/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertWithTabular"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tabular_config": {
    "cat_feat_dim": 0,
    "combine_feat_method": "concat",
    "gating_beta": 0.2,
    "hidden_dropout_prob": 0.2,
    "mlp_act": "relu",
    "mlp_division": 4,
    "mlp_dropout": 0.1,
    "num_labels": 1,
    "numerical_bn": true,
    "numerical_feat_dim": 5,
    "text_feat_dim": 768,
    "use_simple_classifier": true
  },
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transfo

In [46]:
preds = predictions.predictions[0].flatten() 
labels = predictions.label_ids

if TARGET_COL == 'revenue_worldwide_BOM':
    preds = np.expm1(preds)
    labels = np.expm1(labels)


mse = ((preds-labels)**2).mean()
mae = (np.abs(preds-labels)).mean()
errors = {'MAE':mae,'MSE':mse,'RMSE':np.sqrt(mse)}
pd.DataFrame([errors]).to_csv(TEST_PERFORMANCE_PATH,
                              index=False)

In [54]:
len(predictions.predictions[1][0][0])

773

In [51]:
predictions.predictions[0]

array([[5.3603134],
       [6.1881814],
       [6.5145   ],
       ...,
       [7.1097665],
       [5.7504053],
       [5.5580487]], dtype=float32)