# Imports

In [2]:
import random
import model_fns

import xgboost as xgb
from tqdm import tqdm
from sklearn.metrics import mean_squared_log_error

import pandas as pd
import json
from collections import OrderedDict

In [2]:
# set common random seed for consistency
RAND_SEED = 42

In [None]:
main_proj_dir = ".."

# Load dataset

In [4]:
# load list of files in train set
with open(f"{main_proj_dir}/train_files_801010.json") as json_file:
  train_files = json.load(json_file)
print(f"len(train_files): {len(train_files)}")

len(train_files): 159742


# Feature Engineering

A model is first trained on a (relatively) small randomly sample of 1000 files from the train set, which adds up to 100,000 samples. The feature scores are then obtained from the model.

In [4]:
# randomly sample files for training sample
random.seed(RAND_SEED)
n_epochs = 10
sample_train = random.choices(train_files, k=1000)

In [5]:
# train on rand sampled train set for 10 epochs
xgbr = None
params = {
    "tree_method": "approx",
}
for epoch_no in range(1, n_epochs+1):
  print(f"Epoch {epoch_no}:")
  if epoch_no == 2:
    params.update({'process_type': 'update',
                'updater'     : 'refresh',
                'refresh_leaf': True,
                'verbosity': 0})
  for train_file in tqdm(sample_train):
    train_df = pd.read_feather(train_file)
    y = train_df["Retweets"].values
    x = train_df.drop(labels="Retweets", axis="columns")
    feature_names = x.columns
    x = x.values
    dtrain = xgb.DMatrix(x, label=y, feature_names=feature_names)
    xgbr = xgb.train(params, dtrain, xgb_model=xgbr)

Epoch 1:


100%|██████████| 1000/1000 [31:30<00:00,  1.89s/it]


Epoch 2:


100%|██████████| 1000/1000 [01:01<00:00, 16.30it/s]


Epoch 3:


100%|██████████| 1000/1000 [01:00<00:00, 16.65it/s]


Epoch 4:


100%|██████████| 1000/1000 [01:00<00:00, 16.47it/s]


Epoch 5:


100%|██████████| 1000/1000 [01:02<00:00, 16.01it/s]


Epoch 6:


100%|██████████| 1000/1000 [01:01<00:00, 16.37it/s]


Epoch 7:


100%|██████████| 1000/1000 [00:58<00:00, 16.95it/s]


Epoch 8:


100%|██████████| 1000/1000 [01:01<00:00, 16.18it/s]


Epoch 9:


100%|██████████| 1000/1000 [01:26<00:00, 11.55it/s]


Epoch 10:


100%|██████████| 1000/1000 [01:01<00:00, 16.32it/s]


In [6]:
# save model
xgbr.save_model("feat_eng_10eps.model")

In [7]:
# load model
xgbr = xgb.Booster()
xgbr.load_model("feat_eng_10eps.model")

In [8]:
# to do a sanity check that the model is learning something, we predict on a train sample
sanity_df = pd.read_feather(sample_train[0])
y = sanity_df["Retweets"].values
x = sanity_df.drop(labels="Retweets", axis="columns")
feature_names = x.columns
x = x.values
dsanity = xgb.DMatrix(x, feature_names=feature_names)
y_pred = xgbr.predict(dsanity)
y_pred

array([1.6131823e-01, 1.6131823e-01, 1.6131823e-01, 1.6131823e-01,
       9.5413227e+00, 1.6461336e+01, 1.6131823e-01, 4.5578012e+00,
       1.6131823e-01, 1.5871524e+01, 1.6131823e-01, 1.6131823e-01,
       3.0037455e+01, 1.6131823e-01, 1.6131823e-01, 1.6131823e-01,
       1.6131823e-01, 7.1806574e-01, 1.6131823e-01, 1.6131823e-01,
       1.6131823e-01, 1.1776975e+01, 1.6131823e-01, 1.6131823e-01,
       4.6368098e+00, 1.6131823e-01, 1.6131823e-01, 2.9880378e+00,
       1.6131823e-01, 1.2622435e-01, 1.6131823e-01, 7.5646577e+00,
       1.6131823e-01, 1.6131823e-01, 1.6131823e-01, 1.6131823e-01,
       1.6461336e+01, 1.6131823e-01, 1.6131823e-01, 1.6131823e-01,
       1.6131823e-01, 1.6131823e-01, 5.0000000e-01, 9.7090225e+00,
       4.3580380e+02, 1.6131823e-01, 1.6131823e-01, 1.6131823e-01,
       1.6131823e-01, 1.9053748e+02, 1.6131823e-01, 3.9038097e+01,
       1.6131823e-01, 4.4615674e+00, 7.3373215e+01, 1.6131823e-01,
       1.6131823e-01, 1.6131823e-01, 2.5317542e+02, 1.6131823e

In [9]:
# true labels
y 

array([  0,   0,   0,   0,  26,  14,   0,   7,   0,   8,   0,   0,  18,
         0,   0,   0,   0,   4,   0,   0,   0,  14,   0,   0,   3,   0,
         0,   1,   0,   1,   0,   8,   0,   0,   0,   0,  12,   0,   0,
         0,   0,   0,   2,   3,  80,   0,   0,   0,   0,  52,   0,  23,
         0,   3,  33,   0,   0,   0, 149,   0,   1,  73,   0,   0,  18,
         0,   0,   0,   1,   0,  15,   0,   0,   3,  17,   0,   0,   0,
         0,   0,   1,  40,   2,   0,   0,   0,  57,  15,   1, 236,   0,
         0,   0,   0,   3,   0,   3,   0,   3,   3])

In [10]:
# predicted labels
y_pred.astype(int)

array([  0,   0,   0,   0,   9,  16,   0,   4,   0,  15,   0,   0,  30,
         0,   0,   0,   0,   0,   0,   0,   0,  11,   0,   0,   4,   0,
         0,   2,   0,   0,   0,   7,   0,   0,   0,   0,  16,   0,   0,
         0,   0,   0,   0,   9, 435,   0,   0,   0,   0, 190,   0,  39,
         0,   4,  73,   0,   0,   0, 253,   0,   3, 549,   0,   0,   8,
         0,   0,   0,   4,   0,  23,   0,   0,   4,   7,   0,   0,   0,
         0,   0,   3,  44,   5,   0,   0,   0, 429,  29,   2,  99,   0,
         0,   0,   0,   2,   0,   4,   0,   4,   4])

In [14]:
# calculate msle
mean_squared_log_error(y, y_pred)

0.2622544759359271

In [15]:
# get feature scores and rank them according to importance
OrderedDict(sorted(xgbr.get_fscore().items(), key=lambda t: t[1], reverse=True))

OrderedDict([('#Favourites', 41),
             ('#Followers', 12),
             ('confidence_mean', 12),
             ('usernamehash_col163', 10),
             ('usernamehash_col139', 8),
             ('sin_second', 8),
             ('#Friends', 7),
             ('usernamehash_col35', 6),
             ('usernamehash_col185', 5),
             ('usernamehash_col132', 4),
             ('confidence_max', 4),
             ('usernamehash_col182', 3),
             ('usernamehash_col54', 3),
             ('usernamehash_col174', 3),
             ('usernamehash_col97', 3),
             ('Sentiments1', 3),
             ('usernamehash_col102', 2),
             ('usernamehash_col50', 2),
             ('usernamehash_col73', 2),
             ('usernamehash_col56', 2),
             ('Sentiments0', 1),
             ('usernamehash_col105', 1),
             ('usernamehash_col196', 1),
             ('Hashtag_counts', 1),
             ('usernamehash_col201', 1),
             ('usernamehash_col215', 1)])

## Relevant Features

Extract relevant features based feature analysis.

In [5]:
# load existing feature names from a sample dataframe
example_df = pd.read_feather(train_files[0])
y = example_df["Retweets"].values
x = example_df.drop(labels="Retweets", axis="columns")
feature_names = x.columns
feature_names

Index(['usernamehash_col0', 'usernamehash_col1', 'usernamehash_col2',
       'usernamehash_col3', 'usernamehash_col4', 'usernamehash_col5',
       'usernamehash_col6', 'usernamehash_col7', 'usernamehash_col8',
       'usernamehash_col9',
       ...
       'URLshash_col251', 'URLshash_col252', 'URLshash_col253',
       'URLshash_col254', 'URLshash_col255', 'sin_second', 'cos_second',
       'sin_month', 'cos_month', 'year'],
      dtype='object', length=856)

In [6]:
# check "usernamehash_*" indicies
feature_names[:256]

Index(['usernamehash_col0', 'usernamehash_col1', 'usernamehash_col2',
       'usernamehash_col3', 'usernamehash_col4', 'usernamehash_col5',
       'usernamehash_col6', 'usernamehash_col7', 'usernamehash_col8',
       'usernamehash_col9',
       ...
       'usernamehash_col246', 'usernamehash_col247', 'usernamehash_col248',
       'usernamehash_col249', 'usernamehash_col250', 'usernamehash_col251',
       'usernamehash_col252', 'usernamehash_col253', 'usernamehash_col254',
       'usernamehash_col255'],
      dtype='object', length=256)

In [7]:
# construct list with only relevant features
relevant_features = [
  "#Favourites",
  "#Followers",
  "confidence_mean",
  "confidence_max",
  "confidence_median",
  "sin_second",
  "cos_second",
  "#Friends",
  "Hashtag_counts",
  "Sentiments0",
  "Sentiments1"
]
relevant_features.extend(feature_names[:256])

In [71]:
# save relevant features identified in a file
with open("relevant_features.json", "w") as f:
  json.dump(relevant_features, f)

In [25]:
len(relevant_features)

267

In [6]:
# load relevant features from file when needed
with open("relevant_features.json") as f:
  relevant_features = json.load(f)

# Experiment 1 - Number of Estimators

The number of estimators is varied with the `num_boost_round` variable. Each variation is first trained until it overfits. After which, the model at the epoch which performs best on the evaluation set for each variation if compared with the validation set.

## Train

### num_boost_round=2

In [24]:
# choose random sample of train files
random.seed(RAND_SEED)
train_set = random.choices(train_files, k=int(len(train_files)*0.8))
val_set = list(set(train_files)-set(train_set))

In [25]:
# initialise variables
n_epochs = 100
epoch_set_size = 750
train_val_ratio = (8, 2)
assert sum(train_val_ratio) == 10

In [26]:
# train model
model, stats = model_fns.train(train_set, 
                               val_set, 
                               n_epochs, 
                               relevant_features, 
                               epoch_set_size, 
                               train_val_ratio, 
                               num_boost_round=2,
                               load_checkpoints="saved_models/model1a_ep*.model", 
                               main_proj_dir=main_proj_dir)

Epoch 1:


100%|██████████| 1500/1500 [03:16<00:00,  7.62it/s]


MSLE: 1112.970997529657
Epoch 2:


100%|██████████| 1500/1500 [02:04<00:00, 12.08it/s]


MSLE: 0.5545539481262
Epoch 3:


100%|██████████| 1500/1500 [01:59<00:00, 12.53it/s]


MSLE: 0.5557742578496984
Increment early stopper to 1 because val loss (0.5557742578496984) is greater than threshold (0.5545539481262)
Epoch 4:


100%|██████████| 1500/1500 [01:54<00:00, 13.16it/s]


MSLE: 0.6896691136733668
Increment early stopper to 2 because val loss (0.6896691136733668) is greater than threshold (0.5545539481262)
Epoch 5:


100%|██████████| 1500/1500 [01:57<00:00, 12.75it/s]


MSLE: 0.5011754268274714
Epoch 6:


100%|██████████| 1500/1500 [01:55<00:00, 12.99it/s]


MSLE: 0.4576633675918724
Epoch 7:


100%|██████████| 1500/1500 [01:59<00:00, 12.56it/s]


MSLE: 1.1829657056688572
Increment early stopper to 1 because val loss (1.1829657056688572) is greater than threshold (0.4576633675918724)
Epoch 8:


100%|██████████| 1500/1500 [01:55<00:00, 13.01it/s]


MSLE: 0.5030423995961099
Increment early stopper to 2 because val loss (0.5030423995961099) is greater than threshold (0.4576633675918724)
Epoch 9:


100%|██████████| 1500/1500 [01:50<00:00, 13.57it/s]


MSLE: 0.5854925785628559
Increment early stopper to 3 because val loss (0.5854925785628559) is greater than threshold (0.4576633675918724)
Epoch 10:


100%|██████████| 1500/1500 [01:56<00:00, 12.84it/s]


MSLE: 0.5077311292200176
Increment early stopper to 4 because val loss (0.5077311292200176) is greater than threshold (0.4576633675918724)
Epoch 11:


100%|██████████| 6000/6000 [08:41<00:00, 11.52it/s]
100%|██████████| 1500/1500 [01:51<00:00, 13.42it/s]


MSLE: 0.5518794302113914
Increment early stopper to 5 because val loss (0.5518794302113914) is greater than threshold (0.4576633675918724)
Epoch 12:


100%|██████████| 6000/6000 [08:47<00:00, 11.38it/s]
100%|██████████| 1500/1500 [01:52<00:00, 13.33it/s]


MSLE: 0.6904054605631132
Increment early stopper to 6 because val loss (0.6904054605631132) is greater than threshold (0.4576633675918724)
Epoch 13:


100%|██████████| 6000/6000 [08:44<00:00, 11.44it/s]
100%|██████████| 1500/1500 [01:56<00:00, 12.90it/s]


MSLE: 0.5007362076407113
Increment early stopper to 7 because val loss (0.5007362076407113) is greater than threshold (0.4576633675918724)
Epoch 14:


100%|██████████| 6000/6000 [08:41<00:00, 11.51it/s]
100%|██████████| 1500/1500 [01:55<00:00, 13.03it/s]


MSLE: 0.45193548649089454
Epoch 15:


100%|██████████| 6000/6000 [08:43<00:00, 11.46it/s]
100%|██████████| 1500/1500 [01:55<00:00, 12.98it/s]


MSLE: 1.1907001284789542
Increment early stopper to 1 because val loss (1.1907001284789542) is greater than threshold (0.45193548649089454)
Epoch 16:


100%|██████████| 6000/6000 [08:42<00:00, 11.47it/s]
100%|██████████| 1500/1500 [01:53<00:00, 13.17it/s]


MSLE: 0.5088967185542181
Increment early stopper to 2 because val loss (0.5088967185542181) is greater than threshold (0.45193548649089454)
Epoch 17:


100%|██████████| 6000/6000 [08:43<00:00, 11.45it/s]
100%|██████████| 1500/1500 [01:50<00:00, 13.63it/s]


MSLE: 0.5907124112867419
Increment early stopper to 3 because val loss (0.5907124112867419) is greater than threshold (0.45193548649089454)
Epoch 18:


100%|██████████| 6000/6000 [09:07<00:00, 10.96it/s]
100%|██████████| 1500/1500 [02:07<00:00, 11.81it/s]


MSLE: 0.512220640693441
Increment early stopper to 4 because val loss (0.512220640693441) is greater than threshold (0.45193548649089454)
Epoch 19:


100%|██████████| 6000/6000 [09:24<00:00, 10.63it/s]
100%|██████████| 1500/1500 [02:00<00:00, 12.48it/s]


MSLE: 1.0575846424730713
Increment early stopper to 5 because val loss (1.0575846424730713) is greater than threshold (0.45193548649089454)
Epoch 20:


100%|██████████| 6000/6000 [10:30<00:00,  9.51it/s]
100%|██████████| 1500/1500 [02:14<00:00, 11.15it/s]


MSLE: 1.100532483315416
Increment early stopper to 6 because val loss (1.100532483315416) is greater than threshold (0.45193548649089454)
Epoch 21:


100%|██████████| 6000/6000 [10:46<00:00,  9.28it/s]
100%|██████████| 1500/1500 [02:07<00:00, 11.79it/s]


MSLE: 0.4397569859057111
Epoch 22:


100%|██████████| 6000/6000 [10:20<00:00,  9.68it/s]
100%|██████████| 1500/1500 [02:26<00:00, 10.22it/s]


MSLE: 0.988629415939118
Increment early stopper to 1 because val loss (0.988629415939118) is greater than threshold (0.4397569859057111)
Epoch 23:


100%|██████████| 6000/6000 [10:48<00:00,  9.25it/s]
100%|██████████| 1500/1500 [02:18<00:00, 10.82it/s]


MSLE: 0.41272735665277
Epoch 24:


100%|██████████| 6000/6000 [10:48<00:00,  9.26it/s]
100%|██████████| 1500/1500 [02:15<00:00, 11.10it/s]


MSLE: 0.6862775601314928
Increment early stopper to 1 because val loss (0.6862775601314928) is greater than threshold (0.41272735665277)
Epoch 25:


100%|██████████| 6000/6000 [10:46<00:00,  9.28it/s]
100%|██████████| 1500/1500 [02:11<00:00, 11.40it/s]


MSLE: 0.4813840497839673
Increment early stopper to 2 because val loss (0.4813840497839673) is greater than threshold (0.41272735665277)
Epoch 26:


100%|██████████| 6000/6000 [10:35<00:00,  9.45it/s]
100%|██████████| 1500/1500 [02:21<00:00, 10.60it/s]


MSLE: 0.48261653334266247
Increment early stopper to 3 because val loss (0.48261653334266247) is greater than threshold (0.41272735665277)
Epoch 27:


100%|██████████| 6000/6000 [10:28<00:00,  9.55it/s]
100%|██████████| 1500/1500 [02:12<00:00, 11.28it/s]


MSLE: 0.6877212772797645
Increment early stopper to 4 because val loss (0.6877212772797645) is greater than threshold (0.41272735665277)
Epoch 28:


100%|██████████| 6000/6000 [10:24<00:00,  9.61it/s]
100%|██████████| 1500/1500 [02:15<00:00, 11.09it/s]


MSLE: 1.0217762938995694
Increment early stopper to 5 because val loss (1.0217762938995694) is greater than threshold (0.41272735665277)
Epoch 29:


100%|██████████| 6000/6000 [10:24<00:00,  9.61it/s]
100%|██████████| 1500/1500 [02:12<00:00, 11.30it/s]


MSLE: 0.4418041083882186
Increment early stopper to 6 because val loss (0.4418041083882186) is greater than threshold (0.41272735665277)
Epoch 30:


100%|██████████| 6000/6000 [10:33<00:00,  9.47it/s]
100%|██████████| 1500/1500 [02:13<00:00, 11.27it/s]


MSLE: 0.4540058531352947
Increment early stopper to 7 because val loss (0.4540058531352947) is greater than threshold (0.41272735665277)
Epoch 31:


100%|██████████| 6000/6000 [10:58<00:00,  9.11it/s]
100%|██████████| 1500/1500 [02:17<00:00, 10.93it/s]


MSLE: 0.46776102368725325
Increment early stopper to 8 because val loss (0.46776102368725325) is greater than threshold (0.41272735665277)
Epoch 32:


100%|██████████| 6000/6000 [10:32<00:00,  9.48it/s]
100%|██████████| 1500/1500 [02:13<00:00, 11.24it/s]


MSLE: 1.5128867138162838
Increment early stopper to 9 because val loss (1.5128867138162838) is greater than threshold (0.41272735665277)
Epoch 33:


100%|██████████| 6000/6000 [10:19<00:00,  9.68it/s]
100%|██████████| 1500/1500 [02:14<00:00, 11.16it/s]

MSLE: 0.48288078556785924
Increment early stopper to 10 because val loss (0.48288078556785924) is greater than threshold (0.41272735665277)
Model has overfit, early stopping...





In [27]:
# add train statistics to file
with open("stats_train.json", "r") as f:
  prev_stats = json.load(f)

prev_stats.update({"model1a": stats})

with open("stats_train.json", "w") as f:
  json.dump(prev_stats, f)

### num_boost_round=4

In [20]:
random.seed(RAND_SEED)
train_set = random.choices(train_files, k=int(len(train_files)*0.8))
val_set = list(set(train_files)-set(train_set))

In [21]:
n_epochs = 100
epoch_set_size = 750
train_val_ratio = (8, 2)
assert sum(train_val_ratio) == 10

In [22]:
model, stats = model_fns.train(train_set, 
                               val_set, 
                               n_epochs, 
                               relevant_features, 
                               epoch_set_size, 
                               train_val_ratio, 
                               num_boost_round=4,
                               load_checkpoints="saved_models/model1b_ep*.model", 
                               main_proj_dir=main_proj_dir)

Epoch 1:


100%|██████████| 1500/1500 [04:06<00:00,  6.08it/s]


MSLE: 1006.8905977063852
Epoch 2:


100%|██████████| 1500/1500 [02:13<00:00, 11.21it/s]


MSLE: 0.6347022857193113
Epoch 3:


100%|██████████| 1500/1500 [02:27<00:00, 10.18it/s]


MSLE: 0.5223484835831803
Epoch 4:


100%|██████████| 1500/1500 [02:15<00:00, 11.09it/s]


MSLE: 0.3923977446382243
Epoch 5:


100%|██████████| 1500/1500 [02:37<00:00,  9.50it/s]


MSLE: 0.41261998217853074
Increment early stopper to 1 because val loss (0.41261998217853074) is greater than threshold (0.3923977446382243)
Epoch 6:


100%|██████████| 1500/1500 [02:30<00:00,  9.97it/s]


MSLE: 0.47212989398880784
Increment early stopper to 2 because val loss (0.47212989398880784) is greater than threshold (0.3923977446382243)
Epoch 7:


100%|██████████| 1500/1500 [02:23<00:00, 10.42it/s]


MSLE: 0.39788658731430715
Increment early stopper to 3 because val loss (0.39788658731430715) is greater than threshold (0.3923977446382243)
Epoch 8:


100%|██████████| 1500/1500 [02:45<00:00,  9.06it/s]


MSLE: 0.4976353181735433
Increment early stopper to 4 because val loss (0.4976353181735433) is greater than threshold (0.3923977446382243)
Epoch 9:


100%|██████████| 1500/1500 [02:33<00:00,  9.80it/s]


MSLE: 0.36147869018462603
Epoch 10:


100%|██████████| 1500/1500 [02:14<00:00, 11.18it/s]


MSLE: 0.477808961064381
Increment early stopper to 1 because val loss (0.477808961064381) is greater than threshold (0.36147869018462603)
Epoch 11:


100%|██████████| 6000/6000 [11:01<00:00,  9.07it/s]
100%|██████████| 1500/1500 [02:02<00:00, 12.29it/s]


MSLE: 0.48721568945181903
Increment early stopper to 2 because val loss (0.48721568945181903) is greater than threshold (0.36147869018462603)
Epoch 12:


100%|██████████| 6000/6000 [11:13<00:00,  8.91it/s]
100%|██████████| 1500/1500 [02:15<00:00, 11.11it/s]


MSLE: 0.6261396172606555
Increment early stopper to 3 because val loss (0.6261396172606555) is greater than threshold (0.36147869018462603)
Epoch 13:


100%|██████████| 6000/6000 [09:08<00:00, 10.94it/s]
100%|██████████| 1500/1500 [01:55<00:00, 13.02it/s]


MSLE: 0.5337274791638602
Increment early stopper to 4 because val loss (0.5337274791638602) is greater than threshold (0.36147869018462603)
Epoch 14:


100%|██████████| 6000/6000 [08:44<00:00, 11.45it/s]
100%|██████████| 1500/1500 [01:52<00:00, 13.33it/s]


MSLE: 0.5453417974731395
Increment early stopper to 5 because val loss (0.5453417974731395) is greater than threshold (0.36147869018462603)
Epoch 15:


100%|██████████| 6000/6000 [08:48<00:00, 11.36it/s]
100%|██████████| 1500/1500 [01:49<00:00, 13.73it/s]


MSLE: 0.6108048853722977
Increment early stopper to 6 because val loss (0.6108048853722977) is greater than threshold (0.36147869018462603)
Epoch 16:


100%|██████████| 6000/6000 [08:40<00:00, 11.53it/s]
100%|██████████| 1500/1500 [01:48<00:00, 13.76it/s]


MSLE: 0.6697270125454412
Increment early stopper to 7 because val loss (0.6697270125454412) is greater than threshold (0.36147869018462603)
Epoch 17:


100%|██████████| 6000/6000 [08:45<00:00, 11.41it/s]
100%|██████████| 1500/1500 [01:53<00:00, 13.18it/s]


MSLE: 0.44406025131612403
Increment early stopper to 8 because val loss (0.44406025131612403) is greater than threshold (0.36147869018462603)
Epoch 18:


100%|██████████| 6000/6000 [09:00<00:00, 11.11it/s]
100%|██████████| 1500/1500 [01:54<00:00, 13.09it/s]


MSLE: 0.3924418786464574
Increment early stopper to 9 because val loss (0.3924418786464574) is greater than threshold (0.36147869018462603)
Epoch 19:


100%|██████████| 6000/6000 [09:13<00:00, 10.85it/s]
100%|██████████| 1500/1500 [02:06<00:00, 11.86it/s]

MSLE: 0.6019011178979845
Increment early stopper to 10 because val loss (0.6019011178979845) is greater than threshold (0.36147869018462603)
Model has overfit, early stopping...





In [23]:
with open("stats_train.json", "r") as f:
  prev_stats = json.load(f)

prev_stats.update({"model1b": stats})

with open("stats_train.json", "w") as f:
  json.dump(prev_stats, f)

### num_boost_round=6

In [16]:
random.seed(RAND_SEED)
train_set = random.choices(train_files, k=int(len(train_files)*0.8))
val_set = list(set(train_files)-set(train_set))

In [17]:
n_epochs = 100
epoch_set_size = 750
train_val_ratio = (8, 2)
assert sum(train_val_ratio) == 10

In [18]:
model, stats = model_fns.train(train_set, 
                               val_set, 
                               n_epochs, 
                               relevant_features, 
                               epoch_set_size, 
                               train_val_ratio, 
                               num_boost_round=6,
                               load_checkpoints="saved_models/model1c_ep*.model", 
                               main_proj_dir=main_proj_dir)

Epoch 1:


100%|██████████| 1500/1500 [04:23<00:00,  5.69it/s]


MSLE: 1152.036196596083
Epoch 2:


100%|██████████| 1500/1500 [02:22<00:00, 10.53it/s]


MSLE: 0.608805660288006
Epoch 3:


100%|██████████| 1500/1500 [02:02<00:00, 12.25it/s]


MSLE: 0.5109344015651799
Epoch 4:


100%|██████████| 1500/1500 [01:56<00:00, 12.89it/s]


MSLE: 0.38071248437792404
Epoch 5:


100%|██████████| 1500/1500 [02:13<00:00, 11.25it/s]


MSLE: 0.3854474698573689
Increment early stopper to 1 because val loss (0.3854474698573689) is greater than threshold (0.38071248437792404)
Epoch 6:


100%|██████████| 1500/1500 [01:56<00:00, 12.85it/s]


MSLE: 0.41846743426239436
Increment early stopper to 2 because val loss (0.41846743426239436) is greater than threshold (0.38071248437792404)
Epoch 7:


100%|██████████| 1500/1500 [01:58<00:00, 12.61it/s]


MSLE: 0.370110933914273
Epoch 8:


100%|██████████| 1500/1500 [01:57<00:00, 12.81it/s]


MSLE: 0.451205992202456
Increment early stopper to 1 because val loss (0.451205992202456) is greater than threshold (0.370110933914273)
Epoch 9:


100%|██████████| 1500/1500 [01:51<00:00, 13.45it/s]


MSLE: 0.3827476270175404
Increment early stopper to 2 because val loss (0.3827476270175404) is greater than threshold (0.370110933914273)
Epoch 10:


100%|██████████| 1500/1500 [01:52<00:00, 13.34it/s]


MSLE: 0.4329052968870891
Increment early stopper to 3 because val loss (0.4329052968870891) is greater than threshold (0.370110933914273)
Epoch 11:


100%|██████████| 6000/6000 [10:24<00:00,  9.61it/s]
100%|██████████| 1500/1500 [01:53<00:00, 13.22it/s]


MSLE: 0.4479565669056605
Increment early stopper to 4 because val loss (0.4479565669056605) is greater than threshold (0.370110933914273)
Epoch 12:


100%|██████████| 6000/6000 [10:00<00:00,  9.99it/s]
100%|██████████| 1500/1500 [02:01<00:00, 12.35it/s]


MSLE: 0.5414136193916488
Increment early stopper to 5 because val loss (0.5414136193916488) is greater than threshold (0.370110933914273)
Epoch 13:


100%|██████████| 6000/6000 [11:28<00:00,  8.72it/s]
100%|██████████| 1500/1500 [01:52<00:00, 13.39it/s]


MSLE: 0.6600533782681511
Increment early stopper to 6 because val loss (0.6600533782681511) is greater than threshold (0.370110933914273)
Epoch 14:


100%|██████████| 6000/6000 [10:16<00:00,  9.74it/s]
100%|██████████| 1500/1500 [02:14<00:00, 11.15it/s]


MSLE: 0.7374931465900401
Increment early stopper to 7 because val loss (0.7374931465900401) is greater than threshold (0.370110933914273)
Epoch 15:


100%|██████████| 6000/6000 [12:35<00:00,  7.94it/s]
100%|██████████| 1500/1500 [03:02<00:00,  8.20it/s]


MSLE: 0.5796575661663068
Increment early stopper to 8 because val loss (0.5796575661663068) is greater than threshold (0.370110933914273)
Epoch 16:


100%|██████████| 6000/6000 [12:55<00:00,  7.74it/s]
100%|██████████| 1500/1500 [02:26<00:00, 10.22it/s]


MSLE: 0.6706100796479313
Increment early stopper to 9 because val loss (0.6706100796479313) is greater than threshold (0.370110933914273)
Epoch 17:


100%|██████████| 6000/6000 [11:49<00:00,  8.46it/s]
100%|██████████| 1500/1500 [02:31<00:00,  9.88it/s]

MSLE: 0.4458109659593459
Increment early stopper to 10 because val loss (0.4458109659593459) is greater than threshold (0.370110933914273)
Model has overfit, early stopping...





In [19]:
with open("stats_train.json", "r") as f:
  prev_stats = json.load(f)

prev_stats.update({"model1c": stats})

with open("stats_train.json", "w") as f:
  json.dump(prev_stats, f)

### num_boost_round=8

In [4]:
random.seed(RAND_SEED)
train_set = random.choices(train_files, k=int(len(train_files)*0.8))
val_set = list(set(train_files)-set(train_set))

In [5]:
n_epochs = 100
epoch_set_size = 750
train_val_ratio = (8, 2)
assert sum(train_val_ratio) == 10

In [8]:
model, stats = model_fns.train(train_set, 
                               val_set, 
                               n_epochs, 
                               relevant_features, 
                               epoch_set_size, 
                               train_val_ratio, 
                               num_boost_round=8,
                               load_checkpoints="saved_models/model1d_ep*.model", 
                               main_proj_dir=main_proj_dir)

Epoch 1:


100%|██████████| 1500/1500 [05:14<00:00,  4.77it/s]


MSLE: 984.8527362469513
Epoch 2:


100%|██████████| 1500/1500 [02:05<00:00, 11.93it/s]


MSLE: 0.6285709272378305
Epoch 3:


100%|██████████| 1500/1500 [02:13<00:00, 11.26it/s]


MSLE: 0.4958907688597652
Epoch 4:


100%|██████████| 1500/1500 [01:59<00:00, 12.53it/s]


MSLE: 0.4049398873627227
Epoch 5:


100%|██████████| 1500/1500 [02:08<00:00, 11.71it/s]


MSLE: 0.4119217061545709
Increment early stopper to 1 because val loss (0.4119217061545709) is greater than threshold (0.4049398873627227)
Epoch 6:


100%|██████████| 1500/1500 [02:11<00:00, 11.40it/s]


MSLE: 0.4123145767618634
Increment early stopper to 2 because val loss (0.4123145767618634) is greater than threshold (0.4049398873627227)
Epoch 7:


100%|██████████| 1500/1500 [02:07<00:00, 11.78it/s]


MSLE: 0.3665299852524323
Epoch 8:


100%|██████████| 1500/1500 [02:32<00:00,  9.85it/s]


MSLE: 0.4442816995201123
Increment early stopper to 1 because val loss (0.4442816995201123) is greater than threshold (0.3665299852524323)
Epoch 9:


100%|██████████| 1500/1500 [02:15<00:00, 11.03it/s]


MSLE: 0.4047345292492246
Increment early stopper to 2 because val loss (0.4047345292492246) is greater than threshold (0.3665299852524323)
Epoch 10:


100%|██████████| 1500/1500 [02:12<00:00, 11.33it/s]


MSLE: 0.4250912388780296
Increment early stopper to 3 because val loss (0.4250912388780296) is greater than threshold (0.3665299852524323)
Epoch 11:


100%|██████████| 6000/6000 [10:43<00:00,  9.32it/s]
100%|██████████| 1500/1500 [02:06<00:00, 11.83it/s]


MSLE: 0.48989202362828815
Increment early stopper to 4 because val loss (0.48989202362828815) is greater than threshold (0.3665299852524323)
Epoch 12:


100%|██████████| 6000/6000 [09:40<00:00, 10.34it/s]
100%|██████████| 1500/1500 [01:54<00:00, 13.12it/s]


MSLE: 0.4959524669143739
Increment early stopper to 5 because val loss (0.4959524669143739) is greater than threshold (0.3665299852524323)
Epoch 13:


100%|██████████| 6000/6000 [10:44<00:00,  9.30it/s]
100%|██████████| 1500/1500 [02:25<00:00, 10.34it/s]


MSLE: 0.6572264245594662
Increment early stopper to 6 because val loss (0.6572264245594662) is greater than threshold (0.3665299852524323)
Epoch 14:


100%|██████████| 6000/6000 [11:33<00:00,  8.65it/s]
100%|██████████| 1500/1500 [02:11<00:00, 11.38it/s]


MSLE: 0.8472028378286334
Increment early stopper to 7 because val loss (0.8472028378286334) is greater than threshold (0.3665299852524323)
Epoch 15:


100%|██████████| 6000/6000 [10:55<00:00,  9.15it/s]
100%|██████████| 1500/1500 [02:09<00:00, 11.60it/s]


MSLE: 0.5723245980792938
Increment early stopper to 8 because val loss (0.5723245980792938) is greater than threshold (0.3665299852524323)
Epoch 16:


100%|██████████| 6000/6000 [11:11<00:00,  8.93it/s]
100%|██████████| 1500/1500 [02:07<00:00, 11.76it/s]


MSLE: 0.6682165355466126
Increment early stopper to 9 because val loss (0.6682165355466126) is greater than threshold (0.3665299852524323)
Epoch 17:


100%|██████████| 6000/6000 [10:50<00:00,  9.22it/s]
100%|██████████| 1500/1500 [02:17<00:00, 10.87it/s]

MSLE: 0.467038203140593
Increment early stopper to 10 because val loss (0.467038203140593) is greater than threshold (0.3665299852524323)
Model has overfit, early stopping...





In [9]:
with open("stats_train.json", "r") as f:
  prev_stats = json.load(f)

prev_stats.update({"model1d": stats})

with open("stats_train.json", "w") as f:
  json.dump(prev_stats, f)

### num_boost_round=10

In [6]:
random.seed(RAND_SEED)
train_set = random.choices(train_files, k=int(len(train_files)*0.8))
val_set = list(set(train_files)-set(train_set))

In [8]:
n_epochs = 100
epoch_set_size = 750
train_val_ratio = (8, 2)
assert sum(train_val_ratio) == 10

In [9]:
model, stats = model_fns.train(train_set, 
                               val_set, 
                               n_epochs, 
                               relevant_features, 
                               epoch_set_size, 
                               train_val_ratio, 
                               load_checkpoints="saved_models/model1e_ep*.model", 
                               main_proj_dir=main_proj_dir)

Epoch 1:


100%|██████████| 1500/1500 [05:08<00:00,  4.86it/s]


MSLE: 1154.9968797907886
Epoch 2:


100%|██████████| 1500/1500 [02:59<00:00,  8.35it/s]


MSLE: 0.42264016809142385
Epoch 3:


100%|██████████| 1500/1500 [03:02<00:00,  8.23it/s]


MSLE: 0.36067698150310284
Epoch 4:


100%|██████████| 1500/1500 [02:53<00:00,  8.64it/s]


MSLE: 0.37364724245203956
Increment early stopper to 1 because val loss (0.37364724245203956) is greater than threshold (0.36067698150310284)
Epoch 5:


100%|██████████| 1500/1500 [02:20<00:00, 10.71it/s]


MSLE: 0.48262758418843243
Increment early stopper to 2 because val loss (0.48262758418843243) is greater than threshold (0.36067698150310284)
Epoch 6:


100%|██████████| 1500/1500 [02:26<00:00, 10.22it/s]


MSLE: 0.44321141983478796
Increment early stopper to 3 because val loss (0.44321141983478796) is greater than threshold (0.36067698150310284)
Epoch 7:


100%|██████████| 1500/1500 [02:48<00:00,  8.92it/s]


MSLE: 0.6161716134825336
Increment early stopper to 4 because val loss (0.6161716134825336) is greater than threshold (0.36067698150310284)
Epoch 8:


100%|██████████| 1500/1500 [02:21<00:00, 10.61it/s]


MSLE: 0.4457801710054602
Increment early stopper to 5 because val loss (0.4457801710054602) is greater than threshold (0.36067698150310284)
Epoch 9:


100%|██████████| 1500/1500 [02:31<00:00,  9.92it/s]


MSLE: 0.5865974356758757
Increment early stopper to 6 because val loss (0.5865974356758757) is greater than threshold (0.36067698150310284)
Epoch 10:


100%|██████████| 1500/1500 [02:22<00:00, 10.56it/s]


MSLE: 0.41486634628373736
Increment early stopper to 7 because val loss (0.41486634628373736) is greater than threshold (0.36067698150310284)
Epoch 11:


100%|██████████| 6000/6000 [12:36<00:00,  7.93it/s]
100%|██████████| 1500/1500 [02:08<00:00, 11.69it/s]


MSLE: 0.48789092978262494
Increment early stopper to 8 because val loss (0.48789092978262494) is greater than threshold (0.36067698150310284)
Epoch 12:


100%|██████████| 6000/6000 [11:26<00:00,  8.74it/s]
100%|██████████| 1500/1500 [02:26<00:00, 10.21it/s]


MSLE: 0.4058930206823023
Increment early stopper to 9 because val loss (0.4058930206823023) is greater than threshold (0.36067698150310284)
Epoch 13:


100%|██████████| 6000/6000 [11:53<00:00,  8.41it/s]
100%|██████████| 1500/1500 [02:03<00:00, 12.19it/s]

MSLE: 0.6326669750390674
Increment early stopper to 10 because val loss (0.6326669750390674) is greater than threshold (0.36067698150310284)
Model has overfit, early stopping...





In [30]:
with open("stats_train.json", "r") as f:
  prev_stats = json.load(f)

prev_stats.update({"model1e": stats})

with open("stats_train.json", "w") as f:
  json.dump(prev_stats, f)

## Validation

In [16]:
# based on the train statistics saved, get the epoch which performed the best 
# for each variation
with open("stats_train.json", "r") as f:
  stats = json.load(f)

model_best_ep_map = {
  "model1a": 0,
  "model1b": 0,
  "model1c": 0,
  "model1d": 0,
  "model1e": 0,
}

for model_name in model_best_ep_map.keys():
  min_err = min(stats[model_name])
  min_err_ep = stats[model_name].index(min_err)
  model_best_ep_map[model_name] = min_err_ep+1

model_best_ep_map

{'model1a': 23, 'model1b': 9, 'model1c': 7, 'model1d': 7, 'model1e': 3}

In [17]:
# load list of files in validation set
with open(f"{main_proj_dir}/val_files_801010.json") as json_file:
  val_files = json.load(json_file)
print(f"len(val_files): {len(val_files)}")

len(val_files): 19968


In [18]:
# for each variation, load the best model and run validation on it to get the 
# msle on the validation set
val_stats = {}
for model_name, ep_no in model_best_ep_map.items():
  print(f"{model_name} (epoch {ep_no}):")
  model = model_fns.load_model(f"saved_models/{model_name}_ep{ep_no}.model")
  val_stats[model_name] = model_fns._val_epoch(val_files, model, relevant_features, main_proj_dir=main_proj_dir)
  print(f"val msle: {val_stats[model_name]}")
val_stats

model1a (epoch 23):


100%|██████████| 19968/19968 [39:45<00:00,  8.37it/s]


val msle: 0.4214346885158393
model1b (epoch 9):


100%|██████████| 19968/19968 [36:37<00:00,  9.09it/s]


val msle: 0.3680438705400346
model1c (epoch 7):


100%|██████████| 19968/19968 [37:38<00:00,  8.84it/s]


val msle: 0.3758612621173864
model1d (epoch 7):


100%|██████████| 19968/19968 [33:58<00:00,  9.80it/s]


val msle: 0.3667101573014846
model1e (epoch 3):


100%|██████████| 19968/19968 [33:54<00:00,  9.82it/s]

val msle: 0.35703752802547395





{'model1a': 0.4214346885158393,
 'model1b': 0.3680438705400346,
 'model1c': 0.3758612621173864,
 'model1d': 0.3667101573014846,
 'model1e': 0.35703752802547395}

In [19]:
with open("stats_val.json", "w") as f:
  json.dump(val_stats, f)

# Experiment 2 - Dropout

In [5]:
random.seed(RAND_SEED)
train_set = random.choices(train_files, k=int(len(train_files)*0.8))
val_set = list(set(train_files)-set(train_set))

In [6]:
n_epochs = 100
epoch_set_size = 750
train_val_ratio = (8, 2)
assert sum(train_val_ratio) == 10

In [7]:
model, stats = model_fns.train(train_set, 
                               val_set, 
                               n_epochs, 
                               relevant_features, 
                               epoch_set_size, 
                               train_val_ratio, 
                               num_boost_round=2,
                               booster="dart",
                               load_checkpoints="saved_models/model2_ep*.model", 
                               main_proj_dir=main_proj_dir)

Epoch 1:


100%|██████████| 1500/1500 [27:13<00:00,  1.09s/it]


MSLE: 1027.0720225434445
Epoch 2:


100%|██████████| 1500/1500 [02:03<00:00, 12.19it/s]


MSLE: 0.5558800983220015
Epoch 3:


100%|██████████| 1500/1500 [02:03<00:00, 12.14it/s]


MSLE: 0.4812826105762874
Epoch 4:


100%|██████████| 1500/1500 [02:10<00:00, 11.46it/s]


MSLE: 0.5143313072255982
Increment early stopper to 1 because val loss (0.5143313072255982) is greater than threshold (0.4812826105762874)
Epoch 5:


100%|██████████| 1500/1500 [02:16<00:00, 10.98it/s]


MSLE: 0.5723851561917868
Increment early stopper to 2 because val loss (0.5723851561917868) is greater than threshold (0.4812826105762874)
Epoch 6:


100%|██████████| 1500/1500 [02:23<00:00, 10.44it/s]


MSLE: 0.5114301485874156
Increment early stopper to 3 because val loss (0.5114301485874156) is greater than threshold (0.4812826105762874)
Epoch 7:


100%|██████████| 1500/1500 [02:08<00:00, 11.63it/s]


MSLE: 0.6149326483985148
Increment early stopper to 4 because val loss (0.6149326483985148) is greater than threshold (0.4812826105762874)
Epoch 8:


100%|██████████| 1500/1500 [02:21<00:00, 10.59it/s]


MSLE: 0.5699008622045235
Increment early stopper to 5 because val loss (0.5699008622045235) is greater than threshold (0.4812826105762874)
Epoch 9:


100%|██████████| 1500/1500 [02:17<00:00, 10.92it/s]


MSLE: 0.6301649719552428
Increment early stopper to 6 because val loss (0.6301649719552428) is greater than threshold (0.4812826105762874)
Epoch 10:


100%|██████████| 1500/1500 [02:10<00:00, 11.46it/s]


MSLE: 0.5859669848582211
Increment early stopper to 7 because val loss (0.5859669848582211) is greater than threshold (0.4812826105762874)
Epoch 11:


100%|██████████| 6000/6000 [18:58<00:00,  5.27it/s]
100%|██████████| 1500/1500 [02:33<00:00,  9.77it/s]


MSLE: 0.5727667929445249
Increment early stopper to 8 because val loss (0.5727667929445249) is greater than threshold (0.4812826105762874)
Epoch 12:


100%|██████████| 6000/6000 [19:02<00:00,  5.25it/s]
100%|██████████| 1500/1500 [01:52<00:00, 13.39it/s]


MSLE: 0.6569970785287793
Increment early stopper to 9 because val loss (0.6569970785287793) is greater than threshold (0.4812826105762874)
Epoch 13:


100%|██████████| 6000/6000 [19:55<00:00,  5.02it/s]
100%|██████████| 1500/1500 [01:50<00:00, 13.58it/s]

MSLE: 0.6012796031082666
Increment early stopper to 10 because val loss (0.6012796031082666) is greater than threshold (0.4812826105762874)
Model has overfit, early stopping...





In [8]:
with open("stats_train.json", "r") as f:
  prev_stats = json.load(f)

prev_stats.update({"model2": stats})

with open("stats_train.json", "w") as f:
  json.dump(prev_stats, f)

## Validation

In [20]:
with open("stats_train.json", "r") as f:
  stats = json.load(f)

model_best_ep_map = {
  "model2": 0,
}

for model_name in model_best_ep_map.keys():
  min_err = min(stats[model_name])
  min_err_ep = stats[model_name].index(min_err)
  model_best_ep_map[model_name] = min_err_ep+1

model_best_ep_map

{'model2': 3}

In [21]:
with open(f"{main_proj_dir}/val_files_801010.json") as json_file:
  val_files = json.load(json_file)
print(f"len(val_files): {len(val_files)}")

len(val_files): 19968


In [22]:
val_stats = {}
for model_name, ep_no in model_best_ep_map.items():
  print(f"{model_name} (epoch {ep_no}):")
  model = model_fns.load_model(f"saved_models/{model_name}_ep{ep_no}.model")
  val_stats[model_name] = model_fns._val_epoch(val_files, model, relevant_features, main_proj_dir=main_proj_dir)
  print(f"val msle: {val_stats[model_name]}")
val_stats

model2 (epoch 3):


100%|██████████| 19968/19968 [34:38<00:00,  9.61it/s]

val msle: 0.4922658251014944





{'model2': 0.4922658251014944}

In [23]:
with open("stats_val.json") as f:
  prev_stats = json.load(f)

prev_stats.update(val_stats)

with open("stats_val.json", "w") as f:
  json.dump(prev_stats, f)

# Performance on Test Set

In [24]:
# load list of files in test set
with open(f"{main_proj_dir}/test_files_801010.json") as json_file:
  test_files = json.load(json_file)
print(f"len(test_files): {len(test_files)}")

len(test_files): 19967


In [25]:
# check msle of best performing model on test set
test_stats = {}
model_name, ep_no = "model1e", 3
print(f"{model_name} (epoch {ep_no}):")
model = model_fns.load_model(f"saved_models/{model_name}_ep{ep_no}.model")
test_stats[model_name] = model_fns._val_epoch(test_files, model, relevant_features, main_proj_dir=main_proj_dir)
print(f"test msle: {test_stats[model_name]}")

model1e (epoch 3):


100%|██████████| 19967/19967 [34:26<00:00,  9.66it/s]

test msle: 0.3563810491124486



