In [None]:
import wandb
from data_openml import data_split


In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
# W&B project path and the summary metric used to rank runs in this notebook.
project_name = "middelman/saint_rossmann_mse"
metric_name = "orig_valid_rmse"

# Public W&B API client, then the collection of runs logged to the project.
api = wandb.Api()
saint_runs = api.runs(project_name)

In [None]:
def _saint_run_metric(run_obj):
    """Sort key: the run's `metric_name` summary value, +inf when it is absent."""
    return run_obj.summary.get(metric_name, float("inf"))


# Ascending order: the lowest metric comes first, runs without the metric last.
sorted_saint_runs = sorted(saint_runs, key=_saint_run_metric)

In [None]:
# Id of the first run in the ascending ranking, i.e. the lowest `metric_name`.
sorted_saint_runs[0].id

In [None]:
import wandb
# Open a W&B run; in this cell it only serves as the handle for use_artifact().
run = wandb.init()
# Reference version v0 of the logged model artifact by its full path.
artifact = run.use_artifact('middelman/saint_rossmann_mse/SAINT_model_best_0wmepo8g:v0', type='model')
# Fetch the artifact files; the result is kept in `artifact_dir` and displayed below.
# NOTE(review): no run.finish() appears in the visible cells, so this run stays
# open for the rest of the session - confirm that is intended.
artifact_dir = artifact.download()

In [None]:
# Repeats the download call from the previous cell and rebinds `artifact_dir`.
artifact_dir = artifact.download()

In [None]:
# Show the value returned by artifact.download().
artifact_dir

In [None]:
# Get the best run from a sweep: the run with the lowest `metric_name` summary value.
import numbers

sweep_id = "middelman/saint_rossmann_mse/ywy725ic"

sweep = api.sweep(sweep_id)


def _metric_or_inf(candidate):
    """Return the run's `metric_name` summary value, or +inf if it is unusable.

    Missing, non-numeric and NaN values all map to +inf so that such runs can
    neither break the comparison nor be selected ahead of a run that has a
    real metric value.
    """
    value = candidate.summary.get(metric_name)
    usable = isinstance(value, numbers.Real) and value == value  # NaN != NaN
    return value if usable else float("inf")


# min() returns the first run with the smallest key, which is the same run
# sorted(...)[0] would pick, without sorting the whole list.
best_run = min(sweep.runs, key=_metric_or_inf)

# .get() prints None instead of raising KeyError when the metric is absent.
print(best_run.summary.get(metric_name))
print(best_run.id)
print(best_run.name)


In [None]:
# Pretty-print the configuration recorded for the best run selected from the sweep.
import pprint
# Alternative (disabled): use the best run across the whole project instead.
# best_run = sorted_saint_runs[0]
pprint.pprint(best_run.config)

# Get the data

In [None]:
import pandas as pd
import numpy as np

print("Reading the data...")
train = pd.read_parquet(
    r"/home/coenraadmiddel/Documents/RossmannStoreSales/TabNet/tabnet/train_processed.parquet"
)
print("Read:", train.shape)

# Columns kept for modelling: the features followed by the target ("Sales").
selected_columns = [
    "Store",
    "DayOfWeek",
    "Promo",
    "StateHoliday",
    "SchoolHoliday",
    "StoreType",
    "Assortment",
    "CompetitionDistance",
    "Promo2SinceWeek",
    "Promo2SinceYear",
    "Year",
    "Month",
    "Day",
    "WeekOfYear",
    "CompetitionOpen",
    "PromoOpen",
    "IsPromoMonth",
    "Sales",
]
# "Set" (the train/valid/test label) is optional in the file. Selecting it
# unconditionally would raise KeyError before the fallback below could run.
if "Set" in train.columns:
    selected_columns.append("Set")

# reset_index makes index labels equal to row positions. The splits below take
# `.index` labels and use them positionally (`.iloc`, numpy indexing, and
# presumably inside data_split), which is only valid for a default RangeIndex.
train = train[selected_columns].reset_index(drop=True)

if "Set" not in train.columns:
    # Fallback: random 80/10/10 split, seeded so a re-run gives the same split.
    split_rng = np.random.default_rng(42)
    train["Set"] = split_rng.choice(
        ["train", "valid", "test"], p=[0.8, 0.1, 0.1], size=(train.shape[0],)
    )

train_indices = train[train.Set == "train"].index
valid_indices = train[train.Set == "valid"].index
test_indices = train[train.Set == "test"].index


# Features handled as categorical. The commented-out date parts are left out
# of this list and are therefore handled as continuous features downstream.
categorical_columns = [
    "Store",
    "DayOfWeek",
    "Promo",
    "StateHoliday",
    "SchoolHoliday",
    "StoreType",
    "Assortment",
    # 'Year',
    # 'Month',
    # 'Day',
    # 'WeekOfYear',
    "IsPromoMonth",
]


# split x and y; the target is modelled as log1p(Sales) and kept as an (n, 1) array
X_all, y_all = train.drop(columns=["Sales", "Set"]), np.log1p(train[["Sales"]].values)

# 1 where a value is present, 0 where it was missing
temp = X_all.fillna("MissingValue")
nan_mask = temp.ne("MissingValue").astype(int)

X_train = X_all.iloc[train_indices]
X_test = X_all.iloc[test_indices]
X_valid = X_all.iloc[valid_indices]

y_train = y_all[train_indices]
y_test = y_all[test_indices]
y_valid = y_all[valid_indices]

In [None]:
# SAINT-specific: data_split packages each subset (train, valid, test - in that
# order) into an X dict and a y dict, using the missing-value mask built above.
(X_train_d, y_train_d), (X_valid_d, y_valid_d), (X_test_d, y_test_d) = (
    data_split(X_all, y_all, nan_mask, subset_indices)
    for subset_indices in (train_indices, valid_indices, test_indices)
)

# Expose the 'data' entry of every split under the plain X_*/y_* names.
X_train, X_test, X_valid = (
    split_dict['data'] for split_dict in (X_train_d, X_test_d, X_valid_d)
)
y_train, y_test, y_valid = (
    split_dict['data'] for split_dict in (y_train_d, y_test_d, y_valid_d)
)

In [None]:
# Cast the categorical features so that `.cat.categories` is available below.
train[categorical_columns] = train[categorical_columns].astype('category')

# Column position and cardinality of every categorical feature present in `train`.
present_categoricals = [c for c in categorical_columns if c in train]
cat_idxs = [train.columns.get_loc(c) for c in present_categoricals]
cat_dims = [len(train[c].cat.categories) for c in present_categoricals]
# Every remaining feature position is treated as continuous.
cont_idxs = [i for i in range(X_train.shape[1]) if i not in cat_idxs]

# Per-column mean and std of the continuous training features, stacked as
# [mean, std]; the same statistics are passed to every dataset built later.
continuous_train = np.array(X_train_d['data'][:, cont_idxs], dtype=np.float32)
train_mean, train_std = continuous_train.mean(0), continuous_train.std(0)
continuous_mean_std = np.array([train_mean, train_std]).astype(np.float32)



In [None]:

#Load the SAINT model
from models import SAINT
import torch
from torch import nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using {}".format(device))


# Changes to the model inputs. NB: prepend 1 for the CLS token, this is later
# used to generate embeddings. Guarded so that re-running this cell does not
# prepend a second 1: cat_dims and cat_idxs have equal length until the CLS
# entry has been added.
if len(cat_dims) == len(cat_idxs):
    cat_dims = np.append(np.array([1]), np.array(cat_dims)).astype(int)


# Rebuild the architecture from the hyperparameters stored in the best run's
# config so that the saved weights fit the module shapes.
model = SAINT(
    categories = cat_dims,
    num_continuous = len(cont_idxs),
    dim = best_run.config['embedding_size'],
    dim_out = 1,
    depth = best_run.config['transformer_depth'],
    heads = best_run.config['attention_heads'],
    attn_dropout = best_run.config['attention_dropout'],
    ff_dropout = best_run.config['ff_dropout'],
    mlp_hidden_mults = (4, 2),
    cont_embeddings = best_run.config['cont_embeddings'],
    attentiontype = best_run.config['attentiontype'],
    final_mlp_style = best_run.config['final_mlp_style'],
    y_dim = 1,
)

criterion = nn.MSELoss().to(device)
model.to(device)

# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
model.load_state_dict(
    torch.load(
        '/home/coenraadmiddel/Documents/RossmannStoreSales/SAINT/saint/bestmodels/regression/rossmann_local/SAINT_model_best_0wmepo8g.pt',
        map_location=device,
    )
)
# Switch to inference mode (disables dropout).
model.eval()


In [None]:
from data_openml import DataSetCatCon
from torch.utils.data import DataLoader
from utils import count_parameters, classification_scores, mean_sq_error


# Validation split wrapped in the SAINT dataset class, normalised with the
# training-set mean/std of the continuous features.
valid_ds = DataSetCatCon(
    X_valid_d,
    y_valid_d,
    cat_idxs,
    task='regression',
    continuous_mean_std=continuous_mean_std,
)
# One sample per batch, in the original row order (the batch size from the
# run config is deliberately not used here).
validloader = DataLoader(valid_ds, shuffle=False, batch_size=1)

In [None]:
import random
# Seeds Python's `random` module only; no NumPy or torch seed is set in this cell.
random.seed(42)

from utils import mean_sq_error_per_sample

# Evaluate the model on the validation loader built with batch_size=1.
# NOTE(review): vision_dset is hard-coded to True here, while a later cell reads
# it from best_run.config['vision_dset'] - confirm the two are meant to agree.
losses = mean_sq_error_per_sample(model, validloader, device, vision_dset=True)
   

# For Kaggle Submission

In [43]:
# Load the pre-processed test data used for the Kaggle submission.
test = pd.read_parquet(r'/home/coenraadmiddel/Documents/RossmannStoreSales/TabNet/tabnet/test_processed.parquet')

In [44]:
# Summary statistics for every column of the test frame.
test.describe(include="all")

Unnamed: 0,Store,DayOfWeek,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,CompetitionOpen,PromoOpen,IsPromoMonth
count,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0,41088.0
mean,427.5,2.979167,0.395833,0.004381,0.443487,1.252336,1.001168,5076.693925,14.182243,1168.078271,2015.0,8.354167,13.520833,34.645833,9006.475662,10168.176645,0.127434
std,247.108754,2.015481,0.489035,0.066044,0.496802,1.397401,0.994741,7221.22185,16.177932,992.765386,0.0,0.478266,8.44845,2.015481,11643.213793,11916.19762,0.333462
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0,8.0,1.0,31.0,0.0,2.0,0.0
25%,213.75,1.0,0.0,0.0,0.0,0.0,0.0,710.0,0.0,0.0,2015.0,8.0,6.75,33.0,47.0,34.25,0.0
50%,427.5,3.0,0.0,0.0,0.0,0.0,1.0,2410.0,9.0,2010.0,2015.0,8.0,12.5,35.0,119.0,66.25,0.0
75%,641.25,5.0,1.0,0.0,1.0,3.0,2.0,6435.0,31.0,2012.0,2015.0,9.0,19.25,36.0,24188.0,24188.5,0.0
max,855.0,6.0,1.0,1.0,1.0,3.0,2.0,75860.0,49.0,2015.0,2015.0,9.0,31.0,38.0,24189.0,24189.5,1.0


In [45]:
# This is SAINT specific: data_split needs a mask of non-missing cells
# (1 = value present, 0 = missing).
# NOTE(review): this rebinds `temp`, `nan_mask` and `y_test_d`, which earlier
# cells defined for the training frame / hold-out split. `y_test_d` is kept as
# the target name here because the next cell reads it.
temp = test.fillna("MissingValue")
nan_mask = temp.ne("MissingValue").astype(int)
# The submission set has no labels, so use zero placeholders of shape (n_rows, 1).
testy = np.zeros((test.shape[0],1))

X_testtest_d, y_test_d = data_split(test, testy, nan_mask, test.index)

# Read the features from the submission split that was just built
# (X_testtest_d), not from the hold-out split (X_test_d).
X_testtest = X_testtest_d['data']

y_testtest = y_test_d['data']


In [46]:
# Dataset for the submission rows; the targets in y_test_d are the all-zero
# placeholders created from `testy`, which has one row per row of `test`.
testtest_ds = DataSetCatCon(
    X_testtest_d,
    y_test_d,
    cat_idxs,
    task='regression',
    continuous_mean_std=continuous_mean_std,
)
# Keep the row order so that predictions line up with the rows of `test`.
testtestloader = DataLoader(testtest_ds, shuffle=False, batch_size=128)

In [47]:
from utils import predict

# Rebuild the validation loader with a batch size of 128 (an earlier cell used
# batch_size=1 for per-sample losses).
validloader = DataLoader(valid_ds, batch_size=128, shuffle=False)

# Disabled: predictions on the validation set.
# preds = predict(model, validloader, device)

In [48]:
# Run the model over the submission loader and collect its predictions.
preds = predict(model, testtestloader, device)

In [50]:
# Put the predictions into a one-column frame named 'preds'.
# NOTE(review): presumably `preds` has one value per test row - confirm its shape.
df_preds = pd.DataFrame(preds, columns=['preds'])

In [53]:
# Cast the predictions to integers, rounding to the nearest whole number first
# (a bare astype(int) truncates toward zero and biases the values downward).
# NOTE(review): the training target was np.log1p(Sales); confirm that predict()
# already returns values on the original sales scale before submitting.
df_preds['preds'] = df_preds['preds'].round().astype(int)

In [55]:
# Build the submission frame: Id is the 1-based row position of each prediction.
# NOTE(review): this assumes the rows of `test` are ordered by Kaggle Id - verify.
df_submission = pd.DataFrame({'Id': df_preds.index+1, 'Sales': df_preds['preds']})


In [57]:
# Write the submission file without the frame index (columns: Id, Sales).
df_submission.to_csv(r'/home/coenraadmiddel/Documents/RossmannStoreSales/SAINT/saint/submission.csv', index=False)

# Check the randomness of the data split: inspect the indices of the validation set

In [None]:
# Index labels of the rows whose Set value is "valid".
valid_indices

In [None]:

# The Kaggle-submission cells earlier in the notebook rebind `nan_mask` and
# `y_test_d` to the unlabeled submission frame. Rebuild the hold-out test split
# here so that X_test_d and y_test_d come from the same rows of the training data.
holdout_nan_mask = X_all.fillna("MissingValue").ne("MissingValue").astype(int)
X_test_d, y_test_d = data_split(X_all, y_all, holdout_nan_mask, test_indices)

# All three datasets are normalised with the training-set mean/std.
train_ds = DataSetCatCon(X_train_d, y_train_d, cat_idxs, task='regression', continuous_mean_std=continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=best_run.config['batchsize'], shuffle=True)

valid_ds = DataSetCatCon(X_valid_d, y_valid_d, cat_idxs, task='regression', continuous_mean_std=continuous_mean_std)
validloader = DataLoader(valid_ds, batch_size=best_run.config['batchsize'], shuffle=False)

test_ds = DataSetCatCon(X_test_d, y_test_d, cat_idxs, task='regression', continuous_mean_std=continuous_mean_std)
testloader = DataLoader(test_ds, batch_size=best_run.config['batchsize'], shuffle=False)

In [None]:
# Read the vision_dset flag from the best run's config and pass it to the metric helper.
vision_dset = best_run.config['vision_dset']
# Evaluation only, so gradient tracking is switched off.
with torch.no_grad():
    valid_rmse, orig_valid_rmse, valid_losses = mean_sq_error(model, validloader, device, vision_dset, batch_wise=True)
    test_rmse, orig_test_rmse, test_losses= mean_sq_error(model, testloader, device, vision_dset, batch_wise=True)
    # NOTE(review): the third value for the train loader is bound to `orig_losses`,
    # unlike `valid_losses` / `test_losses` - presumably `train_losses` was meant.
    train_rmse, orig_train_rmse, orig_losses = mean_sq_error(model, trainloader, device, vision_dset, batch_wise=True)
    

In [None]:
# Number of loss entries returned for the validation loader.
# NOTE(review): presumably one entry per batch because batch_wise=True - confirm in utils.
len(valid_losses)

In [None]:
# Put the validation losses into a one-column frame; the mean is taken in the next cell.

df_losses = pd.DataFrame({'valid_losses':valid_losses})



In [None]:
# Mean of the collected validation losses.
df_losses['valid_losses'].mean()

In [None]:
# One report line per split, printed in the order valid, test, train. Each
# line shows the two RMSE values returned by mean_sq_error for that split.
rmse_report = (
    ('VALID RMSE: %.3f, ORIG VALID RMSE: %.3f', (valid_rmse, orig_valid_rmse)),
    ('TEST RMSE: %.3f, ORIG TEST RMSE: %.3f', (test_rmse, orig_test_rmse)),
    ('TRAIN RMSE: %.3f, ORIG TRAIN RMSE: %.3f', (train_rmse, orig_train_rmse)),
)
for report_template, report_values in rmse_report:
    print(report_template % report_values)

In [None]:
# Save the validation losses as a parquet file with a single 'losses' column.
df = pd.DataFrame({'losses':valid_losses})

df.to_parquet('/home/coenraadmiddel/Documents/RossmannStoreSales/SAINT/saint/saint_losses.parquet')