## setup

This workspace needs to be running on a T4, and we need to have a nightly copy of PyTorch 1.6.0 (still forthcoming) installed. Following the instructions on the PyTorch website...

In [1]:
!nvidia-smi

Wed May 27 19:13:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 440.82       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [6]:
!yes | pip uninstall torch torchvision

Found existing installation: torch 1.4.0
Uninstalling torch-1.4.0:
  Would remove:
    /usr/local/bin/convert-caffe2-to-onnx
    /usr/local/bin/convert-onnx-to-caffe2
    /usr/local/lib/python3.7/dist-packages/caffe2/*
    /usr/local/lib/python3.7/dist-packages/torch-1.4.0.dist-info/*
    /usr/local/lib/python3.7/dist-packages/torch/*
Proceed (y/n)?   Successfully uninstalled torch-1.4.0
Found existing installation: torchvision 0.5.0
Uninstalling torchvision-0.5.0:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/torchvision-0.5.0.dist-info/*
    /usr/local/lib/python3.7/dist-packages/torchvision/*
Proceed (y/n)?   Successfully uninstalled torchvision-0.5.0
yes: standard output: Broken pipe


Following instructions on `https://pytorch.org/` to get the nightly.

In [7]:
!pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html

Looking in links: https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html
Collecting torch
  Downloading https://download.pytorch.org/whl/nightly/cu102/torch-1.6.0.dev20200527-cp37-cp37m-linux_x86_64.whl (812.8 MB)
[K     |████████████████████████████████| 812.8 MB 6.2 kB/s eta 0:00:0112     |███████████████████▏            | 488.0 MB 4.7 MB/s eta 0:01:09     |██████████████████████████▏     | 665.3 MB 11.5 MB/s eta 0:00:13
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/nightly/cu102/torchvision-0.7.0.dev20200527-cp37-cp37m-linux_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 645 kB/s eta 0:00:01
Installing collected packages: torch, torchvision
Successfully installed torch-1.6.0.dev20200527 torchvision-0.7.0.dev20200527


In [11]:
!pip list | grep torch

torch                              1.6.0.dev20200527
torchvision                        0.7.0.dev20200527


In [1]:
import torch

In [2]:
torch.__version__

'1.6.0.dev20200527'

## old model

In [3]:
import pandas as pd
from pathlib import Path

path = Path('rossmann')
train_df = pd.read_pickle('/mnt/rossman-fastai-sample/train_clean').drop(['index', 'Date'], axis='columns')
test_df = pd.read_pickle('/mnt/rossman-fastai-sample/test_clean')

In [4]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
import numpy as np


cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw'
]
cont_vars = [
    'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
    'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday'
]
target_var= 'Sales'


class ColumnFilter:
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return X.loc[:, cat_vars + cont_vars]
        

class GroupLabelEncoder:
    def __init__(self):
        self.labeller = LabelEncoder()
    
    def fit(self, X, y):
        self.encoders = {col: None for col in X.columns if col in cat_vars}
        for col in self.encoders:
            self.encoders[col] = LabelEncoder().fit(
                X[col].fillna(value='N/A').values
            )
        return self
    
    def transform(self, X):
        X_out = []
        categorical_part = np.hstack([
            self.encoders[col].transform(X[col].fillna(value='N/A').values)[:, np.newaxis]
            for col in cat_vars
        ])
        return pd.DataFrame(categorical_part, columns=cat_vars)


class GroupNullImputer:
    def fit(self, X, y):
        return self
        
    def transform(self, X):
        return X.loc[:, cont_vars].fillna(0)


class Preprocessor:
    def __init__(self):
        self.cf = ColumnFilter()
        self.gne = GroupNullImputer()
        
    def fit(self, X, y=None):
        self.gle = GroupLabelEncoder().fit(X, y=None)
        return self
    
    def transform(self, X):
        X_out = self.cf.transform(X)
        X_out = np.hstack((self.gle.transform(X_out).values, self.gne.transform(X_out).values))
        X_out = pd.DataFrame(X_out, columns=cat_vars + cont_vars)
        return X_out


X_train_sample = Preprocessor().fit(train_df).transform(train_df)
y_train_sample = train_df[target_var]

In [22]:
import torch
from torch import nn
import torch.utils.data
# ^ https://discuss.pytorch.org/t/attributeerror-module-torch-utils-has-no-attribute-data/1666


class FeedforwardTabularModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.batch_size = 512
        self.base_lr, self.max_lr = 0.001, 0.003
        self.n_epochs = 20
        self.cat_vars_embedding_vector_lengths = [
            (1115, 80), (7, 4), (3, 3), (12, 6), (31, 10), (2, 2), (25, 10), (26, 10), (4, 3),
            (3, 3), (4, 3), (23, 9), (8, 4), (12, 6), (52, 15), (22, 9), (6, 4), (6, 4), (3, 3),
            (3, 3), (8, 4), (8, 4)
        ]
        self.loss_fn = torch.nn.MSELoss()
        self.score_fn = torch.nn.MSELoss()
        
        # Layer 1: embeddings.
        self.embeddings = []
        for i, (in_size, out_size) in enumerate(self.cat_vars_embedding_vector_lengths):
            emb = nn.Embedding(in_size, out_size)
            self.embeddings.append(emb)
            setattr(self, f'emb_{i}', emb)

        # Layer 1: dropout.
        self.embedding_dropout = nn.Dropout(0.04)
        
        # Layer 1: batch normalization (of the continuous variables).
        self.cont_batch_norm = nn.BatchNorm1d(16, eps=1e-05, momentum=0.1)
        
        # Layers 2 through 9: sequential feedforward model.
        self.seq_model = nn.Sequential(*[
            nn.Linear(in_features=215, out_features=1000, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(1000, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.001),
            nn.Linear(in_features=1000, out_features=500, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(500, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.01),
            nn.Linear(in_features=500, out_features=1, bias=True)
        ])
    
    
    def forward(self, x):
        # Layer 1: embeddings.
        inp_offset = 0
        embedding_subvectors = []
        for emb in self.embeddings:
            index = torch.tensor(inp_offset, dtype=torch.int64).cuda()
            inp = torch.index_select(x, dim=1, index=index).long().cuda()
            out = emb(inp)
            out = out.view(out.shape[2], out.shape[0], 1).squeeze()
            embedding_subvectors.append(out)
            inp_offset += 1
        out_cat = torch.cat(embedding_subvectors)
        out_cat = out_cat.view(out_cat.shape[::-1])
        
        # Layer 1: dropout.
        out_cat = self.embedding_dropout(out_cat)
        
        # Layer 1: batch normalization (of the continuous variables).
        out_cont = self.cont_batch_norm(x[:, inp_offset:])
        
        out = torch.cat((out_cat, out_cont), dim=1)
        
        # Layers 2 through 9: sequential feedforward model.
        out = self.seq_model(out)

        print("YO")
        return out
        
        
    def fit(self, X, y):
        self.train()
        
        # TODO: set a random seed to invoke determinism.
        # Cf. GH#11278

        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        
        # OneCycleLR with Adam.
        #
        # Implementation notes. OneCyceLR by default cycles both the learning rate /and/ the
        # momentum value.
        # Cf. https://www.kaggle.com/residentmario/one-cycle-learning-rate-schedulers
        #
        # Optimizers that don't support momentum must use a scheduler with cycle_momentum=False,
        # which disables the momentum-tuning behavior. Adam does not support momentum; it has its
        # own similar-ish thing built in.
        # Cf. https://www.kaggle.com/residentmario/keras-optimizers
        #
        # This code requires PyTorch >= 1.2 due to a bug, see GH#19003.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.max_lr)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, self.max_lr,
            cycle_momentum=False,
            epochs=self.n_epochs,
            steps_per_epoch=int(np.ceil(len(X) / self.batch_size)),
        )
        batches = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(X, y),
            batch_size=self.batch_size, shuffle=True
        )
        
        for epoch in range(self.n_epochs):
            lvs = []
            for i, (X_batch, y_batch) in enumerate(batches):
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()
                
                y_pred = model(X_batch).squeeze()
                optimizer.step()
                scheduler.step()
                loss = self.loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                
                loss.backward()
                
                lv = loss.detach().cpu().numpy()
                lvs.append(lv)
                if i % 100 == 0:
                    print(f"Epoch {epoch + 1}/{self.n_epochs}; Batch {i}; Loss {lv}")                
            print(
                f"Epoch {epoch + 1}/{self.n_epochs}; Average Loss {np.mean(lvs)}"
            )
    
    
    def predict(self, X):
        self.eval()
        with torch.no_grad():
            y_pred = model(torch.tensor(X, dtype=torch.float32).cuda())
        return y_pred.squeeze()
    
    
    def score(self, X, y):
        y_pred = self.predict(X)
        y = torch.tensor(y, dtype=torch.float32).cuda()
        return self.score_fn(y, y_pred)

In [23]:
model = FeedforwardTabularModel()
model.cuda()
model.fit(X_train_sample.values, y_train_sample.values)

Hello




RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [500, 1]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [18]:
model

FeedforwardTabularModel(
  (loss_fn): MSELoss()
  (score_fn): MSELoss()
  (emb_0): Embedding(1115, 80)
  (emb_1): Embedding(7, 4)
  (emb_2): Embedding(3, 3)
  (emb_3): Embedding(12, 6)
  (emb_4): Embedding(31, 10)
  (emb_5): Embedding(2, 2)
  (emb_6): Embedding(25, 10)
  (emb_7): Embedding(26, 10)
  (emb_8): Embedding(4, 3)
  (emb_9): Embedding(3, 3)
  (emb_10): Embedding(4, 3)
  (emb_11): Embedding(23, 9)
  (emb_12): Embedding(8, 4)
  (emb_13): Embedding(12, 6)
  (emb_14): Embedding(52, 15)
  (emb_15): Embedding(22, 9)
  (emb_16): Embedding(6, 4)
  (emb_17): Embedding(6, 4)
  (emb_18): Embedding(3, 3)
  (emb_19): Embedding(3, 3)
  (emb_20): Embedding(8, 4)
  (emb_21): Embedding(8, 4)
  (embedding_dropout): Dropout(p=0.04, inplace=False)
  (cont_batch_norm): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (seq_model): Sequential(
    (0): Linear(in_features=215, out_features=1000, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(1000, eps=1e-05, momentu

## writeout

In [16]:
!mkdir ../models/

In [41]:
%%writefile ../models/model_2.py
import pandas as pd
from pathlib import Path

path = Path('rossmann')
train_df = pd.read_pickle('/mnt/rossman-fastai-sample/train_clean').drop(['index', 'Date'], axis='columns')
test_df = pd.read_pickle('/mnt/rossman-fastai-sample/test_clean')

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
import numpy as np
import torch

cat_vars = [
    'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw'
]
cont_vars = [
    'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
    'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
    'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
    'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday'
]
target_var= 'Sales'


class ColumnFilter:
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return X.loc[:, cat_vars + cont_vars]
        

class GroupLabelEncoder:
    def __init__(self):
        self.labeller = LabelEncoder()
    
    def fit(self, X, y):
        self.encoders = {col: None for col in X.columns if col in cat_vars}
        for col in self.encoders:
            self.encoders[col] = LabelEncoder().fit(
                X[col].fillna(value='N/A').values
            )
        return self
    
    def transform(self, X):
        X_out = []
        categorical_part = np.hstack([
            self.encoders[col].transform(X[col].fillna(value='N/A').values)[:, np.newaxis]
            for col in cat_vars
        ])
        return pd.DataFrame(categorical_part, columns=cat_vars)


class GroupNullImputer:
    def fit(self, X, y):
        return self
        
    def transform(self, X):
        return X.loc[:, cont_vars].fillna(0)


class Preprocessor:
    def __init__(self):
        self.cf = ColumnFilter()
        self.gne = GroupNullImputer()
        
    def fit(self, X, y=None):
        self.gle = GroupLabelEncoder().fit(X, y=None)
        return self
    
    def transform(self, X):
        X_out = self.cf.transform(X)
        X_out = np.hstack((self.gle.transform(X_out).values, self.gne.transform(X_out).values))
        X_out = pd.DataFrame(X_out, columns=cat_vars + cont_vars)
        return X_out


X_train_sample = Preprocessor().fit(train_df).transform(train_df)
y_train_sample = train_df[target_var]

import torch
from torch import nn
import torch.utils.data
# ^ https://discuss.pytorch.org/t/attributeerror-module-torch-utils-has-no-attribute-data/1666


class FeedforwardTabularModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.batch_size = 512
        self.base_lr, self.max_lr = 0.001, 0.003
        self.n_epochs = 20
        self.cat_vars_embedding_vector_lengths = [
            (1115, 80), (7, 4), (3, 3), (12, 6), (31, 10), (2, 2), (25, 10), (26, 10), (4, 3),
            (3, 3), (4, 3), (23, 9), (8, 4), (12, 6), (52, 15), (22, 9), (6, 4), (6, 4), (3, 3),
            (3, 3), (8, 4), (8, 4)
        ]
        self.loss_fn = torch.nn.MSELoss()
        self.score_fn = torch.nn.MSELoss()
        
        # Layer 1: embeddings.
        self.embeddings = []
        for i, (in_size, out_size) in enumerate(self.cat_vars_embedding_vector_lengths):
            emb = nn.Embedding(in_size, out_size)
            self.embeddings.append(emb)
            setattr(self, f'emb_{i}', emb)

        # Layer 1: dropout.
        self.embedding_dropout = nn.Dropout(0.04)
        
        # Layer 1: batch normalization (of the continuous variables).
        self.cont_batch_norm = nn.BatchNorm1d(16, eps=1e-05, momentum=0.1)
        
        # Layers 2 through 9: sequential feedforward model.
        self.seq_model = nn.Sequential(*[
            nn.Linear(in_features=215, out_features=1000, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(1000, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.001),
            nn.Linear(in_features=1000, out_features=500, bias=True),
            nn.ReLU(),
            nn.BatchNorm1d(500, eps=1e-05, momentum=0.1),
            nn.Dropout(p=0.01),
            nn.Linear(in_features=500, out_features=1, bias=True)
        ])
    
    
    def forward(self, x):
        # Layer 1: embeddings.
        inp_offset = 0
        embedding_subvectors = []
        for emb in self.embeddings:
            index = torch.tensor(inp_offset, dtype=torch.int64).cuda()
            inp = torch.index_select(x, dim=1, index=index).long().cuda()
            out = emb(inp)
            out = out.view(out.shape[2], out.shape[0], 1).squeeze()
            embedding_subvectors.append(out)
            inp_offset += 1
        out_cat = torch.cat(embedding_subvectors)
        out_cat = out_cat.view(out_cat.shape[::-1])
        
        # Layer 1: dropout.
        out_cat = self.embedding_dropout(out_cat)
        
        # Layer 1: batch normalization (of the continuous variables).
        out_cont = self.cont_batch_norm(x[:, inp_offset:])
        
        out = torch.cat((out_cat, out_cont), dim=1)
        
        # Layers 2 through 9: sequential feedforward model.
        out = self.seq_model(out)
            
        return out
        
        
    def fit(self, X, y):
        self.train()
        
        # TODO: set a random seed to invoke determinism.
        # Cf. GH#11278

        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)
        
        # OneCycleLR with Adam.
        #
        # Implementation notes. OneCyceLR by default cycles both the learning rate /and/ the
        # momentum value.
        # Cf. https://www.kaggle.com/residentmario/one-cycle-learning-rate-schedulers
        #
        # Optimizers that don't support momentum must use a scheduler with cycle_momentum=False,
        # which disables the momentum-tuning behavior. Adam does not support momentum; it has its
        # own similar-ish thing built in.
        # Cf. https://www.kaggle.com/residentmario/keras-optimizers
        #
        # This code requires PyTorch >= 1.2 due to a bug, see GH#19003.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.max_lr)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer, self.max_lr,
            cycle_momentum=False,
            epochs=self.n_epochs,
            steps_per_epoch=int(np.ceil(len(X) / self.batch_size)),
        )
        batches = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(X, y),
            batch_size=self.batch_size, shuffle=True
        )
        
        for epoch in range(self.n_epochs):
            lvs = []
            for i, (X_batch, y_batch) in enumerate(batches):
                X_batch = X_batch.cuda()
                y_batch = y_batch.cuda()
                
                y_pred = model(X_batch).squeeze()
                optimizer.step()
                scheduler.step()
                loss = self.loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                
                lv = loss.detach().cpu().numpy()
                lvs.append(lv)
                if i % 100 == 0:
                    print(f"Epoch {epoch + 1}/{self.n_epochs}; Batch {i}; Loss {lv}")
                
                loss.backward()
            print(
                f"Epoch {epoch + 1}/{self.n_epochs}; Average Loss {np.mean(lvs)}"
            )
    
    
    def predict(self, X):
        self.eval()
        with torch.no_grad():
            y_pred = model(torch.tensor(X, dtype=torch.float32).cuda())
        return y_pred.squeeze()
    
    
    def score(self, X, y):
        y_pred = self.predict(X)
        y = torch.tensor(y, dtype=torch.float32).cuda()
        return self.score_fn(y, y_pred)

model = FeedforwardTabularModel()
model.cuda()
model.fit(X_train_sample.values, y_train_sample.values)
torch.save(model.named_parameters(), "model.pth")

Overwriting ../models/model_2.py
