### Tasks

#### figure out a way to make the model deeper 
#### consider small model as low rank representation and project higher. (using outer product) train only small number of outer product parameters - https://arxiv.org/pdf/2012.13255.pdf

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms
from torchvision.utils import make_grid

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random
import time
import copy
import os
import sys

import wandb
   
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import wandb
device = torch.device("cpu")

In [2]:
#Link to dataset: https://archive.ics.uci.edu/ml/datasets/DeliciousMIL%3A+A+Data+Set+for+Multi-Label+Multi-Instance+Learning+with+Instance+Labels#
import numpy as np
import pandas as pd
import re

def createDataset(path):
    file = open(path, "r")
    dataset = []
    split = file.read().splitlines()
 
    for i, line in enumerate(split):
        row = line.split(" ")
        while "" in row:
            row.remove("")
        for i, item in enumerate(row): 
            try:
                row[i] = float(item.split(":")[1])
            except:
                row[i] = int(item)
        dataset.append(row)

    return dataset


In [3]:
dataset = pd.DataFrame(createDataset('../Data/Skin/skin_nonskin.txt'))
dataset.dropna(inplace=True)

In [4]:
dataset

Unnamed: 0,0,1,2,3
0,1,74.0,85.0,123.0
1,1,73.0,84.0,122.0
2,1,72.0,83.0,121.0
3,1,70.0,81.0,119.0
4,1,70.0,81.0,119.0
...,...,...,...,...
245052,2,163.0,162.0,112.0
245053,2,163.0,162.0,112.0
245054,2,163.0,162.0,112.0
245055,2,163.0,162.0,112.0


In [5]:
class skin(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  

X_train, X_valid, y_train, y_valid = \
    train_test_split(dataset.loc[:, 1:], dataset.loc[:, 0], test_size=0.2, random_state=42)

# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_valid = scaler.transform(X_valid)

y_train_proc = y_train - min(min(y_train), min(y_valid))
y_valid_proc = y_valid - min(min(y_train), min(y_valid))

X_train, y_train = np.array(X_train), np.array(y_train_proc)
X_valid, y_valid = np.array(X_valid), np.array(y_valid_proc)


print('X train shape : ', X_train.shape)
print('y train label shape : ', y_train.shape)
print('X valid  shape : ', X_valid.shape)
print('y valid shape: ', y_valid.shape)

X train shape :  (196045, 3)
y train label shape :  (196045,)
X valid  shape :  (49012, 3)
y valid shape:  (49012,)


In [7]:
train_dataset = skin(X=X_train, y=y_train)
val_dataset = skin(X=X_valid, y=y_valid)

train_loader = DataLoader(dataset=train_dataset, batch_size=len(train_dataset), shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)

In [8]:
X = torch.tensor(train_dataset.X, device = device)
y = torch.tensor(train_dataset.y, device = device)
EPOCHS = 10
BATCH_SIZE = 512
LEARNING_RATE = 0.0001
NUM_FEATURES = X.shape[1]
NUM_CLASSES = 2

In [9]:
sweep_config = {
    'method': 'grid'
    }

parameters_dict = {
    "LEARNING_RATE": {
        'value': LEARNING_RATE
        },
    "NUM_FEATURES": {
        'value': NUM_FEATURES
        },
    "NUM_CLASSES": {
        'value': NUM_CLASSES
        },
    "EPOCHS": {
        'value': EPOCHS
        },
    }
# 
sweep_config['parameters'] = parameters_dict

In [10]:
sweep_id = wandb.sweep(sweep_config, project="sanity-experiments")

Create sweep with ID: 4wmv17t5
Sweep URL: https://wandb.ai/rice-and-shine/sanity-experiments/sweeps/4wmv17t5


In [11]:
class MulticlassClassification(nn.Module):
    def __init__(self, num_feature, num_class):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(num_feature, 2048)
        self.layer_2 = nn.Linear(2048, 512)
        self.layer_3 = nn.Linear(512, 128)
        self.layer_out = nn.Linear(128, num_class)
        
        self.relu = nn.ReLU()
        
        self.batchnorm1 = nn.InstanceNorm1d(2048)
        self.batchnorm2 = nn.InstanceNorm1d(512)
        self.batchnorm3 = nn.InstanceNorm1d(128)
        
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)

        
        x = self.layer_out(x)
        
        return x

In [12]:
dataiter = iter(train_loader)
sentences, labels = next(dataiter)

print('sentences shape on PyTorch : ', sentences.size())
print('labels shape on PyTorch : ', labels.size())

sentences shape on PyTorch :  torch.Size([196045, 3])
labels shape on PyTorch :  torch.Size([196045])


In [None]:
neighbors = torch.linalg.norm(X.unsqueeze(1) - X.unsqueeze(0), dim=2).argsort(dim=1)[:, :33]

In [None]:
neighbors

In [21]:
import gc
def compute_loss_stateless_model (params, buffers, sample, target):
    batch = sample.unsqueeze(0)
    targets = target.unsqueeze(0)

    predictions = fmodel(params, buffers, batch) 
    loss = torch.nn.functional.cross_entropy(predictions, targets)

    return loss


def compute_next_batch(X, y, batch_norm, neighbors):
    highest_grad_points = batch_norm.argsort(descending=True)[:32]
    next_batch = X[highest_grad_points]
    neighbors = torch.linalg.norm(X.unsqueeze(0) - next_batch.unsqueeze(1), dim=2).argsort(dim=1)[:, :32]
    neigh_norms = 0
    for neigh  in neighbors:
        neigh_grads = batch_norm[neigh]
        neigh_norms += sum(neigh_grads)/len(neigh_grads)
    
    neigh_norms /= len(neighbors)
    del neighbors
    del highest_grad_points
    gc.collect()
    return neigh_norms
   


In [17]:
# %%
from functorch import make_functional_with_buffers, vmap, grad

base_model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
base_model.to(device)

MulticlassClassification(
  (layer_1): Linear(in_features=3, out_features=2048, bias=True)
  (layer_2): Linear(in_features=2048, out_features=512, bias=True)
  (layer_3): Linear(in_features=512, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=2, bias=True)
  (relu): ReLU()
  (batchnorm1): InstanceNorm1d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (batchnorm2): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (batchnorm3): InstanceNorm1d(128, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)

In [18]:
ft_compute_grad = grad(compute_loss_stateless_model)
ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0))

In [19]:
loss_stats = {
    'train': [],
    "val": [],
    "grad": []
}

In [20]:
def wandb_trainer_function(config=None):
    # INITIALIZE NEW WANDB RUN
    with wandb.init(config=config) as run:
        #USING THE CONFIG TO SET THE HYPERPARAMETERS FOR EACH RUN
        config = wandb.config
        wandb.define_metric("custom_step")
        wandb.define_metric("Neighbor Gradient", step_metric='custom_step')
        wandb.define_metric("Average Gradient", step_metric='custom_step')
        wandb.define_metric("Train Loss", step_metric='custom_step')
        wandb.define_metric("Val Loss", step_metric='custom_step')
        

        run.name = "NN-Covtype"

        model = copy.deepcopy(base_model)

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)

        global fmodel

        print("Begin training.")
        train_epoch_loss=0
        val_epoch_loss = 0
        gradient_norm_epoch = 0
        for e in tqdm(range(1, EPOCHS+1)):
            torch.cuda.empty_cache()
            model.train()
            optimizer.zero_grad()
            pred = model(X.float())

            train_loss = criterion(pred, y)
            train_loss.backward()
            optimizer.step()

            fmodel, params, buffers = make_functional_with_buffers(model)

            ### use functorch to calculate per sample gradient
            ft_per_sample_grads = ft_compute_sample_grad(params, buffers, X.float(), y)

            ### calculate norm of the gradient and use it to compute next batch
            batch_norm = torch.zeros(X.size(0), device=device)
            for item in ft_per_sample_grads:
                batch_norm +=  torch.linalg.norm(item, dim=tuple(range(1,len(item.shape))))
            gradient_norm_epoch = batch_norm.sum().item()/X.size(0)
            del ft_per_sample_grads
            del fmodel
            del params
            del buffers
            gc.collect()
            ## compute neighbors of points that give largest gradients, and see if their gradients are higher in general
            neighbor_norms = compute_next_batch(X, y, batch_norm)



            with torch.no_grad():
                model.eval()
                for X_val, y_val in val_loader:
                    val_pred = model(X_val.float())
                    val_loss = criterion(val_pred, y_val)
                    val_epoch_loss += val_loss.item()


                # TRAIN LOSS AND ACCURACY
                loss_stats['train'].append(train_loss)
                loss_stats['val'].append(val_epoch_loss/len(val_loader))
                loss_stats['grad'].append(gradient_norm_epoch)


                ## plot val loss and accuracy here. For train, standardise a subset for loss/accuracy
                print(f'Epoch {e+0:03}: | Train Loss: {loss_stats["train"][-1]:.5f} | Val Loss: {loss_stats["val"][-1]:.5f} | Avg Grad: {gradient_norm_epoch} | Neighbor Grad: {neighbor_norms}')
                wandb.log({"Train Loss":loss_stats["train"][-1], "Val Loss":loss_stats["val"][-1], "Neighbor Gradient": neighbor_norms, "Average Gradient": gradient_norm_epoch, 'custom_step': e})

                val_epoch_loss = 0
                val_epoch_acc = 0
                gradient_norm_epoch = 0

In [None]:
wandb.agent(sweep_id, wandb_trainer_function)

[34m[1mwandb[0m: Agent Starting Run: efxo694f with config:
[34m[1mwandb[0m: 	EPOCHS: 10
[34m[1mwandb[0m: 	LEARNING_RATE: 0.0001
[34m[1mwandb[0m: 	NUM_CLASSES: 2
[34m[1mwandb[0m: 	NUM_FEATURES: 3
[34m[1mwandb[0m: Currently logged in as: [33maw82[0m ([33mrice-and-shine[0m). Use [1m`wandb login --relogin`[0m to force relogin


Begin training.


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 001: | Train Loss: 0.63223 | Val Loss: 0.45689 | Avg Grad: 23.286056772679743 | Neighbor Grad: 76.15120697021484
