### Tasks

#### Figure out a way to make the model deeper
#### Treat the small model as a low-rank representation and project it to a higher dimension (using outer products), training only the small number of outer-product parameters — see https://arxiv.org/pdf/2012.13255.pdf

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms
from torchvision.utils import make_grid

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random
import time
import copy
import os
import sys

import wandb
   
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import wandb
# Device handle used by later cells when creating tensors and moving the model.
device = torch.device("cpu")

In [2]:
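# Link to dataset (as originally recorded): https://archive.ics.uci.edu/ml/datasets/DeliciousMIL%3A+A+Data+Set+for+Multi-Label+Multi-Instance+Learning+with+Instance+Labels#
# NOTE(review): the cells below load 'Skin_NonSkin.txt', which does not look like the DeliciousMIL data set linked above - confirm the source and update this link.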
import numpy as np
import pandas as pd
import re

def createDataset(path):
    """Parse a tab-separated text file into a list of numeric rows.

    Each line is split on tabs and empty fields are discarded. A field of the
    form ``key:value`` contributes ``float(value)``; any other field is
    converted with ``int``.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one list of numbers per line of the file. A line without
        any non-empty field yields an empty list.

    Raises:
        ValueError: If a field is neither ``key:<float>`` nor an integer.
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(path, "r") as file:
        lines = file.read().splitlines()

    dataset = []
    for line in lines:
        row = []
        for item in line.split('\t'):
            if item == "":
                continue
            try:
                row.append(float(item.split(":")[1]))
            except (IndexError, ValueError):
                # No ':' separator (IndexError) or a non-float value part
                # (ValueError): fall back to parsing the whole field as an int.
                row.append(int(item))
        dataset.append(row)

    return dataset


In [3]:
# Load the raw rows and discard incomplete ones, then remove duplicate rows,
# printing the shape after each cleaning step.
dataset = pd.DataFrame(createDataset('Skin_NonSkin.txt')).dropna()
print(dataset.shape)
dataset = dataset.drop_duplicates()
print(dataset.shape)

(245057, 4)
(51444, 4)


In [19]:
# Display the `dataset` DataFrame.
dataset

Unnamed: 0,0,1,2,3
0,74,85,123,1
1,73,84,122,1
2,72,83,121,1
3,70,81,119,1
5,69,80,118,1
...,...,...,...,...
244672,73,73,49,2
244703,19,19,19,2
244719,62,64,35,2
244720,58,61,29,2


In [4]:
class skin(Dataset):
    """Dataset that pairs feature rows ``X`` with labels ``y`` by position."""

    def __init__(self, X, y):
        """Store the feature container ``X`` and the label container ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.X[index]
        label = self.y[index]
        return features, label

In [5]:
# Print every column from label 1 onward (.loc slices are label-based).
print(dataset.loc[:, 1:])

          1    2  3
0        85  123  1
1        84  122  1
2        83  121  1
3        81  119  1
5        80  118  1
...     ...  ... ..
244672   73   49  2
244703   19   19  2
244719   64   35  2
244720   61   29  2
244848  106   51  2

[51444 rows x 3 columns]


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  

# Columns 0..2 are used as features and column 3 as the label
# (.loc slices are label-based and inclusive on both ends).
X_train, X_valid, y_train, y_valid = train_test_split(
    dataset.loc[:, :2],
    dataset.loc[:, 3:],
    test_size=0.2,
    random_state=42,
)

# Optional feature scaling, currently disabled:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_valid = scaler.transform(X_valid)

# Shift the labels down by one and collapse the single label column.
y_train = np.array((y_train - 1).squeeze())
y_valid = np.array((y_valid - 1).squeeze())

X_train = np.array(X_train)
X_valid = np.array(X_valid)

print('X train shape : ', X_train.shape)
print('y train label shape : ', y_train.shape)
print('X valid  shape : ', X_valid.shape)
print('y valid shape: ', y_valid.shape)

X train shape :  (41155, 3)
y train label shape :  (41155,)
X valid  shape :  (10289, 3)
y valid shape:  (10289,)


In [128]:
# Inspect the training labels after the shift by one applied in the split cell.
y_train

array([1, 1, 1, ..., 1, 0, 1])

In [54]:
# Wrap both splits in `skin` datasets, casting the features to float.
train_dataset, val_dataset = (
    skin(X=features.astype(float), y=labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

# The training loader yields the whole training set as one shuffled batch;
# the validation loader uses fixed-size, unshuffled batches.
train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

In [89]:
# Whole training split as tensors on `device`.
X, y = (
    torch.tensor(array, device=device)
    for array in (train_dataset.X, train_dataset.y)
)

# Hyperparameters referenced by the sweep configuration and the trainer.
EPOCHS = 10
BATCH_SIZE = 512
LEARNING_RATE = 0.0001
NUM_CLASSES = 2
NUM_FEATURES = X.shape[1]

In [107]:
# Print the label tensor.
# NOTE(review): the saved output below shows a column vector, while y_train is
# reported as 1-D by the split cell - this output may come from an earlier,
# out-of-order run; confirm after Restart & Run All.
print(y)

tensor([[1],
        [1],
        [1],
        ...,
        [1],
        [0],
        [1]])


In [90]:
# Grid sweep in which every hyperparameter is pinned to a single value.
parameters_dict = {
    name: {'value': setting}
    for name, setting in (
        ("LEARNING_RATE", LEARNING_RATE),
        ("NUM_FEATURES", NUM_FEATURES),
        ("NUM_CLASSES", NUM_CLASSES),
        ("EPOCHS", EPOCHS),
    )
}

sweep_config = {'method': 'grid', 'parameters': parameters_dict}

In [91]:
# Create the sweep in the "Data_Variance_Exp" project and keep its id for the agent call.
sweep_id = wandb.sweep(sweep_config, project="Data_Variance_Exp")

Create sweep with ID: gwuk8cde
Sweep URL: https://wandb.ai/rice-and-shine/Data_Variance_Exp/sweeps/gwuk8cde


In [92]:
class MulticlassClassification(nn.Module):
    """Fully connected classifier: num_feature -> 2048 -> 512 -> 128 -> num_class.

    Every hidden linear layer is followed by a normalisation module and a ReLU.
    Despite their names, the ``batchnormN`` attributes hold
    ``nn.InstanceNorm1d`` modules.
    """

    def __init__(self, num_feature, num_class):
        """Create the layers for ``num_feature`` inputs and ``num_class`` outputs."""
        super().__init__()

        # Linear stack.
        self.layer_1 = nn.Linear(num_feature, 2048)
        self.layer_2 = nn.Linear(2048, 512)
        self.layer_3 = nn.Linear(512, 128)
        self.layer_out = nn.Linear(128, num_class)

        # Shared activation.
        self.relu = nn.ReLU()

        # One normalisation module per hidden layer width.
        self.batchnorm1 = nn.InstanceNorm1d(2048)
        self.batchnorm2 = nn.InstanceNorm1d(512)
        self.batchnorm3 = nn.InstanceNorm1d(128)

    def forward(self, x):
        """Return the output of the final linear layer for ``x``."""
        hidden_stages = (
            (self.layer_1, self.batchnorm1),
            (self.layer_2, self.batchnorm2),
            (self.layer_3, self.batchnorm3),
        )
        # linear -> norm -> ReLU for each hidden stage, in order.
        for linear, norm in hidden_stages:
            x = self.relu(norm(linear(x)))

        return self.layer_out(x)

In [111]:
# neighbors = torch.linalg.norm(X.unsqueeze(1) - X.unsqueeze(0), dim=2).argsort(dim=1)[:, :33]

In [112]:
# NOTE(review): no live assignment of `neighbors` is visible in this part of the
# notebook (the only one is commented out in the previous cell), so this cell is
# expected to fail on a fresh kernel - confirm and restore or remove.
neighbors

tensor([[    0, 38808, 27056,  ..., 12742,  5437, 18427],
        [    1, 38909, 25967,  ..., 24233, 11397,  4644],
        [    2, 37792, 40271,  ...,  9303, 32798, 22147],
        ...,
        [41152,  1855, 23863,  ..., 21431,  5212, 21239],
        [41153, 33173, 36497,  ..., 39413, 18052,  6038],
        [41154, 12283, 26185,  ..., 27497, 17250, 31485]])

In [93]:
import gc
def compute_loss_stateless_model(params, buffers, sample, target):
    """Cross-entropy loss of a single ``(sample, target)`` pair.

    Both inputs get a leading dimension of size one before being passed to the
    module-level functional model ``fmodel`` together with ``params`` and
    ``buffers``.
    """
    logits = fmodel(params, buffers, sample.unsqueeze(0))
    return torch.nn.functional.cross_entropy(logits, target.unsqueeze(0))


def compute_next_batch(X, y, batch_norm, top_k=32, num_neighbors=32, num_samples=32):
    """Average gradient norm of neighbours sampled around the highest-gradient points.

    The ``top_k`` samples with the largest entries in ``batch_norm`` are used as
    seeds. For each class (0 and 1) the ``num_neighbors`` nearest points in
    ``X`` of every seed of that class are collected, and up to ``num_samples``
    of them are drawn without replacement, each candidate being weighted by the
    gradient norm of the seed it belongs to. The result is the mean of
    ``batch_norm`` over all drawn neighbours of both classes.

    Args:
        X: Tensor of shape ``(N, d)`` holding all samples.
        y: Tensor with ``N`` labels in ``{0, 1}``; shape ``(N,)`` or ``(N, 1)``.
        batch_norm: Tensor of shape ``(N,)`` with one non-negative gradient
            norm per sample.
        top_k: Number of highest-gradient seed points.
        num_neighbors: Nearest neighbours collected per seed.
        num_samples: Neighbours drawn per class.

    Returns:
        A 0-d tensor with the mean gradient norm of the drawn neighbours, or a
        0-d zero tensor when no neighbour could be drawn.
    """
    seed_idx = batch_norm.argsort(descending=True)[:top_k]
    seed_points = X[seed_idx]
    # Labels and norms must be restricted to the seeds: masking the seed rows
    # with the full-length `y == c` mask does not line up with `seed_points`.
    seed_labels = y[seed_idx].reshape(-1)
    seed_norms = batch_norm[seed_idx]

    sampled_norms = []
    for label in (0, 1):
        in_class = seed_labels == label
        class_seeds = seed_points[in_class]
        if class_seeds.size(0) == 0:
            continue
        class_norms = seed_norms[in_class]

        # (n_seeds, N) matrix of distances from every seed to every sample.
        distances = torch.linalg.norm(X.unsqueeze(0) - class_seeds.unsqueeze(1), dim=2)
        # Position 0 of the sorted order is expected to be the seed itself, so skip it.
        neighbor_idx = distances.argsort(dim=1)[:, 1:num_neighbors + 1]
        per_seed = neighbor_idx.size(1)
        if per_seed == 0:
            continue

        candidates = neighbor_idx.reshape(-1)
        # Each candidate inherits the gradient norm of the seed it was found for;
        # the order matches the row-major flattening of `neighbor_idx`.
        weights = class_norms.repeat_interleave(per_seed)
        usable = int((weights > 0).sum())
        if usable == 0:
            # All seed norms are zero: fall back to uniform sampling.
            weights = torch.ones_like(weights)
            usable = weights.numel()

        # Sampling without replacement cannot draw more than the non-zero categories.
        chosen = candidates[weights.multinomial(min(num_samples, usable))]
        sampled_norms.append(batch_norm[chosen])

    if not sampled_norms:
        return batch_norm.new_zeros(())
    return torch.cat(sampled_norms).mean()
   


In [94]:
# %%
from functorch import make_functional_with_buffers, vmap, grad

# Template network; wandb_trainer_function deep-copies it for each run.
base_model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
# Move the model to `device`; as the last expression of the cell this also displays the module.
base_model.to(device)

MulticlassClassification(
  (layer_1): Linear(in_features=3, out_features=2048, bias=True)
  (layer_2): Linear(in_features=2048, out_features=512, bias=True)
  (layer_3): Linear(in_features=512, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=2, bias=True)
  (relu): ReLU()
  (batchnorm1): InstanceNorm1d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (batchnorm2): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (batchnorm3): InstanceNorm1d(128, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)

In [95]:
# Gradient of the single-sample loss (functorch `grad` differentiates w.r.t. the
# first argument, i.e. `params`, by default).
ft_compute_grad = grad(compute_loss_stateless_model)
# Vectorise over dimension 0 of `sample` and `target`; `params` and `buffers`
# (in_dims None) are shared across all samples.
ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0))

In [96]:
# Per-epoch history: losses, the average gradient norm, and one series per
# radius ("var-1" .. "var-10") of the variance-batch experiment.
loss_stats = {
    'train': [],
    "val": [],
    "grad": [],
    **{f"var-{radius}": [] for radius in range(1, 11)},
}

In [97]:
import sys

In [98]:
def get_variance_batch(max_grad_data_point, y_max_grad_data_point,
                       num_radii=10, points_per_radius=100):
    """Sample random points on spheres of radius 1..``num_radii``.

    For every integer radius ``r`` in ``1..num_radii``, ``points_per_radius``
    vectors are drawn with ``np.random.rand`` (all coordinates non-negative) and
    rescaled to Euclidean norm ``r``. The points are ordered by radius.

    NOTE(review): only the dimensionality of ``max_grad_data_point`` is used;
    its coordinates do not enter the sampled points, which are therefore
    centred on the origin rather than on that data point - confirm this is the
    intended experiment.

    Args:
        max_grad_data_point: Array-like of shape ``(1, d)``; ``shape[1]`` fixes
            the dimension ``d`` of the sampled points.
        y_max_grad_data_point: Single label (scalar or one-element tensor) that
            is assigned to every sampled point.
        num_radii: Number of integer radii, starting at 1.
        points_per_radius: Number of points drawn per radius.

    Returns:
        ``(points, labels)`` where ``points`` is a float64 tensor of shape
        ``(num_radii * points_per_radius, d)`` and ``labels`` is a 1-D tensor
        of the same length filled with the given label.
    """
    d = max_grad_data_point.shape[1]
    total = num_radii * points_per_radius

    # A single vectorised draw consumes the NumPy random stream in the same
    # order as drawing the points one at a time.
    directions = np.random.rand(total, d)
    directions /= np.linalg.norm(directions, axis=1, keepdims=True)
    radii = np.repeat(np.arange(1, num_radii + 1, dtype=float), points_per_radius)
    # Converting one contiguous array avoids the slow list-of-ndarrays path.
    points = torch.from_numpy(directions * radii[:, None])

    # Derive the label count from the same `total` as the points so the two
    # outputs cannot drift apart.
    label = torch.as_tensor(y_max_grad_data_point).reshape(-1)[0]
    labels = torch.full((total,), label.item(), dtype=label.dtype)

    return points, labels


In [81]:
# Try get_variance_batch on training sample 222: `l` receives the sampled points, `m` the labels.
l,m=get_variance_batch(X[[222]],y[[222]])

In [100]:
def wandb_trainer_function(config=None):
    """Train a copy of ``base_model`` inside a W&B run and log gradient statistics.

    Every epoch performs one full-batch Adam step on ``(X, y)`` and then:

    * computes per-sample gradients with functorch and sums, per sample, the
      norms of the gradient w.r.t. each parameter tensor;
    * evaluates the same quantity on the batch from ``get_variance_batch`` and
      averages it per radius (``var-1`` .. ``var-10`` in ``loss_stats``);
    * computes the mean validation loss over ``val_dataset``;
    * appends everything to ``loss_stats`` and logs it to W&B.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.

    Note:
        The hyperparameters are read from the module-level constants
        (``LEARNING_RATE``, ``EPOCHS``, ``BATCH_SIZE``), not from
        ``wandb.config``.
    """
    # Number of radii produced by get_variance_batch; matches var-1 .. var-10.
    num_radii = 10

    # INITIALIZE NEW WANDB RUN
    with wandb.init(config=config) as run:
        config = wandb.config
        wandb.define_metric("custom_step")
        for metric in ("Neighbor Gradient", "Average Gradient", "Train Loss", "Val Loss"):
            wandb.define_metric(metric, step_metric='custom_step')

        run.name = "NN-Covtype-batched"

        model = copy.deepcopy(base_model)

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)

        # compute_loss_stateless_model reads the functional model from this global.
        global fmodel

        print("Begin training.")
        for e in tqdm(range(1, EPOCHS+1)):
            torch.cuda.empty_cache()
            model.train()
            optimizer.zero_grad()

            # One full-batch optimisation step.
            pred = model(X.float())
            train_loss = criterion(pred, y)
            train_loss.backward()
            optimizer.step()

            fmodel, params, buffers = make_functional_with_buffers(model)

            ### use functorch to calculate per sample gradient
            ft_per_sample_grads = ft_compute_sample_grad(params, buffers, X.float(), y)

            ### per-sample sum of the gradient norms of every parameter tensor
            batch_norm = torch.zeros(X.size(0), device=device)
            for item in ft_per_sample_grads:
                batch_norm += torch.linalg.norm(item, dim=tuple(range(1, len(item.shape))))
            gradient_norm_epoch = batch_norm.sum().item()/X.size(0)

            ###CODE FOR VARIANCE BATCH EXPERIMENT STARTS HERE
            max_grad_data_point = batch_norm.argsort(descending=True)[0:1]

            variance_batch, y_max_grad_data_point = get_variance_batch(
                X[max_grad_data_point], y[max_grad_data_point])

            ft_per_sample_grads_exp = ft_compute_sample_grad(
                params, buffers, variance_batch.float(), y_max_grad_data_point)

            batch_norm_exp = torch.zeros(variance_batch.size(0), device=device)
            for item in ft_per_sample_grads_exp:
                batch_norm_exp += torch.linalg.norm(item, dim=tuple(range(1, len(item.shape))))

            # The variance batch is ordered by radius, so radius i+1 occupies the
            # i-th contiguous chunk; slices must advance by a whole chunk, not by 1.
            points_per_radius = variance_batch.size(0) // num_radii
            for i in range(num_radii):
                chunk = batch_norm_exp[i*points_per_radius:(i+1)*points_per_radius]
                # Append so the per-epoch history is kept (the lists are not replaced).
                loss_stats[f'var-{i+1}'].append(chunk.sum().item()/points_per_radius)
            ###CODE FOR VARIANCE BATCH EXPERIMENT ENDS HERE

            del ft_per_sample_grads
            del ft_per_sample_grads_exp
            del fmodel
            del params
            del buffers
            gc.collect()

            ## compute neighbors of points that give largest gradients, and see if their gradients are higher in general
            # neighbor_norms = compute_next_batch(X, y, batch_norm)

            with torch.no_grad():
                model.eval()
                val_epoch_loss = 0
                for X_val, y_val in val_loader:
                    val_pred = model(X_val.float())
                    val_epoch_loss += criterion(val_pred, y_val).item()

                # Store a plain float so the autograd graph of the step is not kept alive.
                loss_stats['train'].append(train_loss.item())
                loss_stats['val'].append(val_epoch_loss/len(val_loader))
                loss_stats['grad'].append(gradient_norm_epoch)

                print(f'Epoch {e+0:03}: | Train Loss: {loss_stats["train"][-1]:.5f} | Val Loss: {loss_stats["val"][-1]:.5f} | Avg Grad: {gradient_norm_epoch}')

                log_row = {
                    "Train Loss": loss_stats["train"][-1],
                    "Val Loss": loss_stats["val"][-1],
                    "Average Gradient": gradient_norm_epoch,
                    'custom_step': e,
                }
                # Log the latest per-radius value of this epoch under Var-1 .. Var-10.
                log_row.update({
                    f"Var-{radius}": loss_stats[f"var-{radius}"][-1]
                    for radius in range(1, num_radii + 1)
                })
                wandb.log(log_row)

tensor([[ 56.,  56.,  20.],
        [ 48.,  51.,  12.],
        [ 11.,  14.,   0.],
        ...,
        [  5.,   3., 176.],
        [ 63.,  77., 130.],
        [ 98.,  75.,  59.]], dtype=torch.float64)


In [101]:
# Start a sweep agent that runs wandb_trainer_function for the sweep created above.
# NOTE(review): the saved output below reports a NameError for `neighbor_norms`,
# a name the trainer as shown no longer references - the output looks stale; re-run to refresh it.
wandb.agent(sweep_id, wandb_trainer_function)

[34m[1mwandb[0m: Agent Starting Run: i5th1jc4 with config:
[34m[1mwandb[0m: 	EPOCHS: 10
[34m[1mwandb[0m: 	LEARNING_RATE: 0.0001
[34m[1mwandb[0m: 	NUM_CLASSES: 2
[34m[1mwandb[0m: 	NUM_FEATURES: 3
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mas278[0m ([33mrice-and-shine[0m). Use [1m`wandb login --relogin`[0m to force relogin


Begin training.


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 001: | Train Loss: 0.59599 | Val Loss: 0.52408 | Avg Grad: 25.47471145668813


Run i5th1jc4 errored: NameError("name 'neighbor_norms' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run i5th1jc4 errored: NameError("name 'neighbor_norms' is not defined")
[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
