### Tasks

#### figure out a way to make the model deeper 
#### consider small model as low rank representation and project higher. (using outer product) train only small number of outer product parameters - https://arxiv.org/pdf/2012.13255.pdf

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms
from torchvision.utils import make_grid

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler  
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import random
import time
import copy
import os
import sys

import wandb
   
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import wandb
device = torch.device("cpu")

In [4]:
dataset = 0
if dataset == 0:
    df=pd.read_csv("HIGGS.csv",header=None, nrows=50000)
    # df=pd.read_csv("HIGGS.csv",header=None)
    df.head()
elif dataset == 1:
    df=pd.read_csv("SUSY.csv",header=None, nrows=600000)
    df.head()

In [5]:
X = df.iloc[:, 1:]  
y = df.iloc[:, 0]

In [19]:
dataset

Unnamed: 0,0,1,2,3
0,74,85,123,1
1,73,84,122,1
2,72,83,121,1
3,70,81,119,1
5,69,80,118,1
...,...,...,...,...
244672,73,73,49,2
244703,19,19,19,2
244719,62,64,35,2
244720,58,61,29,2


In [6]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=69)

# Split train into train-val
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, stratify=y_trainval, random_state=21)


In [9]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)

In [10]:
class ClassifierDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    
    def getBatch(self, indices, yclass = -1):
        if yclass >= 0:
            return self.X_data[self.y_data == yclass], self.y_data[self.y_data == yclass]
        else:
            return self.X_data[(indices)], self.y_data[(indices)]
    
    def getSplitbyClass(self, indices, yclass):
        return self.X_data[self.y_data == yclass][indices], self.y_data[self.y_data == yclass][indices]

In [11]:
y_train

array([1., 0., 0., ..., 0., 0., 0.])

In [14]:
train_dataset = ClassifierDataset(X_data=X_train.astype(float), y_data=y_train)
val_dataset = ClassifierDataset(X_data=X_val.astype(float), y_data=y_val)

train_loader = DataLoader(dataset=train_dataset, batch_size=len(train_dataset), shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)

In [15]:
X = torch.tensor(train_dataset.X_data, device = device)
y = torch.tensor(train_dataset.y_data, device = device)
EPOCHS = 10
BATCH_SIZE = 512
LEARNING_RATE = 0.0001
NUM_FEATURES = X.shape[1]
NUM_CLASSES = 2

In [30]:
print(y.int())

tensor([1, 0, 0,  ..., 0, 0, 0], dtype=torch.int32)


In [17]:
sweep_config = {
    'method': 'grid'
    }

parameters_dict = {
    "LEARNING_RATE": {
        'value': LEARNING_RATE
        },
    "NUM_FEATURES": {
        'value': NUM_FEATURES
        },
    "NUM_CLASSES": {
        'value': NUM_CLASSES
        },
    "EPOCHS": {
        'value': EPOCHS
        },
    }
# 
sweep_config['parameters'] = parameters_dict

In [52]:
sweep_id = wandb.sweep(sweep_config, project="Data_Variance_Exp")

Create sweep with ID: kndbnekn
Sweep URL: https://wandb.ai/rice-and-shine/Data_Variance_Exp/sweeps/kndbnekn


In [37]:
class MulticlassClassification(nn.Module):
    def __init__(self, num_feature, num_class):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(num_feature, 2048)
        self.layer_2 = nn.Linear(2048, 512)
        self.layer_3 = nn.Linear(512, 128)
        self.layer_out = nn.Linear(128, num_class)
        
        self.relu = nn.ReLU()
        
        self.batchnorm1 = nn.InstanceNorm1d(2048)
        self.batchnorm2 = nn.InstanceNorm1d(512)
        self.batchnorm3 = nn.InstanceNorm1d(128)
        
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)

        
        x = self.layer_out(x)
        
        return x

In [111]:
# neighbors = torch.linalg.norm(X.unsqueeze(1) - X.unsqueeze(0), dim=2).argsort(dim=1)[:, :33]

In [112]:
neighbors

tensor([[    0, 38808, 27056,  ..., 12742,  5437, 18427],
        [    1, 38909, 25967,  ..., 24233, 11397,  4644],
        [    2, 37792, 40271,  ...,  9303, 32798, 22147],
        ...,
        [41152,  1855, 23863,  ..., 21431,  5212, 21239],
        [41153, 33173, 36497,  ..., 39413, 18052,  6038],
        [41154, 12283, 26185,  ..., 27497, 17250, 31485]])

In [38]:
import gc
def compute_loss_stateless_model (params, buffers, sample, target):
    batch = sample.unsqueeze(0)
    targets = target.unsqueeze(0)

    predictions = fmodel(params, buffers, batch) 
    loss = torch.nn.functional.cross_entropy(predictions, targets)

    return loss


def compute_next_batch(X, y, batch_norm):
    highest_grad_points = batch_norm.argsort(descending=True)[:32]
    next_batch = X[highest_grad_points]
    # neighbors = torch.linalg.norm(X.unsqueeze(0) - next_batch.unsqueeze(1), dim=2).argsort(dim=1)[:, :32]
    # neigh_norms = 0
    # for neigh  in neighbors:
    #     neigh_grads = batch_norm[neigh]
    #     neigh_norms += sum(neigh_grads)/len(neigh_grads)
    
    # neigh_norms /= len(neighbors)


    x0=next_batch[y==0]
    x1=next_batch[y==1]

    batch_norm0=batch_norm[y==0]
    batch_norm1=batch_norm[y==1]

    next_batch_0 = torch.linalg.norm(X.unsqueeze(0)-x0.unsqueeze(1),dim=2).argsort(dim=1)[:,1:32+1]
    next_batch_0=torch.reshape(next_batch_0,(-1,))

    next_batch_1 = torch.linalg.norm(X.unsqueeze(0)-x1.unsqueeze(1),dim=2).argsort(dim=1)[:,1:32+1]
    next_batch_1=torch.reshape(next_batch_1,(-1,))

    
    
    next_batch_weights_0=torch.cat(tuple(batch_norm0.repeat(32,1).T))
    next_batch_weights_1=torch.cat(tuple(batch_norm1.repeat(32,1).T))


    ## weighted selection of next batch.
    next_batch_0 = next_batch_0[next_batch_weights_0.multinomial(32)]
    next_batch_1 = next_batch_1[next_batch_weights_1.multinomial(32)]
    neigh_norms = 0
    for neigh  in next_batch_0:
        neigh_grads = batch_norm[neigh]
        neigh_norms += sum(neigh_grads)/len(neigh_grads)
    
    

    for neigh  in next_batch_1:
        neigh_grads = batch_norm[neigh]
        neigh_norms += sum(neigh_grads)/len(neigh_grads)
    
    neigh_norms /= (len(next_batch_1)+len(next_batch_0))
    
    # del neighbors

    del highest_grad_points
    gc.collect()
    return neigh_norms
   


In [39]:
# %%
from functorch import make_functional_with_buffers, vmap, grad

base_model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
base_model.to(device)

MulticlassClassification(
  (layer_1): Linear(in_features=28, out_features=2048, bias=True)
  (layer_2): Linear(in_features=2048, out_features=512, bias=True)
  (layer_3): Linear(in_features=512, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=2, bias=True)
  (relu): ReLU()
  (batchnorm1): InstanceNorm1d(2048, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (batchnorm2): InstanceNorm1d(512, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
  (batchnorm3): InstanceNorm1d(128, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)

In [40]:
ft_compute_grad = grad(compute_loss_stateless_model)
ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0))

In [46]:
loss_stats = {
    'train': [],
    "val": [],
    "grad": [],
    "var-1":[],
    "var-2":[],
    "var-3":[],
    "var-4":[],
    "var-5":[],
    "var-6":[],
    "var-7":[],
    "var-8":[],
    "var-9":[],
    "var-10":[],

}

In [24]:
import sys

In [50]:
def get_variance_batch(max_grad_data_point,y_max_grad_data_point):

    new_batch=[]
    # print(max_grad_data_point.shape)
    d=max_grad_data_point.shape[1]

    for i in range(50,60):
        for j in range(100):
            random_point=np.random.rand(d)
            norm=np.linalg.norm(random_point)
            random_point=random_point*(i/norm)
            new_batch.append(random_point)
            # print(torch.linalg.norm(random_point))
    return torch.tensor(new_batch),torch.tensor([y_max_grad_data_point]*1000)


In [81]:
l,m=get_variance_batch(X[[222]],y[[222]])

In [51]:
def wandb_trainer_function(config=None):
    # INITIALIZE NEW WANDB RUN
    with wandb.init(config=config) as run:
        #USING THE CONFIG TO SET THE HYPERPARAMETERS FOR EACH RUN
        config = wandb.config
        wandb.define_metric("custom_step")
        wandb.define_metric("Neighbor Gradient", step_metric='custom_step')
        wandb.define_metric("Average Gradient", step_metric='custom_step')
        wandb.define_metric("Train Loss", step_metric='custom_step')
        wandb.define_metric("Val Loss", step_metric='custom_step')
        

        run.name = "NN-HIGGS-50:60-Variance"

        model = copy.deepcopy(base_model)

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=True)

        global fmodel

        print("Begin training.")
        train_epoch_loss=0
        val_epoch_loss = 0
        gradient_norm_epoch = 0
        for e in tqdm(range(1, EPOCHS+1)):
            torch.cuda.empty_cache()
            model.train()
            optimizer.zero_grad()
            
            pred = model(X.float())

            train_loss = criterion(pred, y.long())
            train_loss.backward()
            optimizer.step()

            fmodel, params, buffers = make_functional_with_buffers(model)

            ### use functorch to calculate per sample gradient
            ft_per_sample_grads = ft_compute_sample_grad(params, buffers, X.float(), y.long())

            ### calculate norm of the gradient and use it to compute next batch
            batch_norm = torch.zeros(X.size(0), device=device)
            for item in ft_per_sample_grads:
                batch_norm +=  torch.linalg.norm(item, dim=tuple(range(1,len(item.shape))))
            gradient_norm_epoch = batch_norm.sum().item()/X.size(0)

            ###CODE FOR VARIANCE BATCH EXPERIMENT STARTS HERE
            max_grad_data_point=batch_norm.argsort(descending=True)[0:1]
            
            variance_batch, y_max_grad_data_point=get_variance_batch(X[max_grad_data_point],y[max_grad_data_point])

            ft_per_sample_grads_exp = ft_compute_sample_grad(params, buffers, variance_batch.float(), y_max_grad_data_point.long())

            batch_norm_exp = torch.zeros(variance_batch.size(0), device=device)
            for item in ft_per_sample_grads_exp:
                batch_norm_exp +=  torch.linalg.norm(item, dim=tuple(range(1,len(item.shape))))
            
            gradient_norm_epoch_exp=torch.zeros(10, device=device)

            for i in range(10):
                gradient_norm_epoch_exp[i]=batch_norm_exp[i:i+100].sum().item()/100
                loss_stats[f'var-{i+1}']=gradient_norm_epoch_exp[i]

            ###CODE FOR VARIANCE BATCH EXPERIMENT ENDS HERE
            del ft_per_sample_grads
            del fmodel
            del params
            del buffers
            gc.collect()
            
            ## compute neighbors of points that give largest gradients, and see if their gradients are higher in general
            # neighbor_norms = compute_next_batch(X, y, batch_norm)



            with torch.no_grad():
                model.eval()
                for X_val, y_val in val_loader:
                    val_pred = model(X_val.float())
                    val_loss = criterion(val_pred, y_val.long())
                    val_epoch_loss += val_loss.item()


                # TRAIN LOSS AND ACCURACY
                loss_stats['train'].append(train_loss)
                loss_stats['val'].append(val_epoch_loss/len(val_loader))
                loss_stats['grad'].append(gradient_norm_epoch)


                ## plot val loss and accuracy here. For train, standardise a subset for loss/accuracy
                print(f'Epoch {e+0:03}: | Train Loss: {loss_stats["train"][-1]:.5f} | Val Loss: {loss_stats["val"][-1]:.5f} | Avg Grad: {gradient_norm_epoch}')
                wandb.log({"Train Loss":loss_stats["train"][-1], "Val Loss":loss_stats["val"][-1], "Average Gradient": gradient_norm_epoch, "Var-1": loss_stats["var-1"],"Var-2": loss_stats["var-2"],"Var-3": loss_stats["var-3"],"Var-4": loss_stats["var-4"],"Var-5": loss_stats["var-5"], "Var-6": loss_stats["var-6"],"Var-7": loss_stats["var-7"],"Var-8": loss_stats["var-8"],"Var-9": loss_stats["var-9"],"Var-10": loss_stats["var-10"], 'custom_step': e})

                val_epoch_loss = 0
                val_epoch_acc = 0
                gradient_norm_epoch = 0

tensor([[ 56.,  56.,  20.],
        [ 48.,  51.,  12.],
        [ 11.,  14.,   0.],
        ...,
        [  5.,   3., 176.],
        [ 63.,  77., 130.],
        [ 98.,  75.,  59.]], dtype=torch.float64)


In [53]:
wandb.agent(sweep_id, wandb_trainer_function)

[34m[1mwandb[0m: Agent Starting Run: upgc3yl8 with config:
[34m[1mwandb[0m: 	EPOCHS: 10
[34m[1mwandb[0m: 	LEARNING_RATE: 0.0001
[34m[1mwandb[0m: 	NUM_CLASSES: 2
[34m[1mwandb[0m: 	NUM_FEATURES: 28
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Begin training.


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 001: | Train Loss: 0.76314 | Val Loss: 0.77983 | Avg Grad: 41.60123263888889
Epoch 002: | Train Loss: 0.78225 | Val Loss: 0.73554 | Avg Grad: 36.807170138888885
Epoch 003: | Train Loss: 0.73783 | Val Loss: 0.69472 | Avg Grad: 33.26524305555556
Epoch 004: | Train Loss: 0.69589 | Val Loss: 0.69649 | Avg Grad: 30.52273611111111
Epoch 005: | Train Loss: 0.69685 | Val Loss: 0.70784 | Avg Grad: 28.514027777777777
Epoch 006: | Train Loss: 0.70776 | Val Loss: 0.70789 | Avg Grad: 27.108569444444445
Epoch 007: | Train Loss: 0.70755 | Val Loss: 0.70024 | Avg Grad: 26.071727430555555
Epoch 008: | Train Loss: 0.69961 | Val Loss: 0.69245 | Avg Grad: 25.200890625
Epoch 009: | Train Loss: 0.69188 | Val Loss: 0.68914 | Avg Grad: 24.431663194444443
Epoch 010: | Train Loss: 0.68869 | Val Loss: 0.69015 | Avg Grad: 23.75315625


VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Average Gradient,█▆▅▄▃▂▂▂▁▁
Train Loss,▇█▅▂▂▂▂▂▁▁
Val Loss,█▅▁▂▂▂▂▁▁▁
Var-1,█▆▄▄▄▄▃▂▁▁
Var-10,█▆▄▄▄▄▃▂▂▁
Var-2,█▆▄▄▄▄▃▂▂▁
Var-3,█▆▄▄▄▄▃▂▂▁
Var-4,█▆▄▄▄▄▃▂▂▁
Var-5,█▆▄▄▄▄▃▂▂▁
Var-6,█▆▄▄▄▄▃▂▂▁

0,1
Average Gradient,23.75316
Train Loss,0.68869
Val Loss,0.69015
Var-1,25.08708
Var-10,25.12664
Var-2,25.03853
Var-3,25.02548
Var-4,25.01478
Var-5,25.02056
Var-6,25.03536


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
