In [1]:
import copy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

device = torch.device("mps")

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Early stopping (see previous notebook --> neural_net_3.ipynb)
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_loss = val_loss
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
        else:
            self.counter += 1
            self.status = f"No improvement in the last {self.counter} epochs"
            if self.counter >= self.patience:
                self.status = f"Early stopping triggered after {self.counter} epochs."
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False

### K-fold cross validation using pytorch

- Three ways to handle model's predictions:
    - Choose the model with highest validation score
    - Preset new data to the model and average the results
    - Retrain a new model (using same parameters as the cross-validation) on the entire dataset (__Will use it here__)


#### Regression and classification with K-fold cross validation
- Regression:
    - Here number of different classes doesn't play much issue 
    - Hence can use sklearn's KFold

- Classification:
    - Could use sklearn's KFold but count of each class data would be different hence use *StratifiedKFold*
    - *Drift* is another issue when model is put into production and imablance of classes

### Problem statement: *Regression based problem to predict the age by given details*

In [3]:
df_ = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

In [4]:
df = df_.copy()

In [5]:
df.head()

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2000 non-null   int64  
 1   job             2000 non-null   object 
 2   area            2000 non-null   object 
 3   income          1941 non-null   float64
 4   aspect          2000 non-null   float64
 5   subscriptions   2000 non-null   int64  
 6   dist_healthy    2000 non-null   float64
 7   save_rate       2000 non-null   int64  
 8   dist_unhealthy  2000 non-null   float64
 9   age             2000 non-null   int64  
 10  pop_dense       2000 non-null   float64
 11  retail_dense    2000 non-null   float64
 12  crime           2000 non-null   float64
 13  product         2000 non-null   object 
dtypes: float64(7), int64(4), object(3)
memory usage: 218.9+ KB


#### Columns to handle:
- don't need *id*
- *job, area, product* columns to handle

In [7]:
df = df.drop(['id'], axis=1)

In [8]:
for col in df.select_dtypes("object"):
    print('Col name: {}'.format(col))
    print(df[col].nunique())
    dummies = pd.get_dummies(df[col], prefix=col, dtype='int')
    df[dummies.columns] = dummies

Col name: job
33
Col name: area
4
Col name: product
7


In [9]:
df.head()

Unnamed: 0,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,...,0,1,0,0,1,0,0,0,0,0
1,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,...,0,1,0,0,0,1,0,0,0,0
2,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,...,0,1,0,0,1,0,0,0,0,0
3,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,...,0,1,0,0,1,0,0,0,0,0
4,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,...,0,0,1,1,0,0,0,0,0,0


In [10]:
df.columns

Index(['job', 'area', 'income', 'aspect', 'subscriptions', 'dist_healthy',
       'save_rate', 'dist_unhealthy', 'age', 'pop_dense', 'retail_dense',
       'crime', 'product', 'job_11', 'job_al', 'job_am', 'job_ax', 'job_bf',
       'job_by', 'job_cv', 'job_de', 'job_dz', 'job_e2', 'job_f8', 'job_gj',
       'job_gv', 'job_kd', 'job_ke', 'job_kl', 'job_kp', 'job_ks', 'job_kw',
       'job_mm', 'job_nb', 'job_nn', 'job_ob', 'job_pe', 'job_po', 'job_pq',
       'job_pz', 'job_qp', 'job_qw', 'job_rn', 'job_sa', 'job_vv', 'job_zz',
       'area_a', 'area_b', 'area_c', 'area_d', 'product_a', 'product_b',
       'product_c', 'product_d', 'product_e', 'product_f', 'product_g'],
      dtype='object')

In [11]:
df = df.drop(['job', 'area', 'product'], axis=1)

In [12]:
# checking null vals in income
df['income'].isna().sum()

59

In [13]:
# fill na value with median value
med = df['income'].median()
df['income'] = df['income'].fillna(med)

In [14]:
df.head()

Unnamed: 0,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,...,0,1,0,0,1,0,0,0,0,0
1,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,...,0,1,0,0,0,1,0,0,0,0
2,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,...,0,1,0,0,1,0,0,0,0,0
3,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,...,0,1,0,0,1,0,0,0,0,0
4,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,...,0,0,1,1,0,0,0,0,0,0


In [15]:
# saving dataset for classification tasks
df.to_csv('../datasets/regression_processed.csv', index=False)

#### Inference:

 Tried MinMaxScaler got RMSE: 3.5
 Tried StandardScaler got RMSE: 4.24
 Using Zcore getting RMSE: 3.36

You can use scipy.stats.zscores across income, aspect, save_rate, subscriptions columns
 
- Adding view(-1,1): 
    [What is -1 in view?](https://stackoverflow.com/questions/42479902/what-does-view-do-in-pytorch)

In [16]:
# splitting data
x = df.drop(['age'], axis=1)
y = df['age'].values
print(x.shape), print(y.shape)

(2000, 53)
(2000,)


(None, None)

In [17]:
sclaer = MinMaxScaler()
x_sclaed = sclaer.fit_transform(x)

In [18]:
x_sclaed.shape

(2000, 53)

In [19]:
# convert scaled data to torch tensors
x_tnsr = torch.tensor(x_sclaed, dtype=torch.float32, device=device)
y_tnsr = torch.tensor(y, dtype=torch.float32, device=device).view(-1, 1)

#setting manual seed
torch.manual_seed(42)

# cross-validate
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# early stopping
patience = 10

fold = 0

for train_idx, test_idx in kf.split(x_tnsr):
    fold += 1
    print(f"Fold #{fold}")

    x_train, x_test = x_tnsr[train_idx], x_tnsr[test_idx]
    y_train, y_test = y_tnsr[train_idx], y_tnsr[test_idx]

    # pytorch dataloader
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # model
    model = nn.Sequential(
        nn.Linear(x_tnsr.shape[1], 20),
        nn.ReLU(),
        nn.Linear(20,10),
        nn.ReLU(),
        nn.Linear(10,1)
    )
    model = torch.compile(model, backend="aot_eager").to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    # early stopping
    best_loss = float('inf')
    early_Stopping_cntr = 0

    #training loop
    EPOCHS = 500
    epoch = 0
    done = False
    es = EarlyStopping()

    while not done and epoch<EPOCHS:
        epoch += 1
        model.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            op = model(x_batch)
            loss = loss_fn(op, y_batch)
            loss.backward()
            optimizer.step()

        # validation
        model.eval()
        with torch.no_grad():
            val_op = model(x_test)
            val_loss = loss_fn(val_op, y_test)
        
        if es(model, val_loss):
            done = True
    
    print(f"Epoch {epoch}/{EPOCHS}, Validation loss: ", f"{val_loss.item()}, {es.status}")

# final evaluation
model.eval()
with torch.no_grad():
    oss_pred = model(x_test)
score = torch.sqrt(loss_fn(oss_pred, y_test)).item()
print(f"Fold score (RMSE): {score}")

Fold #1
Epoch 116/500, Validation loss:  0.4894576072692871, Early stopping triggered after 5 epochs.
Fold #2
Epoch 128/500, Validation loss:  0.2492123693227768, Early stopping triggered after 5 epochs.
Fold #3
Epoch 151/500, Validation loss:  0.4484054744243622, Early stopping triggered after 5 epochs.
Fold #4
Epoch 100/500, Validation loss:  0.23313216865062714, Early stopping triggered after 5 epochs.
Fold #5
Epoch 127/500, Validation loss:  1.36281156539917, Early stopping triggered after 5 epochs.
Fold score (RMSE): 1.151145339012146


### Info on Patience in early stopping:
- During training model's performance on validation dataset is monitored at the ned of each epoch
- If this validation performance like validation loss doesn't improve for a specified number of consecutive epochs equal to the patience vlaue, training is stopped
- The model is then saved or the training process is terminated 

### Classification problem: *__Classify the products based on given inputs from the above dataset__*

- Will use the same dataset as above but will build a model to classify various products

In [20]:
df = df_.copy()

In [21]:
df.head()

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a


In [22]:
df.columns

Index(['id', 'job', 'area', 'income', 'aspect', 'subscriptions',
       'dist_healthy', 'save_rate', 'dist_unhealthy', 'age', 'pop_dense',
       'retail_dense', 'crime', 'product'],
      dtype='object')

In [23]:
df['product'].value_counts()

product
b    963
c    738
a    130
f     72
d     59
e     30
g      8
Name: count, dtype: int64

In [24]:
# cleaning income column
med = df['income'].median()
df['income'] = df['income'].fillna(med)

In [25]:
# changing category columns
cat_cols = ['job', 'area']
for col in cat_cols:
    print('Col name: {}'.format(col))
    print(df[col].nunique())
    dummies = pd.get_dummies(df[col], prefix=col, dtype='int')
    df[dummies.columns] = dummies
    df = df.drop(col, axis=1)

# standardizing ranges in other columns
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['subscriptions'] = zscore(df['subscriptions'])
df['save_rate'] = zscore(df['save_rate'])
df['age'] = zscore(df['age'])

Col name: job
33
Col name: area
4


In [26]:
# saving dataset for classification tasks
df.to_csv('../datasets/classification_processed.csv', index=False)

In [27]:
x = df.drop(['product', 'id'], axis = 1).values
dummies = pd.get_dummies(df['product'], dtype=int)
products = dummies.columns
y = dummies.values

In [28]:
x.shape, y.shape

((2000, 47), (2000, 7))

In [29]:
x[0]

array([-0.60754957, -0.66491815, -0.20844851,  9.01789549, -0.21576413,
       11.73893494,  0.85432106,  0.88582677,  0.49212598,  0.07109996,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ])

In [30]:
y.shape[1]

7

In [31]:
# model
model = nn.Sequential(
    nn.Linear(x.shape[1], 100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 25),
    nn.ReLU(),
    nn.Linear(25, y.shape[1]),
    nn.Softmax(dim=1),
)

model = torch.compile(model, backend="aot_eager").to(device)

kfold = StratifiedKFold(10, shuffle=True, random_state=42)

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

oos_y = []
oos_pred = []
fold = 0

for train, test in kfold.split(x, df['product']):
    fold += 1
    print(f"Fold #{fold}")

    x_train = torch.tensor(x[train], device=device, dtype=torch.float32)
    y_train = torch.tensor(np.argmax(y[train], axis=1), device=device, dtype=torch.long)
    x_test = torch.tensor(x[test], device=device, dtype=torch.float32)
    y_test = torch.tensor(np.argmax(y[test], axis=1), device=device, dtype=torch.long)

    # training loop
    EPOCHS = 500
    epoch = 0
    done = False
    es = EarlyStopping(restore_best_weights=True)

    while not done and epoch < EPOCHS:
        epoch += 1
        model.train()
        optimizer.zero_grad()
        output = model(x_train)
        loss = criterion(output, y_train)
        loss.backward()
        optimizer.step()

        # evaluate validation loss
        model.eval()
        with torch.no_grad():
            y_val = model(x_test)
            val_loss = criterion(y_val, y_test)
        
        if es(model, val_loss):
            done = True

    # prediction
    with torch.no_grad():
        y_val = model(x_test)
        _, pred = torch.max(y_val, 1)

    oos_y.append(y_test.cpu().numpy())
    oos_pred.append(pred.cpu().numpy())

    print(f"Epoch {epoch}/{EPOCHS}, validation loss: " f"{val_loss.item()}, {es.status}")

    # measure each fold's accuracy
    score = accuracy_score(y_test.cpu().numpy(), pred.cpu().numpy())
    print(f"FOld score (accuracy): {score}")

# build oos prediction list and calculate error
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)

score = accuracy_score(oos_y, oos_pred)
print(f"Final score (accuracy): {score}")

Fold #1
Epoch 189/500, validation loss: 1.4803826808929443, Early stopping triggered after 5 epochs.
FOld score (accuracy): 0.695
Fold #2
Epoch 7/500, validation loss: 1.4642550945281982, Early stopping triggered after 5 epochs.
FOld score (accuracy): 0.71
Fold #3
Epoch 7/500, validation loss: 1.4793931245803833, Early stopping triggered after 5 epochs.
FOld score (accuracy): 0.715
Fold #4
Epoch 7/500, validation loss: 1.4593509435653687, Early stopping triggered after 5 epochs.
FOld score (accuracy): 0.7
Fold #5
Epoch 8/500, validation loss: 1.434614896774292, Early stopping triggered after 5 epochs.
FOld score (accuracy): 0.755
Fold #6
Epoch 7/500, validation loss: 1.4001926183700562, Early stopping triggered after 5 epochs.
FOld score (accuracy): 0.78
Fold #7
Epoch 7/500, validation loss: 1.4221832752227783, Early stopping triggered after 5 epochs.
FOld score (accuracy): 0.75
Fold #8
Epoch 6/500, validation loss: 1.4068495035171509, Early stopping triggered after 5 epochs.
FOld scor

### Training Schedules

- **StepLR class** from *torch.optim.lr_scheduler* used for decreasing the learning rate by a certain factor 
- Generally put after optimizer.step()
- Three important parameters: 
    - __optimizer__: like SGD, Adam
    - __step_size__: number of epochs you want to reduce lr
    - __gamma__: factor by which lr will be reduced

### Dropout Regularization

- Main aim of regularization is to handle the balance between *underfitting* (bias) and *overfitting* (variance)
- *__Dropout__* is concept that drops out or temporarily turns off a fraction of neurons in the model during training and thereby reducing interdependencines of neurons
- It randomly disable a fraction of neurons (defined by a probability parameter, typeicalyy ranging from 0.2 to 0.5), which effectively creates a different architecture of the network each training instance

In [32]:
df_reg = pd.read_csv('/Users/rishinigam/t81_588_course/datasets/regression_processed.csv')
df_reg.head()

Unnamed: 0,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,...,0,1,0,0,1,0,0,0,0,0
1,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,...,0,1,0,0,0,1,0,0,0,0
2,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,...,0,1,0,0,1,0,0,0,0,0
3,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,...,0,1,0,0,1,0,0,0,0,0
4,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,...,0,0,1,1,0,0,0,0,0,0


In [35]:
x = torch.tensor(df_reg.values, dtype=torch.float32, device=device)
y = torch.tensor(df['age'].values, dtype=torch.float32, device=device).view(-1,1)

torch.manual_seed(42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

patience = 10
fold = 0

for train_idx, test_idx in kf.split(x):
    fold += 1
    print(f"Fold #{fold}")

    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # pytorch dataloader
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    # model
    model = nn.Sequential(
        nn.Linear(x.shape[1], 20),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.Linear(20,10),
        nn.Dropout(0.1),
        nn.ReLU(),
        nn.Linear(10,1)
    )
    model = torch.compile(model, backend="aot_eager").to(device)

    #optimizer
    optimizer = optim.Adam(model.parameters())
    loss_fn = nn.MSELoss()

    # early stopping variables
    best_loss = float("inf")
    early_stopping_counter = 0

    # training loop
    EPOCHS = 500
    epoch = 0
    done = False
    es = EarlyStopping()

    while not done and epoch < EPOCHS:
        epoch += 1
        model.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = loss_fn(output, y_batch)
            loss.backward()
            optimizer.step()

        # validation
        model.eval()
        with torch.no_grad():
            val_output = model(x_test)
            val_loss = loss_fn(val_output, y_test)

        if es(model, val_loss):
            done = True
    
    print(f"Epoch {epoch}/{EPOCHS}, Validation loss: " f"{val_loss.item()}, {es.status}")

# final evaluation
model.eval()
with torch.no_grad():
    oos_pred = model(x_test)
score = torch.sqrt(loss_fn(oos_pred, y_test)).item()
print(f"Fold score (RMSE): {score}")

Fold #1
Epoch 127/500, Validation loss: 1.0309289693832397, Early stopping triggered after 5 epochs.
Fold #2
Epoch 16/500, Validation loss: 1.0974091291427612, Early stopping triggered after 5 epochs.
Fold #3
Epoch 6/500, Validation loss: 1.058467984199524, Early stopping triggered after 5 epochs.
Fold #4
Epoch 7/500, Validation loss: 18.293764114379883, Early stopping triggered after 5 epochs.
Fold #5
Epoch 155/500, Validation loss: 0.8919046521186829, Early stopping triggered after 5 epochs.
Fold score (RMSE): 0.944202184677124


### Batch Normalization