In [1]:
import copy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

device = torch.device("mps")

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Early stopping (see previous notebook --> neural_net_3.ipynb)
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_loss = val_loss
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
        else:
            self.counter += 1
            self.status = f"No improvement in the last {self.counter} epochs"
            if self.counter >= self.patience:
                self.status = f"Early stopping triggered after {self.counter} epochs."
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False

### K-fold cross validation using pytorch

- Three ways to handle model's predictions:
    - Choose the model with highest validation score
    - Preset new data to the model and average the results
    - Retrain a new model (using same parameters as the cross-validation) on the entire dataset (__Will use it here__)


#### Regression and classification with K-fold cross validation
- Regression:
    - Here number of different classes doesn't play much issue 
    - Hence can use sklearn's KFold

- Classification:
    - Could use sklearn's KFold but count of each class data would be different hence use *StratifiedKFold*
    - *Drift* is another issue when model is put into production and imablance of classes

### Problem statement: *Regression based problem to predict the age by given details*

In [3]:
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

In [4]:
df.head()

Unnamed: 0,id,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,b
1,2,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,c
2,3,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,b
3,4,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,b
4,5,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,a


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2000 non-null   int64  
 1   job             2000 non-null   object 
 2   area            2000 non-null   object 
 3   income          1941 non-null   float64
 4   aspect          2000 non-null   float64
 5   subscriptions   2000 non-null   int64  
 6   dist_healthy    2000 non-null   float64
 7   save_rate       2000 non-null   int64  
 8   dist_unhealthy  2000 non-null   float64
 9   age             2000 non-null   int64  
 10  pop_dense       2000 non-null   float64
 11  retail_dense    2000 non-null   float64
 12  crime           2000 non-null   float64
 13  product         2000 non-null   object 
dtypes: float64(7), int64(4), object(3)
memory usage: 218.9+ KB


### Columns to handle:
- don't need *id*
- *job, area, product* columns to handle

In [6]:
df = df.drop(['id'], axis=1)

In [7]:
for col in df.select_dtypes("object"):
    print('Col name: {}'.format(col))
    print(df[col].nunique())
    dummies = pd.get_dummies(df[col], prefix=col, dtype='int')
    df[dummies.columns] = dummies

Col name: job
33
Col name: area
4
Col name: product
7


In [8]:
df.head()

Unnamed: 0,job,area,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,vv,c,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,...,0,1,0,0,1,0,0,0,0,0
1,kd,c,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,...,0,1,0,0,0,1,0,0,0,0
2,pe,c,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,...,0,1,0,0,1,0,0,0,0,0
3,11,c,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,...,0,1,0,0,1,0,0,0,0,0
4,kl,d,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,...,0,0,1,1,0,0,0,0,0,0


In [9]:
df.columns

Index(['job', 'area', 'income', 'aspect', 'subscriptions', 'dist_healthy',
       'save_rate', 'dist_unhealthy', 'age', 'pop_dense', 'retail_dense',
       'crime', 'product', 'job_11', 'job_al', 'job_am', 'job_ax', 'job_bf',
       'job_by', 'job_cv', 'job_de', 'job_dz', 'job_e2', 'job_f8', 'job_gj',
       'job_gv', 'job_kd', 'job_ke', 'job_kl', 'job_kp', 'job_ks', 'job_kw',
       'job_mm', 'job_nb', 'job_nn', 'job_ob', 'job_pe', 'job_po', 'job_pq',
       'job_pz', 'job_qp', 'job_qw', 'job_rn', 'job_sa', 'job_vv', 'job_zz',
       'area_a', 'area_b', 'area_c', 'area_d', 'product_a', 'product_b',
       'product_c', 'product_d', 'product_e', 'product_f', 'product_g'],
      dtype='object')

In [10]:
df = df.drop(['job', 'area', 'product'], axis=1)

In [11]:
# checking null vals in income
df['income'].isna().sum()

59

In [12]:
# fill na value with median value
med = df['income'].median()
df['income'] = df['income'].fillna(med)

In [13]:
df.head()

Unnamed: 0,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,0.0711,...,0,1,0,0,1,0,0,0,0,0
1,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,0.400809,...,0,1,0,0,0,1,0,0,0,0
2,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,0.207723,...,0,1,0,0,1,0,0,0,0,0
3,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,0.361216,...,0,1,0,0,1,0,0,0,0,0
4,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,0.068033,...,0,0,1,1,0,0,0,0,0,0


### Inference:

 Tried MinMaxScaler got RMSE: 3.5
 Tried StandardScaler got RMSE: 4.24
 Using Zcore getting RMSE: 3.36

You can use scipy.stats.zscores across income, aspect, save_rate, subscriptions columns
 
- Adding view(-1,1): 
    [What is -1 in view?](https://stackoverflow.com/questions/42479902/what-does-view-do-in-pytorch)

In [14]:
# splitting data
x = df.drop(['age'], axis=1)
y = df['age'].values
print(x.shape), print(y.shape)

(2000, 53)
(2000,)


(None, None)

In [15]:
sclaer = MinMaxScaler()
x_sclaed = sclaer.fit_transform(x)

In [16]:
x_sclaed.shape

(2000, 53)

In [17]:
# convert scaled data to torch tensors
x_tnsr = torch.tensor(x_sclaed, dtype=torch.float32, device=device)
y_tnsr = torch.tensor(y, dtype=torch.float32, device=device).view(-1, 1)

#setting manual seed
torch.manual_seed(42)

# cross-validate
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# early stopping
patience = 10

fold = 0

for train_idx, test_idx in kf.split(x_tnsr):
    fold += 1
    print(f"Fold #{fold}")

    x_train, x_test = x_tnsr[train_idx], x_tnsr[test_idx]
    y_train, y_test = y_tnsr[train_idx], y_tnsr[test_idx]

    # pytorch dataloader
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # model
    model = nn.Sequential(
        nn.Linear(x_tnsr.shape[1], 20),
        nn.ReLU(),
        nn.Linear(20,10),
        nn.ReLU(),
        nn.Linear(10,1)
    )
    model = torch.compile(model, backend="aot_eager").to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.MSELoss()

    # early stopping
    best_loss = float('inf')
    early_Stopping_cntr = 0

    #training loop
    EPOCHS = 500
    epoch = 0
    done = False
    es = EarlyStopping()

    while not done and epoch<EPOCHS:
        epoch += 1
        model.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            op = model(x_batch)
            loss = loss_fn(op, y_batch)
            loss.backward()
            optimizer.step()

        # validation
        model.eval()
        with torch.no_grad():
            val_op = model(x_test)
            val_loss = loss_fn(val_op, y_test)
        
        if es(model, val_loss):
            done = True
    
    print(f"Epoch {epoch}/{EPOCHS}, Validation loss: ", f"{val_loss.item()}, {es.status}")

# final evaluation
model.eval()
with torch.no_grad():
    oss_pred = model(x_test)
score = torch.sqrt(loss_fn(oss_pred, y_test)).item()
print(f"Fold score (RMSE): {score}")

Fold #1


Epoch 116/500, Validation loss:  0.4894576072692871, Early stopping triggered after 5 epochs.
Fold #2
Epoch 128/500, Validation loss:  0.2492123693227768, Early stopping triggered after 5 epochs.
Fold #3
Epoch 151/500, Validation loss:  0.4484054744243622, Early stopping triggered after 5 epochs.
Fold #4
Epoch 100/500, Validation loss:  0.23313216865062714, Early stopping triggered after 5 epochs.
Fold #5
Epoch 127/500, Validation loss:  1.36281156539917, Early stopping triggered after 5 epochs.
Fold score (RMSE): 1.151145339012146


### Info on Patience in early stopping:
- During training model's performance on validation dataset is monitored at the ned of each epoch
- If this validation performance like validation loss doesn't improve for a specified number of consecutive epochs equal to the patience vlaue, training is stopped
- The model is then saved or the training process is terminated 

### Classification problem: *__Classify the products based on given inputs__*