In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

For this assignment you must complete the following.

- Write a classification neural network that predicts the probability of either "+" or "-" for the column a16.

- Use early stopping to know when to complete your training.
- For all columns that are categorical, you must convert them to dummy variables.
- Some columns have missing values, fill these missing values with the median of that column.
- This is a simple neural network using basic techniques, do not worry too much about overall accuracy.
- Predict/submit for the entire dataset that I gave you, training and validation, you should have the same number of rows as crx.csv (690 data and 1 header row).

In [2]:
df_ = pd.read_csv('/Users/rishinigam/t81_588_course/datasets/crx.csv')

In [3]:
df = df_.copy()

In [4]:
df.head()

Unnamed: 0,a1,a2,s3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [5]:
# editing variables for further processing
df['a16'] = df['a16'].replace('+',1)
df['a16'] = df['a16'].replace('-',0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a1      690 non-null    object 
 1   a2      690 non-null    object 
 2   s3      690 non-null    float64
 3   a4      690 non-null    object 
 4   a5      690 non-null    object 
 5   a6      690 non-null    object 
 6   a7      690 non-null    object 
 7   a8      690 non-null    float64
 8   a9      690 non-null    object 
 9   a10     690 non-null    object 
 10  a11     690 non-null    int64  
 11  a12     690 non-null    object 
 12  a13     690 non-null    object 
 13  a14     690 non-null    object 
 14  a15     690 non-null    int64  
 15  a16     690 non-null    int64  
dtypes: float64(2), int64(3), object(11)
memory usage: 86.4+ KB


In [7]:
df.isna().sum()

a1     0
a2     0
s3     0
a4     0
a5     0
a6     0
a7     0
a8     0
a9     0
a10    0
a11    0
a12    0
a13    0
a14    0
a15    0
a16    0
dtype: int64

### Cleaning and prepping data

In [8]:
df['a2'] = df['a2'].replace('?', np.nan)
a2_median = df['a2'].median()
df['a2'].fillna(a2_median, inplace=True)
df['a2'] = df['a2'].astype('float64')

df['a14'] = df['a14'].replace('?', np.nan)
a14_median = df['a14'].median()
df['a14'].fillna(a14_median, inplace=True)
df['a14'] = df['a14'].astype('float64')

In [9]:
cat_cols = df.select_dtypes('object')

In [10]:
cat_cols_ = cat_cols.columns
cat_cols_

Index(['a1', 'a4', 'a5', 'a6', 'a7', 'a9', 'a10', 'a12', 'a13'], dtype='object')

In [11]:
for col in cat_cols.columns:
    print("*" * 50)
    print(str(col))
    print(df[col].unique())
    print("*" * 25)
    print(df[col].value_counts())
    print("*" * 25)

**************************************************
a1
['b' 'a' '?']
*************************
a1
b    468
a    210
?     12
Name: count, dtype: int64
*************************
**************************************************
a4
['u' 'y' '?' 'l']
*************************
a4
u    519
y    163
?      6
l      2
Name: count, dtype: int64
*************************
**************************************************
a5
['g' 'p' '?' 'gg']
*************************
a5
g     519
p     163
?       6
gg      2
Name: count, dtype: int64
*************************
**************************************************
a6
['w' 'q' 'm' 'r' 'cc' 'k' 'c' 'd' 'x' 'i' 'e' 'aa' 'ff' 'j' '?']
*************************
a6
c     137
q      78
w      64
i      59
aa     54
ff     53
k      51
cc     41
m      38
x      38
d      30
e      25
j      10
?       9
r       3
Name: count, dtype: int64
*************************
**************************************************
a7
['v' 'h' 'bb' 'ff' 'j' 'z' '?' 'o' 'd

### Interpretation:
- There are __?__ present in categorical columns too
- Populating missing categories with categories with least amount of data from each column

In [12]:
clean_cat_cols = ['a7', 'a6', 'a5', 'a4', 'a1']
df['a1'] = df['a1'].replace('?', 'a')
df['a4'] = df['a4'].replace('?', 'l')
df['a5'] = df['a5'].replace('?', 'gg')
df['a6'] = df['a6'].replace('?', 'r')
df['a7'] = df['a7'].replace('?', 'o')

### Label encoding

In [13]:
le = LabelEncoder()

dum_df = cat_cols.apply(le.fit_transform)

In [14]:
dum_df.head()

Unnamed: 0,a1,a4,a5,a6,a7,a9,a10,a12,a13
0,2,2,1,13,8,1,1,0,0
1,1,2,1,11,4,1,1,0,0
2,1,2,1,11,4,1,0,0,0
3,2,2,1,13,8,1,1,1,0
4,2,2,1,13,8,1,0,0,2


In [15]:
# joining df and dum_df
df = df.drop(cat_cols_, axis=1)
print(df.head())
final_df_ = df.join(dum_df)

      a2     s3    a8  a11    a14  a15  a16
0  30.83  0.000  1.25    1  202.0    0    1
1  58.67  4.460  3.04    6   43.0  560    1
2  24.50  0.500  1.50    0  280.0  824    1
3  27.83  1.540  3.75    5  100.0    3    1
4  20.17  5.625  1.71    0  120.0    0    1


In [16]:
final_df = final_df_.copy()

In [17]:
final_df.head(2)

Unnamed: 0,a2,s3,a8,a11,a14,a15,a16,a1,a4,a5,a6,a7,a9,a10,a12,a13
0,30.83,0.0,1.25,1,202.0,0,1,2,2,1,13,8,1,1,0,0
1,58.67,4.46,3.04,6,43.0,560,1,1,2,1,11,4,1,1,0,0


In [18]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a2      690 non-null    float64
 1   s3      690 non-null    float64
 2   a8      690 non-null    float64
 3   a11     690 non-null    int64  
 4   a14     690 non-null    float64
 5   a15     690 non-null    int64  
 6   a16     690 non-null    int64  
 7   a1      690 non-null    int64  
 8   a4      690 non-null    int64  
 9   a5      690 non-null    int64  
 10  a6      690 non-null    int64  
 11  a7      690 non-null    int64  
 12  a9      690 non-null    int64  
 13  a10     690 non-null    int64  
 14  a12     690 non-null    int64  
 15  a13     690 non-null    int64  
dtypes: float64(4), int64(12)
memory usage: 86.4 KB


In [19]:
#final_df.to_csv('../datasets/crx_enc.csv', index=False)

In [20]:
x = final_df.drop('a16', axis=1).values
scaler = StandardScaler()
x = scaler.fit_transform(x)
y = le.fit_transform(final_df['a16'])
cls = le.classes_

In [21]:
np.random.seed(42)
torch.manual_seed(42)

#mps_device = str(torch.device('mps'))
device = torch.device('mps')

In [22]:
# numpy to torch tensor
x_tr = torch.tensor(x, device=device, dtype=torch.float32)
y_tr = torch.tensor(y, device=device, dtype=torch.long)

In [23]:
x_tr.shape, y_tr.shape

(torch.Size([690, 15]), torch.Size([690]))

### Modelling and early stopping

In [24]:
# early stopping class
import copy

class EarlyStopping():
    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_loss = val_loss
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
        else:
            self.counter += 1
            self.status = f"No improvement in the last {self.counter} epochs"
            if self.counter >= self.patience:
                self.status = f"Early stopping triggered after {self.counter} epochs"
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False

In [25]:
# modelling and training

BATCH_SIZE = 16

dataset_train = TensorDataset(x_tr, y_tr)
dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)

# create model
model = nn.Sequential(
    nn.Linear(x_tr.shape[1], 50),
    nn.ReLU(),
    nn.Linear(50,25),
    nn.ReLU(),
    nn.Linear(25, len(cls)),
    nn.Sigmoid(),
)

model = torch.compile(model, backend='aot_eager').to(device)

loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

es = EarlyStopping()

epoch = 0
done = False

while epoch < 1000 and not done:
    epoch += 1
    steps = list(enumerate(dataloader_train))
    pbar = tqdm.tqdm(steps)
    model.train()
    for i, (x_batch, y_batch) in pbar:
        y_batch_pred = model(x_batch.to(device))
        loss = loss_fn(y_batch_pred, y_batch.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss, current = loss.item(), (i+1) * len(x_batch)
        if i == len(steps) - 1:
            model.eval()
            pred = model(x_tr)
            vloss = loss_fn(pred, y_tr)
            if es(model, vloss):
                done = True
            pbar.set_description(f"Epoch: {epoch}, tloss: {loss}, vloss: {vloss:>7f}, {es.status}")
        else:
            pbar.set_description(f'Epoch: {epoch}, tloss {loss:}')

Epoch: 1, tloss: 0.3137699067592621, vloss: 0.433007, : 100%|██████████| 44/44 [00:04<00:00, 10.29it/s]
Epoch: 2, tloss: 0.3136136531829834, vloss: 0.418041, Improvement found, counter reset to 0: 100%|██████████| 44/44 [00:00<00:00, 120.85it/s]
Epoch: 3, tloss: 0.31326404213905334, vloss: 0.419308, No improvement in the last 1 epochs: 100%|██████████| 44/44 [00:00<00:00, 157.69it/s]
Epoch: 4, tloss: 0.3133283257484436, vloss: 0.416139, Improvement found, counter reset to 0: 100%|██████████| 44/44 [00:00<00:00, 161.50it/s]
Epoch: 5, tloss: 0.35895752906799316, vloss: 0.421017, No improvement in the last 1 epochs: 100%|██████████| 44/44 [00:00<00:00, 140.70it/s]
Epoch: 6, tloss: 0.3132917881011963, vloss: 0.412780, Improvement found, counter reset to 0: 100%|██████████| 44/44 [00:00<00:00, 161.71it/s]
Epoch: 7, tloss: 0.31326165795326233, vloss: 0.407650, Improvement found, counter reset to 0: 100%|██████████| 44/44 [00:00<00:00, 146.10it/s]
Epoch: 8, tloss: 0.8132724165916443, vloss: 0

In [26]:
pred = model(x_tr)
vloss = loss_fn(pred, y_tr)
print(f"Loss = {vloss}")

Loss = 0.39295193552970886


In [27]:
pred.shape

torch.Size([690, 2])

In [28]:
type(pred)

torch.Tensor

In [37]:
top_p, predicted_class = torch.max(pred,1)
pred_arr = pred.cpu().detach().numpy()
px = pd.DataFrame(pred_arr, columns=['+', '-'], dtype=np.float64)
px['+'] = px['+'].apply(lambda x: float(x))

In [38]:
top_p.cpu().detach().numpy().shape

(690,)

In [39]:
px

Unnamed: 0,+,-
0,7.253057e-10,1.000000e+00
1,8.370072e-11,1.000000e+00
2,6.584205e-03,9.970058e-01
3,1.557166e-11,1.000000e+00
4,1.114385e-04,9.999645e-01
...,...,...
685,1.000000e+00,2.913498e-16
686,1.000000e+00,3.252083e-13
687,1.000000e+00,4.508613e-14
688,1.000000e+00,5.712516e-17


In [40]:
px.to_csv('../datasets/assignment_3_crx_pred.csv', index=False)