### 2.   Load and Explore Dataset

**[2.1]** Launch the magic commands for auto-relaoding external modules

In [1]:
#Solution
%load_ext autoreload
%autoreload 2

**[2.2]** Import the pandas and numpy packages

In [2]:
#Solution
import pandas as pd
import numpy as np

**[2.3]** Create a variable called `file_url` containing th url to the raw dataset

In [3]:
#Solution
file_url = 'https://raw.githubusercontent.com/aso-uts/applied_ds/master/unit3/dataset/Car%20Evaluation.csv'

**[2.4]** Load the data in a dataframe called `df`


In [4]:
#Solution:
df = pd.read_csv(file_url)

**[2.5]** Display the first 5 rows of df

In [5]:
# Solution
df.head()

Unnamed: 0,buying_price,maintenance_cost,doors,persons_capacity,luggage_boot,safety,evaluation
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


**[2.6]** Display the dimensions (shape) of df

In [6]:
# Solution
df.shape

(1728, 7)

**[2.7]** Display the summary (info) of df

In [7]:
# Solution
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying_price        1728 non-null object
maintenance_cost    1728 non-null object
doors               1728 non-null object
persons_capacity    1728 non-null object
luggage_boot        1728 non-null object
safety              1728 non-null object
evaluation          1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


**[2.8]** Display the descriptive statistics of df


In [8]:
# Solution
df.describe()

Unnamed: 0,buying_price,maintenance_cost,doors,persons_capacity,luggage_boot,safety,evaluation
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,high,high,5more,2,small,high,unacc
freq,432,432,432,576,576,576,1210


**[2.9]** Save the dataframe locally in the `data/raw` folder

In [9]:
# Solution
df.to_csv('../data/raw/car_evaluation.csv', index=False)

### 3. Prepare Data

**[3.1]** Create a copy of `df` and save it into a variable called `df_cleaned`

In [10]:
# Solution
df_cleaned = df.copy()

**[3.2]** Create a dictionary called `cats_dict` that contains the categorical variables as keys and their respective values sorted in ascending order

In [11]:
# Solution
cats_dict = {
    'buying_price': [['low', 'med', 'high', 'vhigh']],
    'maintenance_cost': [['low', 'med', 'high', 'vhigh']],
    'doors': [['2', '3', '4', '5more']],
    'persons_capacity': [['2', '4', 'more']],
    'luggage_boot': [['small', 'med', 'big']],
    'safety': [['low', 'med', 'high']],
    'evaluation': [['unacc', 'acc', 'good', 'vgood']],
}

**[3.3]** Import `StandardScaler` and `OrdinalEncoder` from `sklearn.preprocessing`

In [12]:
# Solution
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

**[3.4]** Iterate through the elements of `cast_dict`, instantiate an OrdinalEncoder() and transform the values of each column with this encoder

In [13]:
# Solution
for col, cats in cats_dict.items():
    col_encoder = OrdinalEncoder(categories=cats)
    df_cleaned[col] = col_encoder.fit_transform(df_cleaned[[col]])

**[3.5]** Create a list called `num_cols` that contains all numeric columns

In [14]:
# Solution
num_cols = ['buying_price', 'maintenance_cost', 'doors', 'persons_capacity', 'luggage_boot', 'safety']

**[3.6]** Instantiate a `StandardScaler` and called it `sc`

In [15]:
# Solution
sc = StandardScaler()

**[3.7]** Fit and transform the numeric feature of `df_cleaned` and replace the data into it

In [16]:
# Solution
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

**[3.8]** Convert the column `evaluation` as integer

In [17]:
# Solution
df_cleaned['evaluation'] = df_cleaned['evaluation'].astype(int)

**[3.9]** Import `split_sets_random` and `save_sets` from `src.data.sets`

In [18]:
# Solution
from src.data.sets import split_sets_random, save_sets

**[3.10]** Split the data into training and testing sets with 80-20 ratio

In [19]:
# Solution
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned, target_col='evaluation', test_ratio=0.2)

**[3.11]** Create the following folder: ../data/processed/car_evaluation/

In [20]:
!mkdir ../data/processed/car_evaluation

**[3.12]** Save the sets in the `data/processed/car_evaluation` folder

In [21]:
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, path='../data/processed/car_evaluation/')

**[3.13]** Import this class from `src/models/pytorch` and convert all sets to PytorchDataset

In [22]:
# Solution:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

### 4. Baseline Model

**[4.1]** Import `NullModel` from `src.models.null`

In [23]:
# Solution
from src.models.null import NullModel

**[4.2]** Instantiate a `NullModel` and call `.fit_predict()` on the training target to extract your predictions into a variable called `y_base`

In [24]:
# Solution:
baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

**[4.3]** Import `print_class_perf` from `src.models.performance`

In [25]:
# Solution:
from src.models.performance import print_class_perf

**[4.4]** Print the classification metrics for this baseline model

In [26]:
# Solution:
print_class_perf(y_base, y_train, set_name='Training', average='weighted')

Accuracy Training: 0.6988416988416989
F1 Training: 0.5749561249561249


### 5. Define Architecture

**[5.1]** Import `torch`, `torch.nn` as `nn` and `torch.nn.functional` as `F`

In [27]:
# Solution:
import torch
import torch.nn as nn
import torch.nn.functional as F

**[5.2]** Create in `src/models/pytorch.py` a class called `PytorchMultiClass` that inherits from `nn.Module` with:
- `num_features` as input parameter
- attributes:
    - `layer_1`: fully-connected layer with 32 neurons
    - `layer_out`: fully-connected layer with 4 neurons
    - `softmax`: softmax function
- methods:
    - `forward()` with `inputs` as input parameter, perform ReLU and DropOut on the fully-connected layer followed by the output layer with softmax

In [None]:
# Solution:
class PytorchMultiClass(nn.Module):
    def __init__(self, num_features):
        super(PytorchMultiClass, self).__init__()
        
        self.layer_1 = nn.Linear(num_features, 32)
        self.layer_out = nn.Linear(32, 4)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = F.dropout(F.relu(self.layer_1(x)), training=self.training)
        x = self.layer_out(x)
        return self.softmax(x)

**[5.3]** Instantiate `PytorchMultiClass` with the correct number of input feature and save it into a variable called `model`

In [28]:
# Solution:
from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

**[5.4]** Import `get_device()` from `src.models.pytorch` and set `model` to use the device available

In [29]:
# Solution:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)

**[5.5]** Print the architecture of `model`

In [30]:
# Solution:
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)


### 6. Train Model

**[6.1]** Instantiate a `nn.CrossEntropyLoss()` and save it into a variable called `criterion` 

In [31]:
# Solution:
criterion = nn.CrossEntropyLoss()

**[6.2]** Instantiate a `torch.optim.Adam()` optimizer with the model's parameters and 0.1 as learning rate and save it into a variable called `optimizer`

In [32]:
# Solution:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

**[6.3]** Create a function called `train_classification()` that will perform forward and back propagation and calculate loss and Accuracy scores

In [None]:
def train_classification(train_data, model, criterion, optimizer, batch_size, device, scheduler=None, generate_batch=None):
    """Train a Pytorch multi-class classification model

    Parameters
    ----------
    train_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    optimizer: torch.optim
        Optimizer
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    scheduler : torch.optim.lr_scheduler
        Pytorch Scheduler used for updating learning rate
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """
    
    # Set model to training mode
    model.train()
    train_loss = 0
    train_acc = 0
    
    # Create data loader
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:

        # Reset gradients
        optimizer.zero_grad()
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Make predictions
        output = model(feature)
        
        # Calculate loss for given batch
        loss = criterion(output, target_class.long())

        # Calculate global loss
        train_loss += loss.item()
        
        # Calculate gradients
        loss.backward()

        # Update Weights
        optimizer.step()
        
        # Calculate global accuracy
        train_acc += (output.argmax(1) == target_class).sum().item()

    # Adjust the learning rate
    if scheduler:
        scheduler.step()

    return train_loss / len(train_data), train_acc / len(train_data)

**[6.5]** Create a function called `test_classification()` that will perform forward and calculate loss and accuracy scores

In [None]:
def test_classification(test_data, model, criterion, batch_size, device, generate_batch=None):
    """Calculate performance of a Pytorch multi-class classification model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0
    test_acc = 0
    
    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            
            # Make predictions
            output = model(feature)
            
            # Calculate loss for given batch
            loss = criterion(output, target_class.long())

            # Calculate global loss
            test_loss += loss.item()
            
            # Calculate global accuracy
            test_acc += (output.argmax(1) == target_class).sum().item()

    return test_loss / len(test_data), test_acc / len(test_data)

**[6.5]** Create 2 variables called `N_EPOCHS` and `BATCH_SIZE` that will take respectively 50 and 32 as values

In [33]:
# Solution:
N_EPOCHS = 50
BATCH_SIZE = 32

**[6.6]** Create a for loop that will iterate through the specified number of epochs and will train the model with the training set and assess the performance on the validation set and print their scores

In [34]:
# Solution:
from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.0314	|	Acc: 75.3%
	(valid)	|	Loss: 0.0293	|	Acc: 81.8%
Epoch: 1
	(train)	|	Loss: 0.0304	|	Acc: 79.1%
	(valid)	|	Loss: 0.0290	|	Acc: 82.9%
Epoch: 2
	(train)	|	Loss: 0.0296	|	Acc: 81.3%
	(valid)	|	Loss: 0.0292	|	Acc: 82.7%
Epoch: 3
	(train)	|	Loss: 0.0297	|	Acc: 81.2%
	(valid)	|	Loss: 0.0288	|	Acc: 83.5%
Epoch: 4
	(train)	|	Loss: 0.0295	|	Acc: 81.8%
	(valid)	|	Loss: 0.0284	|	Acc: 85.3%
Epoch: 5
	(train)	|	Loss: 0.0294	|	Acc: 82.2%
	(valid)	|	Loss: 0.0288	|	Acc: 83.5%
Epoch: 6
	(train)	|	Loss: 0.0303	|	Acc: 79.7%
	(valid)	|	Loss: 0.0281	|	Acc: 86.1%
Epoch: 7
	(train)	|	Loss: 0.0296	|	Acc: 81.1%
	(valid)	|	Loss: 0.0284	|	Acc: 85.0%
Epoch: 8
	(train)	|	Loss: 0.0293	|	Acc: 82.1%
	(valid)	|	Loss: 0.0280	|	Acc: 86.4%
Epoch: 9
	(train)	|	Loss: 0.0292	|	Acc: 82.9%
	(valid)	|	Loss: 0.0282	|	Acc: 85.5%
Epoch: 10
	(train)	|	Loss: 0.0294	|	Acc: 81.8%
	(valid)	|	Loss: 0.0276	|	Acc: 87.6%
Epoch: 11
	(train)	|	Loss: 0.0287	|	Acc: 84.1%
	(valid)	|	Loss: 0.0279	|	Acc: 86.7%
Ep

**[6.7]** Save the model into the `models` folder

In [35]:
# Solution
torch.save(model, "../models/pytorch_multi_car_evaluation.pt")

**[6.8]** Assess the model performance on the testing set and print its scores

In [36]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.1f}')

	Loss: 0.0299	|	Accuracy: 0.8
