In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import matplotlib as plt
import pandas as pd

from torch.utils.data import DataLoader, TensorDataset, random_split
from torchvision.datasets.utils import download_url


# Download e esplorazione dei dati

In [None]:
# Remote CSV with the insurance data and the local file name it is stored under.
DATASET_URL = "https://gist.github.com/BirajCoder/5f068dfe759c1ea6bdfce9535acdb72d/raw/c84d84e3c80f93be67f6c069cbdc0195ec36acbd/insurance.csv"
DATA_FILENAME = "insurance.csv"
# Pass the file name explicitly so the saved file always matches DATA_FILENAME,
# instead of relying on the URL's last path component happening to be the same.
download_url(DATASET_URL, '.', filename=DATA_FILENAME)

In [None]:
# Load the downloaded CSV into a dataframe and preview its first rows.
dataframe_raw = pd.read_csv(DATA_FILENAME)
dataframe_raw.head()

In [None]:
# Seed string for customize_dataset: its first four characters drive the row sample,
# the bmi and charges scaling factors, and whether 'region' is dropped, so it needs
# at least four characters.
your_name = 'Renato'

In [None]:
def customize_dataset(dataframe_raw, rand_str):
    """Return a personalised variant of ``dataframe_raw`` driven by ``rand_str``.

    The raw dataframe is never modified. The first four characters of
    ``rand_str`` are used, via their code points, as follows:

    * character 0 seeds the random sample that keeps 95% of the rows;
    * character 1 scales the ``bmi`` column (code point / 100);
    * character 2 scales the ``charges`` column (code point / 100);
    * character 3 decides whether ``region`` is dropped (odd code point -> dropped).

    Args:
        dataframe_raw: Source dataframe with ``bmi``, ``charges`` and ``region`` columns.
        rand_str: String with at least four characters.

    Returns:
        A new dataframe with the sampled rows and rescaled columns.
    """
    # Deep copy so none of the changes below leak into the caller's dataframe.
    customized = dataframe_raw.copy(deep=True)

    # Keep a reproducible 95% sample of the rows (the sample is seeded by character 0).
    rows_to_keep = int(0.95 * len(customized))
    customized = customized.sample(rows_to_keep, random_state=int(ord(rand_str[0])))

    # Rescale one input column.
    bmi_code = ord(rand_str[1])
    customized.bmi = customized.bmi * bmi_code / 100.

    # Rescale the target column.
    charges_code = ord(rand_str[2])
    customized.charges = customized.charges * charges_code / 100

    # Optionally remove the 'region' column.
    region_is_dropped = ord(rand_str[3]) % 2 == 1
    if region_is_dropped:
        customized = customized.drop(columns=['region'])

    return customized


In [None]:
# Build the personalised dataset from the raw frame and preview its first rows.
dataframe = customize_dataset(dataframe_raw, your_name)
dataframe.head()

**Domanda 1:** quante righe ha il **'dataframe'**?

In [None]:
# Row count of the customised dataframe (first component of its shape).
num_rows = dataframe.shape[0]
num_rows

**Domanda 2:** quante colonne ha il 'dataframe'?

In [None]:
# Column count of the customised dataframe (second component of its shape).
num_columns = dataframe.shape[1]
num_columns

**Domanda 3:** Quali sono i titoli delle colonne di input?

In [None]:
# Every column except the target is an input. Selecting by name rather than by
# position keeps 'region' among the inputs when customize_dataset does not drop it.
input_cols = [col for col in dataframe.columns if col != 'charges']
input_cols

**Domanda 4:** Quali colonne non possiedono valori numerici?

In [None]:
# Use pandas' generic 'number' selector instead of a hand-written dtype list, so that
# numeric dtypes the list did not mention (int8, unsigned ints, ...) are not
# mistaken for categorical columns.
numerics = ['number']
categorical_cols = dataframe.select_dtypes(exclude = numerics).columns.tolist()
categorical_cols

**Domanda 5:** Quali sono i titoli delle colonne di output?

In [None]:
# Select the target by name: a positional slice would pick 'region' instead of
# 'charges' whenever customize_dataset keeps the 'region' column.
output_cols = ['charges']
output_cols

**Domanda bonus:** minimo, massimo e media dei valori in charges, si può vedere la distribuzione dei valori in un grafico?

In [None]:
# Summary statistics of the target column.
max_charges, min_charges, mean_charges = (
    dataframe["charges"].max(),
    dataframe["charges"].min(),
    dataframe["charges"].mean(),
)

print('Il valore massimo è: ', max_charges)
print('Il valore minimo è: ', min_charges)
print('Il valore medio è: ', mean_charges)

Distribuzione dei valori di charges:

In [None]:
# Kernel density estimate of the target values.
dataframe['charges'].plot.kde()

# Preparazione dataset per il training

Per poter eseguire il training abbiamo bisogno di tensori, quindi faremo una trasformazione da pandas a PyTorch: **Pandas --> NumPy --> PyTorch**

Passaggio da dataframe ad array:

In [None]:
def dataframe_to_arrays(dataframe):
    """Turn a dataframe into NumPy input and target arrays.

    Works on a deep copy, so the caller's dataframe is left untouched. Columns
    listed in the notebook-level ``categorical_cols`` are replaced by their
    integer category codes; the columns named by ``input_cols`` and
    ``output_cols`` are then extracted.

    Args:
        dataframe: Dataframe containing every column named in ``input_cols``,
            ``output_cols`` and ``categorical_cols``.

    Returns:
        Tuple ``(inputs_array, targets_array)`` of NumPy arrays.
    """
    encoded = dataframe.copy(deep=True)

    # Replace each non-numeric column with the integer codes of its categories.
    for name in categorical_cols:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes

    # Pull inputs and targets out as NumPy arrays.
    features = encoded[input_cols].to_numpy()
    labels = encoded[output_cols].to_numpy()
    return features, labels

In [None]:
# Convert the customised dataframe into NumPy input/target arrays and display both.
inputs_array, targets_array = dataframe_to_arrays(dataframe)
inputs_array, targets_array

**Domanda 6:** convertire array numpy in tensori PyTorch (NB! Fare attenzione che siano float32)

In [None]:
# Wrap the NumPy arrays as tensors and cast them to float32.
inputs = torch.from_numpy(inputs_array).float()
targets = torch.from_numpy(targets_array).float()

In [None]:
inputs.dtype, targets.dtype     # Check the dtype of both tensors

In [None]:
# Pair each input row with its target so the two can be split and batched together.
dataset = TensorDataset(inputs, targets)
print(len(dataset))

**Domanda 7:** Scegliere un numero tra 0.1 e 0.2 per determinare la frazione di dati che verranno usati per creare un validation set, quindi usare random_split per creare training e validation set

In [None]:
# Fraction of the samples held out for validation.
val_percent = 0.2
# Derive both sizes from the dataset actually being split, so they always add up to
# its length.
val_size = int(len(dataset) * val_percent)
print('Dimensione validation test: ', val_size)
train_size = len(dataset) - val_size
print('Dimensione training test: ', train_size)

# A seeded generator gives the same train/validation split on every run, which keeps
# validation losses comparable across kernel restarts.
train_ds, val_ds = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

**Domanda 8:** Scegliere una dimensione di lotto per il data loader

In [None]:
# Number of samples per mini-batch, used by both data loaders.
batch_size = 50

In [None]:
# Only the training loader reshuffles its samples; the validation loader does not.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)

In [None]:
# Print a single training batch as a sanity check, then stop iterating.
for xb, yb in train_loader:
    print('inputs: ', xb)
    print('targets: ', yb)
    break

# Creare un modello di regressione lineare

In [None]:
def accuracy(outputs, labels):
    """Fraction of rows whose highest-scoring column index equals the label.

    Args:
        outputs: 2-D tensor of scores, one row per sample.
        labels: 1-D tensor of expected column indices, one per sample.

    Returns:
        A 0-dim tensor holding ``matches / number_of_rows``.
    """
    predicted = torch.max(outputs, dim=1).indices
    matches = (predicted == labels).sum().item()
    return torch.tensor(matches / predicted.shape[0])

In [None]:
# Layer dimensions follow from the number of selected input and output columns.
input_size = len(input_cols)
output_size = len(output_cols)

In [None]:
# Display the number of input columns.
input_size

In [None]:
# Display the number of output columns.
output_size

In [None]:
class InsuranceModel(nn.Module):
    """Single linear layer trained and validated with the L1 (mean absolute error) loss.

    Args:
        in_features: Number of input features. When omitted, the notebook-level
            ``input_size`` is used.
        out_features: Number of regression outputs. When omitted, the
            notebook-level ``output_size`` is used.
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()
        # Fall back to the notebook globals only when no explicit size is given, so
        # ``InsuranceModel()`` keeps working while explicit sizes avoid hidden state.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = output_size
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, xb):
        """Apply the linear layer to a batch of inputs."""
        return self.linear(xb)

    def _batch_loss(self, batch):
        """L1 loss between the model output and the targets of one ``(inputs, targets)`` batch."""
        inputs, targets = batch
        return F.l1_loss(self(inputs), targets)

    def training_step(self, batch):
        """Return the differentiable loss for one training batch."""
        return self._batch_loss(batch)

    def validation_step(self, batch):
        """Return ``{'val_loss': loss}`` for one validation batch, without tracking gradients."""
        # No autograd graph is needed for validation, so skip building one.
        with torch.no_grad():
            loss = self._batch_loss(batch)
        return {'val_loss': loss.detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses into ``{'val_loss': float}``."""
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        return {'val_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result, num_epochs):
        """Print the validation loss every 20th epoch and on the last epoch."""
        if (epoch+1) % 20 == 0 or epoch == num_epochs-1:
            print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))

In [None]:
# Instantiate the model; its layer sizes come from input_size and output_size.
model = InsuranceModel()

In [None]:
# parameters() returns a generator, whose repr shows nothing useful; materialise it
# so the cell actually displays the weight and bias tensors.
list(model.parameters())

In [None]:
# Inspect the weights of the linear layer before training.
model.linear.weight

In [None]:
# Inspect the bias of the linear layer before training.
model.linear.bias

In [None]:
# Print the first batch produced by the validation loader.
print(next(iter(val_loader)))

In [None]:
# Display the linear layer's weights again (no training step has run yet at this point).
model.linear.weight

# Training del modello

In [None]:
def evaluate(model, val_loader):
    """Run the model over every validation batch and aggregate the results.

    Args:
        model: Object exposing ``validation_step(batch)`` and
            ``validation_epoch_end(outputs)``.
        val_loader: Iterable of validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the list of
        per-batch results.
    """
    # Validation never backpropagates, so disable gradient tracking to avoid building
    # an autograd graph for every batch.
    with torch.no_grad():
        outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train ``model`` for ``epochs`` epochs and collect one validation result per epoch.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to the optimizer.
        model: Model exposing ``parameters``, ``training_step`` and ``epoch_end``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches, passed to ``evaluate``.
        opt_func: Optimizer factory called as ``opt_func(model.parameters(), lr)``.

    Returns:
        List with the ``evaluate`` result of every epoch, in order.
    """
    optimizer = opt_func(model.parameters(), lr)
    history = []
    for epoch in range(epochs):
        # Training phase: one optimizer update per batch.
        for batch in train_loader:
            batch_loss = model.training_step(batch)
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase: evaluate, report and record.
        epoch_result = evaluate(model, val_loader)
        model.epoch_end(epoch, epoch_result, epochs)
        history.append(epoch_result)
    return history

**Domanda 9:** usa la funzione evaluate per calcolare la perdita del validation set prima del training

In [None]:
# Validation loss of the still-untrained model, as a baseline.
result = evaluate(model, val_loader)
print(result)

In [None]:
# Training round 1: 500 epochs at learning rate 0.1; per-epoch validation results go to history1.
epochs, lr = 500, 0.1
history1 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Training round 2: another 500 epochs on the same model at learning rate 0.1.
epochs, lr = 500, 0.1
history2 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Training round 3: another 500 epochs on the same model at learning rate 0.1.
epochs, lr = 500, 0.1
history3 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Training round 4: another 500 epochs on the same model at learning rate 0.1.
epochs, lr = 500, 0.1
history4 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Training round 5: another 500 epochs on the same model at learning rate 0.1.
epochs, lr = 500, 0.1
history5 = fit(epochs, lr, model, train_loader, val_loader)

In [None]:
# Validation loss of the model after the training rounds above.
val_loss = evaluate(model, val_loader)
val_loss

# NB! 

In [None]:
def predict_single(input, target, model):
    """Print one sample's input, its true target and the model's prediction.

    Args:
        input: 1-D feature tensor of a single sample.
        target: Expected output for that sample (only printed).
        model: Callable model that accepts a batch of inputs.

    Returns:
        None. The three values are printed.
    """
    # The model expects a batch, so add a leading batch dimension of size 1.
    inputs = input.unsqueeze(0)
    # Take the model output for that single row as-is. Reducing it with torch.max
    # (a classification leftover) would collapse it to one scalar. Inference needs no
    # gradients.
    with torch.no_grad():
        prediction = model(inputs)[0]
    print("Input:", input)
    print("Target:", target)
    print("Prediction:", prediction)

In [None]:
# Print the first (input, target) pair of the validation set.
print(next(iter(val_ds)))

In [None]:
# Compare the prediction with the true target for validation sample 0.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the session.
input, target = val_ds[0]
predict_single(input, target, model)

In [None]:
# Compare the prediction with the true target for validation sample 10.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the session.
input, target = val_ds[10]
predict_single(input, target, model)

In [None]:
# Compare the prediction with the true target for validation sample 23.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the session.
input, target = val_ds[23]
predict_single(input, target, model)