In [None]:
import torch
import jovian
import torchvision
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torch.utils.data import DataLoader, TensorDataset, random_split

## Step 1: Download and explore the data

Let us begin by downloading the data. We'll use the `download_url` function from torchvision to get the data as a CSV (comma-separated values) file. 

In [None]:
DATASET_URL = "https://hub.jovian.ml/wp-content/uploads/2020/05/insurance.csv"
DATA_FILENAME = "insurance.csv"
# Save under DATA_FILENAME explicitly so the file written here is guaranteed to be
# the one `pd.read_csv(DATA_FILENAME)` opens, even if the URL's basename changes.
download_url(DATASET_URL, '.', filename=DATA_FILENAME)

Using downloaded and verified file: ./insurance.csv


To load the dataset into memory, we'll use the `read_csv` function from the `pandas` library. The data will be loaded as a Pandas dataframe. See this short tutorial to learn more: https://data36.com/pandas-tutorial-1-basics-reading-data-files-dataframes-data-selection/

In [None]:
# Parse the downloaded CSV into a DataFrame and preview its first rows.
dataframe_raw = pd.read_csv(DATA_FILENAME)
dataframe_raw.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


We're going to do a slight customization of the data, so that every participant receives a slightly different version of the dataset. Fill in your name below as a string (enter at least 5 characters).

In [None]:
your_name = "santha" # at least 5 characters; passed to customize_dataset, which reads characters 0-3

The `customize_dataset` function will customize the dataset slightly using your name as a source of random numbers.

In [None]:
def customize_dataset(dataframe_raw, rand_str):
    """Return a personalised variant of ``dataframe_raw`` derived from ``rand_str``.

    The first four characters of ``rand_str`` drive the customisation:

    * character 0 seeds the random sample that keeps 95% of the rows,
    * character 1 scales the ``bmi`` column by ``ord(char) / 100``,
    * character 2 scales the ``charges`` column by ``ord(char) / 100``,
    * character 3 decides whether ``region`` is dropped (odd code point).

    The input frame is left untouched; a new frame is returned.
    """
    keep_count = int(0.95 * len(dataframe_raw))
    row_seed = int(ord(rand_str[0]))
    # Work on a deep copy so the caller's frame is never modified.
    customized = dataframe_raw.copy(deep=True).sample(keep_count, random_state=row_seed)

    # Scale the input feature.
    bmi_code = ord(rand_str[1])
    customized.bmi = customized.bmi * bmi_code / 100.

    # Scale the target.
    charges_code = ord(rand_str[2])
    customized.charges = customized.charges * charges_code / 100.

    # Optionally remove a column.
    drop_region = ord(rand_str[3]) % 2 == 1
    if drop_region:
        return customized.drop(columns=['region'])
    return customized

In [None]:
# Build the personalised dataset from the raw frame and preview it.
dataframe = customize_dataset(dataframe_raw, your_name)
dataframe.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1178,23,female,33.81905,0,no,northeast,3189.438285
1295,20,male,21.34,1,no,southwest,2161.258
205,28,female,28.0136,1,no,northeast,4771.50872
1067,39,male,41.37535,0,no,northeast,6333.154795
523,38,female,36.5981,0,no,southeast,5937.37837


Let us answer some basic questions about the dataset. 


**Q: How many rows does the dataset have?**

In [None]:
# The number of rows is the length of the frame.
num_rows = len(dataframe)
print(num_rows)

1271


**Q: How many columns does the dataset have?**

In [None]:
# The number of columns is the length of the column index.
num_cols = len(dataframe.columns)
print(num_cols)

7


**Q: What are the column titles of the input variables?**

In [None]:
# Every column except the last one is treated as an input.
input_cols = list(dataframe.columns[:-1])
print(input_cols)

['age', 'sex', 'bmi', 'children', 'smoker', 'region']


**Q: Which of the input columns are non-numeric or categorical variables?**

Hint: `sex` is one of them. List the columns that are not numbers.

In [None]:
# Print each column's dtype and non-null count to spot the non-numeric columns.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1271 entries, 1178 to 604
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1271 non-null   int64  
 1   sex       1271 non-null   object 
 2   bmi       1271 non-null   float64
 3   children  1271 non-null   int64  
 4   smoker    1271 non-null   object 
 5   region    1271 non-null   object 
 6   charges   1271 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 79.4+ KB


In [None]:
# Preview the non-numeric columns. They are selected by dtype rather than by name
# because `customize_dataset` drops `region` for some names, and indexing a missing
# column would raise a KeyError.
dataframe.select_dtypes(exclude="number").head(5)

Unnamed: 0,sex,smoker,region
1178,female,no,northeast
1295,male,no,southwest
205,female,no,northeast
1067,male,no,northeast
523,female,no,southeast


In [None]:
# Derive the categorical columns from the dtypes instead of hard-coding them:
# `customize_dataset` may have dropped `region`, in which case a hard-coded list
# would make `dataframe_to_arrays` fail with a KeyError.
categorical_cols = dataframe.select_dtypes(exclude="number").columns.to_list()

**Q: What are the column titles of output/target variable(s)?**

In [None]:
# Target column(s) the model is trained to predict.
output_cols = ["charges"]

**Q: (Optional) What is the minimum, maximum and average value of the `charges` column? Can you show the distribution of values in a graph?**
Use this data visualization cheatsheet for reference: https://jovian.ml/aakashns/dataviz-cheatsheet

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target column.
dataframe["charges"].describe()

count     1271.000000
mean     14647.332621
std      13422.657623
min       1234.061290
25%       5201.902613
50%      10356.412000
75%      18240.352466
max      70147.470811
Name: charges, dtype: float64

In [None]:
# Write your answer here
# min = 1234.061290   ,max = 70147.470811 , Average = 14647.332621

Remember to commit your notebook to Jovian after every step, so that you don't lose your work.

## Step 2: Prepare the dataset for training

We need to convert the data from the Pandas dataframe into PyTorch tensors for training. To do this, the first step is to convert it to numpy arrays. If you've filled out `input_cols`, `categorical_cols` and `output_cols` correctly, the following function will perform the conversion to numpy arrays.

In [None]:
def dataframe_to_arrays(dataframe):
    """Split ``dataframe`` into numpy arrays of inputs and targets.

    Reads the notebook-level lists ``categorical_cols``, ``input_cols`` and
    ``output_cols``. Each column named in ``categorical_cols`` is replaced by
    its pandas category codes before extraction. The caller's frame is not
    modified.

    Returns:
        A ``(inputs_array, targets_array)`` tuple of numpy arrays holding the
        ``input_cols`` and ``output_cols`` columns respectively.
    """
    # Encode on a private copy so the original frame keeps its string values.
    encoded = dataframe.copy(deep=True)
    for name in categorical_cols:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes
    # Extract inputs and outputs as numpy arrays.
    return encoded[input_cols].to_numpy(), encoded[output_cols].to_numpy()

Read through the [Pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html) to understand how we're converting categorical variables into numbers.

In [None]:
# Convert the customised frame to numpy arrays and display both for inspection.
inputs_array, targets_array = dataframe_to_arrays(dataframe)
inputs_array, targets_array

(array([[23.     ,  0.     , 33.81905,  0.     ,  0.     ,  0.     ],
        [20.     ,  1.     , 21.34   ,  1.     ,  0.     ,  3.     ],
        [28.     ,  0.     , 28.0136 ,  1.     ,  0.     ,  0.     ],
        ...,
        [53.     ,  0.     , 25.899  ,  2.     ,  0.     ,  3.     ],
        [26.     ,  1.     , 26.2482 ,  0.     ,  1.     ,  2.     ],
        [19.     ,  0.     , 27.4607 ,  0.     ,  1.     ,  1.     ]]),
 array([[ 3189.438285],
        [ 2161.258   ],
        [ 4771.50872 ],
        ...,
        [12265.858   ],
        [18747.67554 ],
        [19215.88229 ]]))

**Q: Convert the numpy arrays `inputs_array` and `targets_array` into PyTorch tensors. Make sure that the data type is `torch.float32`.**

In [None]:
# Copy both numpy arrays into float32 tensors.
inputs, targets = (
    torch.tensor(array, dtype=torch.float32)
    for array in (inputs_array, targets_array)
)

In [None]:
# Sanity check: both tensors should report torch.float32.
inputs.dtype, targets.dtype

(torch.float32, torch.float32)

Next, we need to create PyTorch datasets & data loaders for training & validation. We'll start by creating a `TensorDataset`.

In [None]:
# Pair each input row with its target so they can be indexed and split together.
dataset = TensorDataset(inputs, targets)

**Q: Pick a number between `0.1` and `0.2` to determine the fraction of data that will be used for creating the validation set. Then use `random_split` to create training & validation datasets.**

In [None]:
val_percent = 0.18 # between 0.1 and 0.2
# Size the split from the dataset actually being split, so this cell does not
# depend on a row count computed in an earlier cell.
val_size = int(len(dataset) * val_percent)
train_size = len(dataset) - val_size

# Seed the split so the same rows land in the validation set on every run;
# otherwise validation losses from different runs are not comparable.
split_generator = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=split_generator)

Finally, we can create data loaders for training & validation.

**Q: Pick a batch size for the data loader.**

In [None]:
# Number of samples per batch, used by both data loaders created below.
batch_size = 256

In [None]:
# Training batches are reshuffled every epoch; validation batches keep a fixed order.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)

Let's look at a batch of data to verify everything is working fine so far.

In [None]:
# Print only the first batch from the training loader, then stop iterating.
for xb, yb in train_loader:
    print("inputs:", xb)
    print("targets:", yb)
    break

inputs: tensor([[59.0000,  1.0000, 39.9058,  1.0000,  1.0000,  2.0000],
        [23.0000,  0.0000, 27.6353,  1.0000,  1.0000,  2.0000],
        [40.0000,  1.0000, 29.0030,  2.0000,  0.0000,  3.0000],
        ...,
        [27.0000,  0.0000, 17.4163,  2.0000,  1.0000,  0.0000],
        [50.0000,  0.0000, 26.2628,  1.0000,  0.0000,  0.0000],
        [38.0000,  0.0000, 39.3480,  1.0000,  0.0000,  1.0000]])
targets: tensor([[53867.2734],
        [20161.0625],
        [ 7260.3970],
        [ 7982.3955],
        [11117.9424],
        [12190.8350],
        [ 6512.1147],
        [ 6015.9072],
        [38676.1484],
        [ 5941.0786],
        [ 4351.6787],
        [48686.8242],
        [12488.5508],
        [37679.4570],
        [39841.3477],
        [13013.6680],
        [ 3529.6658],
        [22326.5488],
        [12080.7510],
        [ 8404.3398],
        [ 2383.4053],
        [ 1792.8169],
        [20944.9629],
        [13445.1885],
        [11155.2500],
        [39693.7422],
        [ 591

## Step 3: Create a Linear Regression Model

Our model itself is a fairly straightforward linear regression (we'll build more complex models in the next assignment). 


In [None]:
# Model dimensions: one input per entry in input_cols, one output per entry in output_cols.
input_size = len(input_cols)
output_size = len(output_cols)


**Q: Complete the class definition below by filling out the constructor (`__init__`), `forward`, `training_step` and `validation_step` methods.**

Hint: Think carefully about picking a good loss function (it's not cross entropy). Maybe try 2-3 of them and see which one works best. See https://pytorch.org/docs/stable/nn.functional.html#loss-functions

In [None]:
class InsuranceModel(nn.Module):
    """Single-layer linear regression model scored with L1 (mean absolute error) loss.

    Args:
        in_features: Number of input columns. When omitted, the notebook-level
            ``input_size`` is used.
        out_features: Number of target columns. When omitted, the notebook-level
            ``output_size`` is used.
    """

    def __init__(self, in_features=None, out_features=None):
        super().__init__()
        # Fall back to the notebook globals so a bare `InsuranceModel()` keeps working.
        if in_features is None:
            in_features = input_size
        if out_features is None:
            out_features = output_size
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, xb):
        """Flatten ``xb`` to rows of ``in_features`` values and apply the linear layer."""
        # Reshape against the layer's own width instead of the global `input_size`,
        # so a model keeps working if that global is reassigned after construction.
        xb = xb.reshape(-1, self.linear.in_features)
        return self.linear(xb)

    def _batch_loss(self, batch):
        """Return the L1 loss between the model's predictions and the batch targets."""
        inputs, targets = batch
        return F.l1_loss(self(inputs), targets)

    def training_step(self, batch):
        """Return the differentiable loss for one ``(inputs, targets)`` batch."""
        return self._batch_loss(batch)

    def validation_step(self, batch):
        """Return ``{'val_loss': loss}`` for one batch, detached from the autograd graph."""
        return {'val_loss': self._batch_loss(batch).detach()}

    def validation_epoch_end(self, outputs):
        """Average the per-batch ``val_loss`` tensors into one Python float."""
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        return {'val_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result, num_epochs):
        """Print the validation loss on every 20th epoch and on the final epoch."""
        if (epoch+1) % 20 == 0 or epoch == num_epochs-1:
            print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))

Let us create a model using the `InsuranceModel` class. You may need to come back later and re-run the next cell to reinitialize the model, in case the loss becomes `nan` or `infinity`.

In [None]:
# Create a model with newly initialised weights; re-run this cell to reset training.
model = InsuranceModel()

Let's check out the weights and biases of the model using `model.parameters`.

In [None]:
# Materialise the parameter iterator so the weight matrix and bias are displayed.
list(model.parameters())

[Parameter containing:
 tensor([[-0.3450, -0.2726,  0.3580, -0.3144,  0.2528, -0.0575]],
        requires_grad=True), Parameter containing:
 tensor([0.1438], requires_grad=True)]

One final commit before we train the model.

## Step 4: Train the model to fit the data

To train our model, we'll use the same `fit` function explained in the lecture. That's the benefit of defining a generic training loop - you can use it for any problem.

In [None]:
def evaluate(model, val_loader):
    """Run ``model`` over every batch of ``val_loader`` and return the combined result.

    Each batch is passed to ``model.validation_step``; the list of per-batch
    results is then reduced by ``model.validation_epoch_end``, whose return
    value is returned unchanged.
    """
    # Validation never calls backward(), so disable autograd to avoid building
    # (and holding in memory) a graph for every validation batch.
    with torch.no_grad():
        outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    """Train ``model`` for ``epochs`` passes and return the validation history.

    An optimizer is built as ``opt_func(model.parameters(), lr)``. In every
    epoch each training batch produces one backward pass and one optimizer
    step; afterwards the model is evaluated on ``val_loader``, the result is
    reported through ``model.epoch_end`` and appended to the history.

    Returns:
        A list with one validation result per epoch, in order.
    """
    optimizer = opt_func(model.parameters(), lr)
    history = []
    for epoch in range(epochs):
        # Training phase: backward pass, parameter update, then clear the gradients.
        for batch in train_loader:
            model.training_step(batch).backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase
        epoch_result = evaluate(model, val_loader)
        model.epoch_end(epoch, epoch_result, epochs)
        history.append(epoch_result)
    return history

**Q: Use the `evaluate` function to calculate the loss on the validation set before training.**

In [None]:
result = evaluate(model,val_loader) # Validation loss before any call to fit
print(result)

{'val_loss': 13587.1494140625}



We are now ready to train the model. You may need to run the training loop many times, for different number of epochs and with different learning rates, to get a good result. Also, if your loss becomes too large (or `nan`), you may have to re-initialize the model by running the cell `model = InsuranceModel()`. Experiment with this for a while, and try to get to as low a loss as possible.

**Q: Train the model 4-5 times with different learning rates & for different number of epochs.**

Hint: Vary learning rates by orders of 10 (e.g. `1e-2`, `1e-3`, `1e-4`, `1e-5`, `1e-6`) to figure out what works.

In [None]:
# Run 1: trains the current `model` in place; history1 holds the per-epoch
# validation results returned by fit.
epochs = 300
lr = 1e-1
history1 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 7164.4497
Epoch [40], val_loss: 7038.7725
Epoch [60], val_loss: 6914.0239
Epoch [80], val_loss: 6796.0176
Epoch [100], val_loss: 6693.2822
Epoch [120], val_loss: 6618.9258
Epoch [140], val_loss: 6561.0820
Epoch [160], val_loss: 6531.3877
Epoch [180], val_loss: 6514.2305
Epoch [200], val_loss: 6503.4077
Epoch [220], val_loss: 6492.8389
Epoch [240], val_loss: 6496.9604
Epoch [260], val_loss: 6488.2632
Epoch [280], val_loss: 6485.9604
Epoch [300], val_loss: 6482.3447


In [None]:
# Run 2: continues training the same `model` object (it is not re-created here)
# with a 10x larger learning rate.
epochs = 300
lr = 1
history2 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 6432.8652
Epoch [40], val_loss: 6658.8120
Epoch [60], val_loss: 6508.2471
Epoch [80], val_loss: 6529.8521
Epoch [100], val_loss: 6387.1650
Epoch [120], val_loss: 6379.8784
Epoch [140], val_loss: 6560.3511
Epoch [160], val_loss: 6812.1826
Epoch [180], val_loss: 6369.2964
Epoch [200], val_loss: 6464.6357
Epoch [220], val_loss: 6332.3418
Epoch [240], val_loss: 6466.4790
Epoch [260], val_loss: 6347.1401
Epoch [280], val_loss: 6326.6104
Epoch [300], val_loss: 6651.3301


In [None]:
# Run 3: continues training the same `model` object for more epochs at lr=2.
epochs = 1000
lr = 2
history3 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 7477.7974
Epoch [40], val_loss: 6417.1709
Epoch [60], val_loss: 6292.8882
Epoch [80], val_loss: 6272.5508
Epoch [100], val_loss: 6285.7285
Epoch [120], val_loss: 6364.3716
Epoch [140], val_loss: 6247.4336
Epoch [160], val_loss: 6357.3672
Epoch [180], val_loss: 7356.7485
Epoch [200], val_loss: 6220.9077
Epoch [220], val_loss: 6218.0493
Epoch [240], val_loss: 6358.1636
Epoch [260], val_loss: 7444.4526
Epoch [280], val_loss: 6363.7563
Epoch [300], val_loss: 6495.9062
Epoch [320], val_loss: 6369.0537
Epoch [340], val_loss: 6202.3062
Epoch [360], val_loss: 6232.5371
Epoch [380], val_loss: 6178.3804
Epoch [400], val_loss: 6181.5117
Epoch [420], val_loss: 6561.5542
Epoch [440], val_loss: 6754.6738
Epoch [460], val_loss: 6276.4951
Epoch [480], val_loss: 7037.0977
Epoch [500], val_loss: 6260.3789
Epoch [520], val_loss: 6825.3569
Epoch [540], val_loss: 6194.2222
Epoch [560], val_loss: 6061.8833
Epoch [580], val_loss: 6129.0498
Epoch [600], val_loss: 6253.5142
Epoch [620], v

In [None]:
# Run 4: long run on the same `model` object, back at lr=0.1.
epochs = 10000
lr = 1e-1
history4 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 5854.9790
Epoch [40], val_loss: 5853.3032
Epoch [60], val_loss: 5854.7056
Epoch [80], val_loss: 5849.8848
Epoch [100], val_loss: 5852.9941
Epoch [120], val_loss: 5849.8735
Epoch [140], val_loss: 5848.9136
Epoch [160], val_loss: 5849.2217
Epoch [180], val_loss: 5846.6270
Epoch [200], val_loss: 5846.3511
Epoch [220], val_loss: 5846.0625
Epoch [240], val_loss: 5856.0674
Epoch [260], val_loss: 5847.3887
Epoch [280], val_loss: 5845.3330
Epoch [300], val_loss: 5844.1025
Epoch [320], val_loss: 5844.1245
Epoch [340], val_loss: 5843.3418
Epoch [360], val_loss: 5841.7134
Epoch [380], val_loss: 5842.3740
Epoch [400], val_loss: 5841.7114
Epoch [420], val_loss: 5847.5801
Epoch [440], val_loss: 5840.5889
Epoch [460], val_loss: 5840.1738
Epoch [480], val_loss: 5839.1577
Epoch [500], val_loss: 5838.4336
Epoch [520], val_loss: 5841.7939
Epoch [540], val_loss: 5839.4824
Epoch [560], val_loss: 5838.4702
Epoch [580], val_loss: 5839.4468
Epoch [600], val_loss: 5838.0542
Epoch [620], v

In [None]:
# Run 5: final run on the same `model` object; the last entry of history5 is
# the model's final validation result.
epochs = 500
lr = 1e-1
history5 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 3607.5906
Epoch [40], val_loss: 3606.8621
Epoch [60], val_loss: 3606.6816
Epoch [80], val_loss: 3607.6594
Epoch [100], val_loss: 3609.0867
Epoch [120], val_loss: 3606.8306
Epoch [140], val_loss: 3607.2292
Epoch [160], val_loss: 3606.7717
Epoch [180], val_loss: 3609.0889
Epoch [200], val_loss: 3610.2295
Epoch [220], val_loss: 3607.8794
Epoch [240], val_loss: 3611.4490
Epoch [260], val_loss: 3610.0464
Epoch [280], val_loss: 3616.0364
Epoch [300], val_loss: 3609.6318
Epoch [320], val_loss: 3607.2827
Epoch [340], val_loss: 3607.0242
Epoch [360], val_loss: 3607.5989
Epoch [380], val_loss: 3607.6526
Epoch [400], val_loss: 3607.5830
Epoch [420], val_loss: 3607.3992
Epoch [440], val_loss: 3607.5027
Epoch [460], val_loss: 3609.5076
Epoch [480], val_loss: 3608.7419
Epoch [500], val_loss: 3607.5776


**Q: What is the final validation loss of your model?**

In [None]:
# Read the final validation loss from the last training run instead of copying
# it by hand, so the value cannot go stale when the notebook is re-run.
val_loss = history5[-1]['val_loss']

## Step 5: Make predictions using the trained model

**Q: Complete the following function definition to make predictions on a single input**

In [None]:
def predict_single(input, target, model):
    """Print the input, the expected target and the model's prediction for one sample.

    Args:
        input: Tensor holding a single sample; a leading batch dimension is
            added before it is passed to ``model``.
        target: Expected value for the sample; only printed.
        model: Callable applied to the batched input.
    """
    inputs = input.unsqueeze(0)
    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        predictions = model(inputs)
    # Take the single row back out of the batch.
    prediction = predictions[0].detach()
    print("Input:", input)
    print("Target:", target)
    print("Prediction:", prediction)

In [None]:
# Use names that do not rebind the built-in `input` at notebook scope.
sample_input, sample_target = val_ds[0]
predict_single(sample_input, sample_target, model)

Input: tensor([18.0000,  0.0000, 37.9852,  0.0000,  0.0000,  2.0000])
Target: tensor([1796.3489])
Prediction: tensor([1513.9250])


In [None]:
# Use names that do not rebind the built-in `input` at notebook scope.
sample_input, sample_target = val_ds[10]
predict_single(sample_input, sample_target, model)

Input: tensor([47.0000,  0.0000, 32.3447,  0.0000,  0.0000,  0.0000])
Target: tensor([22966.6621])
Prediction: tensor([10197.9023])


In [None]:
# Use names that do not rebind the built-in `input` at notebook scope.
sample_input, sample_target = val_ds[23]
predict_single(sample_input, sample_target, model)

Input: tensor([58.0000,  1.0000, 24.4198,  0.0000,  0.0000,  0.0000])
Target: tensor([13124.2373])
Prediction: tensor([12844.5830])
