In [25]:
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import torch

# 1 - Get data & EDA

In [2]:
# Section 1
print("== Section 1: EDA ==")

# Check the dataset
df = pd.read_csv('insurance.csv')
print(df.head())

print(f"Rows: {len(df)} | Columns: {len(df.columns)}") # 1338 x 6 features + 1 target

print(f"\n\nRange for target variable: {df['charges'].describe()}")
print("\n---------------------------------\n")

== Section 1: EDA ==
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
Rows: 1338 | Columns: 7


Range for target variable: count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64

---------------------------------



In [9]:
df_copy = df.copy()

# 2: Prepare dataset

Summary

1. Treat non-categoricals with df._get_numeric_data().columns
2. Df -> numpy with .to_numpy()
3. Create X, y tensors torch.from_numpy()
4. TensorDataset(X, y)
5. DataLoader(ds, batch_size=args, shuffle=True for train)
6. Create DataLoader() for testing

In [59]:
# Section 2
print("== Section 2: Prepare dataset ==")

# 1. Treat categorical variables
# 2. Split out features and labels
# 3. Convert to tensors of approrpriate size
# 4. Convert to training and testing batches using DataLoader

== Section 2: Prepare dataset ==


In [5]:
numeric_cols = df._get_numeric_data().columns

In [6]:
numeric_cols

Index(['age', 'bmi', 'children', 'charges'], dtype='object')

In [8]:
categorical_cols = list(set(df.columns) - set(numeric_cols))
categorical_cols


['sex', 'region', 'smoker']

In [11]:
for categorical_col in categorical_cols:
    df_copy[categorical_col] = df_copy[categorical_col].astype('category').cat.codes

In [12]:
df_copy

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


In [13]:
y = df_copy['charges']
X_reqd_columns = df_copy.columns[0:len(df_copy.columns)-1]
X = df_copy[X_reqd_columns]

In [22]:
y.to_numpy()

array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

In [21]:
X.to_numpy()

array([[19.  ,  0.  , 27.9 ,  0.  ,  1.  ,  3.  ],
       [18.  ,  1.  , 33.77,  1.  ,  0.  ,  2.  ],
       [28.  ,  1.  , 33.  ,  3.  ,  0.  ,  2.  ],
       ...,
       [18.  ,  0.  , 36.85,  0.  ,  0.  ,  2.  ],
       [21.  ,  0.  , 25.8 ,  0.  ,  0.  ,  3.  ],
       [61.  ,  0.  , 29.07,  0.  ,  1.  ,  1.  ]])

In [26]:
X_tensor = torch.from_numpy(X.to_numpy())

In [31]:
X_tensor.shape

torch.Size([1338, 6])

In [28]:
y_tensor = torch.from_numpy(y.to_numpy())

In [32]:
y_tensor.shape

torch.Size([1338])

In [50]:
y_tensor[0]

tensor(16884.9240, dtype=torch.float64)

In [29]:
ds = TensorDataset(X_tensor, y_tensor)

In [52]:
ds.tensors

(tensor([[19.0000,  0.0000, 27.9000,  0.0000,  1.0000,  3.0000],
         [18.0000,  1.0000, 33.7700,  1.0000,  0.0000,  2.0000],
         [28.0000,  1.0000, 33.0000,  3.0000,  0.0000,  2.0000],
         ...,
         [18.0000,  0.0000, 36.8500,  0.0000,  0.0000,  2.0000],
         [21.0000,  0.0000, 25.8000,  0.0000,  0.0000,  3.0000],
         [61.0000,  0.0000, 29.0700,  0.0000,  1.0000,  1.0000]],
        dtype=torch.float64),
 tensor([16884.9240,  1725.5523,  4449.4620,  ...,  1629.8335,  2007.9450,
         29141.3603], dtype=torch.float64))

In [34]:
train_set_size = int(len(ds) * 0.8)
valid_set_size = len(ds) - train_set_size

In [36]:
train_ds, val_ds = random_split(ds, [train_set_size, valid_set_size])

In [38]:
len(train_ds)+len(val_ds)

1338

In [54]:
dir(val_ds)

['__add__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'dataset',
 'indices']

In [56]:
val_ds.dataset.tensors

(tensor([[19.0000,  0.0000, 27.9000,  0.0000,  1.0000,  3.0000],
         [18.0000,  1.0000, 33.7700,  1.0000,  0.0000,  2.0000],
         [28.0000,  1.0000, 33.0000,  3.0000,  0.0000,  2.0000],
         ...,
         [18.0000,  0.0000, 36.8500,  0.0000,  0.0000,  2.0000],
         [21.0000,  0.0000, 25.8000,  0.0000,  0.0000,  3.0000],
         [61.0000,  0.0000, 29.0700,  0.0000,  1.0000,  1.0000]],
        dtype=torch.float64),
 tensor([16884.9240,  1725.5523,  4449.4620,  ...,  1629.8335,  2007.9450,
         29141.3603], dtype=torch.float64))

In [57]:
train_loader = DataLoader(train_ds, batch_size=50, shuffle=True)
test_loader = DataLoader(val_ds, batch_size=50)

In [58]:
for X_train, y_train in train_loader:
    print(f"{X_train} | {y_train}")
    break

tensor([[36.0000,  1.0000, 28.8800,  3.0000,  0.0000,  0.0000],
        [21.0000,  1.0000, 31.1000,  0.0000,  0.0000,  3.0000],
        [28.0000,  1.0000, 38.0600,  0.0000,  0.0000,  2.0000],
        [35.0000,  0.0000, 26.1250,  0.0000,  0.0000,  0.0000],
        [48.0000,  0.0000, 22.8000,  0.0000,  0.0000,  3.0000],
        [31.0000,  1.0000, 36.3000,  2.0000,  1.0000,  3.0000],
        [27.0000,  0.0000, 23.2100,  1.0000,  0.0000,  2.0000],
        [24.0000,  1.0000, 29.3000,  0.0000,  0.0000,  3.0000],
        [42.0000,  0.0000, 26.6000,  0.0000,  1.0000,  1.0000],
        [18.0000,  1.0000, 34.1000,  0.0000,  0.0000,  2.0000],
        [29.0000,  0.0000, 20.2350,  2.0000,  0.0000,  1.0000],
        [18.0000,  0.0000, 32.1200,  2.0000,  0.0000,  2.0000],
        [38.0000,  0.0000, 37.7300,  0.0000,  0.0000,  2.0000],
        [53.0000,  0.0000, 35.9000,  2.0000,  0.0000,  3.0000],
        [18.0000,  0.0000, 24.0900,  1.0000,  0.0000,  2.0000],
        [34.0000,  0.0000, 27.5000,  1.0

# Build model

In [62]:
import torch.nn.functional as F
import torch.nn as nn

In [64]:
class LogisticModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)
        
    def forward(self, X):
        out = self.linear(X)
        return out
    
    def training_step(self, batch):
        X, y = batch
        out = self(X)
        loss = F.mse_loss(out, y)
        return loss
    
    def validation_step(self, batch):
        X, y = batch
        out = self(X)
        loss = F.mse_loss(out, y)
        return loss
    
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        return {'val_loss': epoch_loss.item()}
        
    
    def epoch_end(self, epoch, result, num_epochs):
        # Print result every 20th epoch
        if (epoch+1) % 20 == 0 or epoch == num_epochs-1:
            print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))
        

In [65]:
model = LogisticModel()

NameError: name 'input_size' is not defined