# Custom DataSet components
- Dataset `torch.utils.data.Dataset`
- DataLoader `torch.utils.data.DataLoader`

# Dataset has
- __init__() for initialize the data object, load data
- __len__() for return the total length of samples
- __getitem__() for returning a element from dataset

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mirichoi0218/insurance")

Using Colab cache for faster access to the 'insurance' dataset.


In [3]:
# importing essential module
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv(path + '/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Split data into train and test set
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Encoding the categorical features
label_encoder = {}

for col in ['sex', 'smoker', 'region']:
  le = LabelEncoder()
  train_df[col] = le.fit_transform(train_df[col])
  test_df[col] = le.transform(test_df[col])
  label_encoder[col] = le

In [7]:
# Featured and targeting
X_train = train_df.drop(columns='charges')
y_train = train_df['charges']

X_test = test_df.drop(columns='charges')
y_test = test_df['charges']

In [8]:
# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
# Convert numpy to tensor
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view((-1, 1))

# Create a custom Dataset and DataLoader

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [26]:
# Custom Dataset
class InsuranceData(Dataset):
  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, indx):
    features = torch.tensor(self.X[indx], dtype=torch.float32)
    target = torch.tensor(self.y.values[indx], dtype=torch.float32).view(1)

    return features, target

In [27]:
# Dataset instance
insurance = InsuranceData(X_train, y_train)

In [29]:
# DataLoader
dataloader = DataLoader(insurance, batch_size=50, shuffle=True)

# # For checking
# for i, j in dataloader:
#   print(i.shape, j.shape)

# Create Model and train

In [30]:
class SimpleNNReg(nn.Module):
  def __init__(self, input_size):
    super(SimpleNNReg, self).__init__()

    self.network = nn.Sequential(
        nn.Linear(input_size, 64),
        nn.ReLU(),
        nn.Linear(64, 128),
        nn.ReLU(),
        nn.Linear(128, 1)
    )

  def forward(self, x):
    return self.network(x)

In [31]:
# Create model instance
in_sz = 6
model = SimpleNNReg(in_sz)
model

SimpleNNReg(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [32]:
# Loss function and optimizer
loss_fun = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [33]:
# Training loop
epochs = 5000

for e in range(epochs):
  model.train() # For train mode
  for batch, (x_tensor, y_tensor) in enumerate(dataloader):
    optimizer.zero_grad()
    outputs = model(x_tensor)

    loss = loss_fun(outputs, y_tensor)
    loss.backward()
    optimizer.step()
    print(f'--> batch [{batch + 1}], loss = {loss.item():0.4f}')

  if (e + 1) % 100 == 0:
    print(f'Epoch [{e + 1}/{epochs}], loss = {loss.item():0.4f}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
--> batch [20], loss = 12863479.0000
--> batch [21], loss = 6177952.0000
--> batch [22], loss = 8205086.5000
--> batch [1], loss = 8968028.0000
--> batch [2], loss = 4155403.0000
--> batch [3], loss = 7750973.5000
--> batch [4], loss = 2852914.0000
--> batch [5], loss = 12577306.0000
--> batch [6], loss = 11076983.0000
--> batch [7], loss = 13173992.0000
--> batch [8], loss = 10134867.0000
--> batch [9], loss = 5477068.0000
--> batch [10], loss = 10236419.0000
--> batch [11], loss = 4316387.0000
--> batch [12], loss = 6336782.5000
--> batch [13], loss = 7650898.0000
--> batch [14], loss = 10074815.0000
--> batch [15], loss = 12746090.0000
--> batch [16], loss = 10996419.0000
--> batch [17], loss = 6207789.5000
--> batch [18], loss = 11365037.0000
--> batch [19], loss = 9292947.0000
--> batch [20], loss = 6730351.5000
--> batch [21], loss = 6310391.5000
--> batch [22], loss = 10148958.0000
--> batch [1], loss = 6767605.000

# Validate the model

In [37]:
model.eval() # Evaluate mode
y_predic = model(X_test_tensor).detach().numpy()

In [38]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [39]:
mse = mean_squared_error(y_test_tensor.numpy(), y_predic)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test_tensor.numpy(), y_predic)
r2 = r2_score(y_test_tensor.numpy(), y_predic)

print(f'Mean Square Error: {mse:.2f}')
print(f'Root Mean Square Error: {rmse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R2 Score: {r2:.2f}')

Mean Square Error: 33413968.00
Root Mean Square Error: 5780.48
Mean Absolute Error: 3833.89
R2 Score: 0.78


# Predict the charges

In [42]:
def predict_one(age, sex, bmi, children, smoker, region):
  # Create a dataframe
  predict_df = pd.DataFrame([[age, sex, bmi, children, smoker, region]], columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])

  # Label encode for categorical using above encoder
  for col in ['sex', 'smoker', 'region']:
    le = label_encoder[col]
    predict_df[col] = le.transform(predict_df[col])

  # Normalize the value
  predict_df = scaler.transform(predict_df)

  # convert dataframe to tensor
  x_tensor = torch.tensor(predict_df, dtype=torch.float32)

  # run model
  model.eval()
  charges = model(x_tensor).item()
  return charges

In [45]:
one = predict_one(19, 'female', 27.9, 0, 'yes', 'southwest')

print(f'Predicted value: {one:.2f}')

Predicted value: 17583.75
