# Step of Process
1. Data gathering
2. Data Preprocessing
3. Feature engineering
4. Model train
5. Testing
6. Predicting

# Dataset Downloading and processing

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mirichoi0218/insurance")

Using Colab cache for faster access to the 'insurance' dataset.


In [6]:
# importing essential module
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv(path + '/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
# Split data into train and test set
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
# Encoding the categorical features
label_encoder = {}

for col in ['sex', 'smoker', 'region']:
  le = LabelEncoder()
  train_df[col] = le.fit_transform(train_df[col])
  test_df[col] = le.transform(test_df[col])
  label_encoder[col] = le


In [100]:
# Featured and targeting
X_train = train_df.drop(columns='charges')
y_train = train_df['charges']

X_test = test_df.drop(columns='charges')
y_test = test_df['charges']

In [101]:
# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Prepare for model train

In [66]:
# Importing library
import torch
import torch.nn as nn
import torch.optim as optim

In [67]:
# Convert numpy to tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float64)
X_test_tensor = torch.tensor(X_test, dtype=torch.float64)

y_train_tensor = torch.tensor(y_train.values, dtype=torch.float64).view((-1, 1))
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float64).view((-1, 1))

### Define a Model

In [72]:
class SimpleNNReg(nn.Module):
  def __init__(self, input_size):
    super(SimpleNNReg, self).__init__()

    self.network = nn.Sequential(
        nn.Linear(input_size, 64),
        nn.ReLU(),
        nn.Linear(64, 128),
        nn.ReLU(),
        nn.Linear(128, 1)
    )

  def forward(self, x):
    return self.network(x)

In [73]:
# Create model instance
in_sz = X_train_tensor.shape[1]
model = SimpleNNReg(in_sz)
model.double()
model

SimpleNNReg(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)

### Loss Function and Optimizer

In [74]:
loss_fun = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

### Train the model

In [81]:
# Training loop
epochs = 30000

for e in range(epochs):
  model.train() # For train mode
  optimizer.zero_grad()
  predict = model(X_train_tensor)
  loss = loss_fun(predict, y_train_tensor)
  loss.backward()
  optimizer.step()

  if (e + 1) % 100 == 0:
    print(f'Epoch [{e + 1}/{epochs}], loss = {loss.item():0.4f}')

Epoch [100/30000], loss = 3021652.5526
Epoch [200/30000], loss = 3025951.7137
Epoch [300/30000], loss = 2979687.2829
Epoch [400/30000], loss = 2981982.6021
Epoch [500/30000], loss = 2987482.8265
Epoch [600/30000], loss = 2986914.4598
Epoch [700/30000], loss = 2946926.8385
Epoch [800/30000], loss = 2990742.1919
Epoch [900/30000], loss = 2941605.5072
Epoch [1000/30000], loss = 2943836.0459
Epoch [1100/30000], loss = 2940635.6134
Epoch [1200/30000], loss = 2987160.3164
Epoch [1300/30000], loss = 2917477.4059
Epoch [1400/30000], loss = 2910832.5341
Epoch [1500/30000], loss = 2951265.4059
Epoch [1600/30000], loss = 2945282.0731
Epoch [1700/30000], loss = 2904267.9814
Epoch [1800/30000], loss = 2898235.3690
Epoch [1900/30000], loss = 2927914.4103
Epoch [2000/30000], loss = 2937943.0572
Epoch [2100/30000], loss = 2894786.2458
Epoch [2200/30000], loss = 2878544.6625
Epoch [2300/30000], loss = 2894678.9351
Epoch [2400/30000], loss = 2879047.8227
Epoch [2500/30000], loss = 2904657.8876
Epoch [26

### Evaluate the model

In [84]:
model.eval() # Evaluate mode
y_predic = model(X_test_tensor).detach().numpy()

### Check the errors

In [85]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [86]:
mse = mean_squared_error(y_test_tensor.numpy(), y_predic)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test_tensor.numpy(), y_predic)
r2 = r2_score(y_test_tensor.numpy(), y_predic)

print(f'Mean Square Error: {mse:.2f}')
print(f'Root Mean Square Error: {rmse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R2 Score: {r2:.2f}')

Mean Square Error: 55516852.11
Root Mean Square Error: 7450.96
Mean Absolute Error: 4984.39
R2 Score: 0.64


# Predic the charges

In [102]:
def predict_one(age, sex, bmi, children, smoker, region):
  # Create a dataframe
  predict_df = pd.DataFrame([[age, sex, bmi, children, smoker, region]], columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])

  # Label encode for categorical using above encoder
  for col in ['sex', 'smoker', 'region']:
    le = label_encoder[col]
    predict_df[col] = le.transform(predict_df[col])

  # Normalize the value
  predict_df = scaler.transform(predict_df)

  # convert dataframe to tensor
  x_tensor = torch.tensor(predict_df, dtype=torch.float64)

  # run model
  model.eval()
  charges = model(x_tensor).item()
  return charges

In [105]:
one = predict_one(50, 'female', 27.9, 0, 'no', 'southwest')

print(f'Predicted value: {one:.2f}')

Predicted value: 6615.42
