In [1]:
# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



In [3]:
from google.colab import files
data = files.upload()  # uploading the file and viewing it, please add the insurance.csv file in the upload option
data = pd.read_csv("insurance.csv",encoding="latin1")
data


Saving insurance.csv to insurance.csv


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
data = pd.get_dummies(data, columns=['sex', 'smoker','region']) # one-hot encoding the given columns
data

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,True,False,False,True,False,False,False,True
1,18,33.770,1,1725.55230,False,True,True,False,False,False,True,False
2,28,33.000,3,4449.46200,False,True,True,False,False,False,True,False
3,33,22.705,0,21984.47061,False,True,True,False,False,True,False,False
4,32,28.880,0,3866.85520,False,True,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,False,True,True,False,False,True,False,False
1334,18,31.920,0,2205.98080,True,False,True,False,True,False,False,False
1335,18,36.850,0,1629.83350,True,False,True,False,False,False,True,False
1336,21,25.800,0,2007.94500,True,False,True,False,False,False,False,True


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data["bmi"] = scaler.fit_transform(data[["bmi"]])  # BMI scaling

data

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,-0.453320,0,16884.92400,True,False,False,True,False,False,False,True
1,18,0.509621,1,1725.55230,False,True,True,False,False,False,True,False
2,28,0.383307,3,4449.46200,False,True,True,False,False,False,True,False
3,33,-1.305531,0,21984.47061,False,True,True,False,False,True,False,False
4,32,-0.292556,0,3866.85520,False,True,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,0.050297,3,10600.54830,False,True,True,False,False,True,False,False
1334,18,0.206139,0,2205.98080,True,False,True,False,True,False,False,False
1335,18,1.014878,0,1629.83350,True,False,True,False,False,False,True,False
1336,21,-0.797813,0,2007.94500,True,False,True,False,False,False,False,True


In [6]:
# splitting the training and testing data into 80-20 split

training_size = int(0.8*len(data))
training_data = data[:training_size]
test_data = data[training_size:]

training_data

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,-0.453320,0,16884.92400,True,False,False,True,False,False,False,True
1,18,0.509621,1,1725.55230,False,True,True,False,False,False,True,False
2,28,0.383307,3,4449.46200,False,True,True,False,False,False,True,False
3,33,-1.305531,0,21984.47061,False,True,True,False,False,True,False,False
4,32,-0.292556,0,3866.85520,False,True,True,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1065,42,-0.879836,1,7045.49900,True,False,True,False,False,False,False,True
1066,48,1.087058,2,8978.18510,False,True,True,False,False,False,True,False
1067,39,1.967156,0,5757.41345,False,True,True,False,True,False,False,False
1068,63,-1.476958,1,14349.85440,False,True,True,False,False,True,False,False


In [7]:

x = training_data.drop('charges',axis=1)
y = training_data['charges']


In [8]:
# calling the model

model = RandomForestRegressor()


In [9]:
#training the model and viewing the accuracy (ALMOST 97%!!)

model.fit(x, y)
y_pred = model.predict(x)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))

results_df = pd.DataFrame([{
    "R2 Score": r2,
    "MAE": mae,
    "MSE": mse,
    "RMSE": rmse
}])


# Display the full error table
print(results_df.round(4))


   R2 Score       MAE           MSE       RMSE
0    0.9755  1016.986  3.549382e+06  1883.9804
