## Insurance Dataset

In [40]:
# imports and csv read
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# read the raw insurance records from the project-relative CSV file
df = pd.read_csv(filepath_or_buffer='./data/insurance.csv')

In [41]:
# dimensions of the raw frame as (rows, columns)
df.shape

(1338, 7)

In [42]:
# preview the first rows of the raw frame
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [43]:
# list the column names to decide which ones are needed
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [44]:
# summary statistics (count, mean, std, quartiles) of the frame
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [45]:
# count the missing values in each column
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [46]:
# inspect the non-null counts and dtypes of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [47]:
# keep only the columns used by the first model; sex, smoker and region are left out
numeric_columns = ['age', 'bmi', 'children', 'charges']
df = df.loc[:, numeric_columns]
df.head()

Unnamed: 0,age,bmi,children,charges
0,19,27.9,0,16884.924
1,18,33.77,1,1725.5523
2,28,33.0,3,4449.462
3,33,22.705,0,21984.47061
4,32,28.88,0,3866.8552


In [48]:
!pip list

Package                 Version
----------------------- -----------
asttokens               3.0.0
colorama                0.4.6
comm                    0.2.2
contourpy               1.3.1
cycler                  0.12.1
debugpy                 1.8.13
decorator               5.2.1
exceptiongroup          1.2.2
executing               2.1.0
fonttools               4.57.0
importlib_metadata      8.6.1
ipykernel               6.29.5
ipython                 9.1.0
ipython_pygments_lexers 1.1.1
jedi                    0.19.2
joblib                  1.4.2
jupyter_client          8.6.3
jupyter_core            5.7.2
kiwisolver              1.4.7
matplotlib              3.10.1
matplotlib-inline       0.1.7
munkres                 1.1.4
nest_asyncio            1.6.0
numpy                   2.2.4
packaging               24.2
pandas                  2.2.3
parso                   0.8.4
patsy                   1.0.1
pickleshare             0.7.5
pillow                  11.1.0
pip                     25

In [49]:
# predictors: every remaining column except the target
X = df.drop('charges', axis='columns')

In [50]:
# preview the predictor matrix
X.head()

Unnamed: 0,age,bmi,children
0,19,27.9,0
1,18,33.77,1
2,28,33.0,3
3,33,22.705,0
4,32,28.88,0


In [51]:
# (rows, columns) of the predictor matrix
X.shape

(1338, 3)

In [52]:
# target: the insurance charges column
y = df.loc[:, 'charges']

In [53]:
# preview the target values
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [54]:
# the target is one-dimensional: one value per row
y.shape

(1338,)

In [58]:
# using  scikit learn
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [59]:
# shapes of the training predictors and training target
X_train.shape, y_train.shape

((1070, 3), (1070,))

In [60]:
# shapes of the test predictors and test target
X_test.shape, y_test.shape

((268, 3), (268,))

In [63]:
# the train and test splits together must account for every row of X
len(X_train) + len(X_test) == len(X)

True

In [68]:
# validate that the test split has the size scikit-learn assigns for test_size=0.2.
# train_test_split rounds a fractional test size UP (ceil), so comparing against
# round() would report False whenever the fractional part of n * 0.2 is below 0.5.
X_test.shape[0] == int(np.ceil(X.shape[0] * 0.2))

True

In [71]:
# splitting must not add or remove predictor columns
len(X_train.columns) == len(X.columns)

True

In [72]:
# train our linear model
from sklearn.linear_model import LinearRegression

# ordinary least squares fit on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [73]:
# intercept of the fitted linear model (the coefficients are read in the next cell)
intercepto = model.intercept_
intercepto

np.float64(-6118.046178106708)

In [87]:
# fitted coefficients of the linear model
coefficient = model.coef_
coefficient

array([220.75776541, 332.20247584, 563.01943242])

In [78]:
# column names of the training predictors, used to label the coefficients below
X_train.columns

Index(['age', 'bmi', 'children'], dtype='object')

In [88]:
# pair each fitted coefficient with the predictor column it belongs to
pd.DataFrame({'Coefficient': coefficient}, index=X_train.columns)

Unnamed: 0,Coefficient
age,220.757765
bmi,332.202476
children,563.019432


In [90]:
# predict charges for the held-out test rows
y_pred = model.predict(X=X_test)

In [94]:
# show the first five predictions
y_pred[:5]

array([13305.28945949, 11801.95170145, 16941.71437111, 14278.42206855,
        8680.25439362])

In [93]:
# predictions and true test targets must have the same shape
y_pred.shape, y_test.shape

((268,), (268,))

## Compute the RMSE and R² score

In [110]:
def get_rmse(y_test, y_pred):
    """Return the root mean squared error between targets and predictions.

    Both inputs are converted to flat float arrays before subtracting, so the
    values are always compared by position. Without the conversion, two pandas
    Series would first be aligned on their index labels, which silently pairs
    the wrong rows (or produces NaN) whenever the indexes differ, and plain
    Python lists would not support the subtraction at all.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.

    Raises
    ------
    ValueError
        If ``y_test`` is empty, or (raised by NumPy) if the two inputs have
        sizes that cannot be broadcast against each other.
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0:
        # an empty mean would be NaN with only a runtime warning; fail loudly instead
        raise ValueError("get_rmse requires at least one observation")
    squared_errors = (predicted - actual) ** 2
    return np.sqrt(np.mean(squared_errors))

In [None]:
# RMSE of the first model on the test split, using the hand-written helper
rmse = get_rmse(y_test=y_test, y_pred=y_pred)

In [115]:
# display the RMSE computed by get_rmse
rmse

np.float64(11454.315153980095)

In [124]:
# reference scale for judging the RMSE: spread and mean of the full target column
np.std(y), np.mean(y)

(np.float64(12105.484975561612), np.float64(13270.422265141257))

In [122]:
from sklearn.metrics import mean_squared_error, r2_score

# RMSE via scikit-learn, as a cross-check of the hand-written get_rmse
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

np.float64(11454.315153980095)

In [123]:
# coefficient of determination of the first model on the test split
r2_score(y_true=y_test, y_pred=y_pred)

0.15489592484270776

## Using all the variables (requires dummy variables)

### Let's repeat the whole process, this time with all the variables

In [125]:
# reload the raw file so the categorical columns are available again
df = pd.read_csv(filepath_or_buffer='./data/insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [127]:
# one-hot encode the text columns as 0/1 integers; drop_first removes one
# level per column so the dummies are not perfectly collinear
categorical_columns = ['sex', 'smoker', 'region']
new_df = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)
new_df

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


In [129]:
# get X and y
X = new_df.drop(['charges'], axis=1)
X

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,0,1,0,0,1
1,18,33.770,1,1,0,0,1,0
2,28,33.000,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.880,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,50,30.970,3,1,0,1,0,0
1334,18,31.920,0,0,0,0,0,0
1335,18,36.850,0,0,0,0,1,0
1336,21,25.800,0,0,0,0,0,1


In [130]:
# target: the insurance charges column of the encoded frame
y = new_df.loc[:, 'charges']
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [131]:
# split again, now with the dummy-encoded feature matrix (same 20% / seed 42 setup)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [137]:
# fit a second linear model on the full, dummy-encoded feature set
new_model = LinearRegression()
new_model.fit(X=X_train, y=y_train)

In [139]:
# predictions of the all-features model for the held-out rows
y_pred = new_model.predict(X=X_test)

In [141]:
# RMSE of the all-features model on the held-out split
mse_all_features = mean_squared_error(y_test, y_pred)
mse_all_features ** 0.5

5796.284659276271

In [142]:
# coefficient of determination of the all-features model on the held-out split
r2_score(y_true=y_test, y_pred=y_pred)

0.7835929767120724