In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Location of the raw dataset, given relative to the working directory.
train_file_path = "insurance.csv"

# Parse the CSV into the DataFrame that the following cells operate on.
train_data = pd.read_csv(filepath_or_buffer=train_file_path)

# Preview the first five rows to sanity-check the parsed columns.
train_data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Data Analysis

In [4]:
# Every column label of the loaded frame as a plain Python list.
# At this point the list still contains the `charges` column.
features = train_data.columns.tolist()
features

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

In [5]:
# Replace each string-valued categorical column with integer codes, in place.
# One encoder object is kept per column so the learned string -> code mapping
# stays available afterwards.
# NOTE(review): `region` ends up as a single integer column, which a linear
# model will treat as an ordered quantity -- consider one-hot encoding instead.
sexEncoder = LabelEncoder()
train_data['sex'] = sexEncoder.fit_transform(train_data['sex'])

smokerEncoder = LabelEncoder()
train_data['smoker'] = smokerEncoder.fit_transform(train_data['smoker'])

regionEncoder = LabelEncoder()
train_data['region'] = regionEncoder.fit_transform(train_data['region'])

In [6]:
# Summary statistics per column; sex, smoker and region are included because
# they now hold integer codes rather than strings.
train_data.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,0.505232,30.663397,1.094918,0.204783,1.515695,13270.422265
std,14.04996,0.50016,6.098187,1.205493,0.403694,1.104885,12110.011237
min,18.0,0.0,15.96,0.0,0.0,0.0,1121.8739
25%,27.0,0.0,26.29625,0.0,0.0,1.0,4740.28715
50%,39.0,1.0,30.4,1.0,0.0,2.0,9382.033
75%,51.0,1.0,34.69375,2.0,0.0,2.0,16639.912515
max,64.0,1.0,53.13,5.0,1.0,3.0,63770.42801


In [7]:
# Standardise every column listed in `features` and wrap the resulting array
# back into a DataFrame with the same column labels.
# NOTE(review): `features` still includes `charges` here, and the scaler is
# fitted on all rows before any train/test split, so held-out rows contribute
# to the scaling statistics -- confirm this is intended.
scaler = StandardScaler().fit(train_data[features])
scaled_train_data = scaler.transform(train_data[features])
scaled_df = pd.DataFrame(data=scaled_train_data, columns=features)

In [8]:
# Summary statistics of the standardised frame, shown to check the result of
# the scaling step.
scaled_df.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,-1.805565e-16,-1.234687e-16,-2.124194e-16,-5.576008e-17,1.0620970000000002e-17,9.027823000000001e-17,-8.098488000000001e-17
std,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374
min,-1.509965,-1.010519,-2.412011,-0.9086137,-0.5074631,-1.372326,-1.003557
25%,-0.8691547,-1.010519,-0.7164063,-0.9086137,-0.5074631,-0.4669155,-0.7046504
50%,-0.01474046,0.9895908,-0.0432088,-0.07876719,-0.5074631,0.4384945,-0.3212089
75%,0.8396738,0.9895908,0.6611572,0.7510793,-0.5074631,0.4384945,0.2783441
max,1.765289,0.9895908,3.685522,3.240619,1.970587,1.343905,4.171663


## Machine Learning

In [21]:
# Predictor columns only; from here on `features` no longer contains `charges`.
features = [
    'age',
    'sex',
    'bmi',
    'children',
    'smoker',
    'region',
]

# Design matrix and regression target, both taken from the standardised frame.
X = scaled_df.loc[:, features]
y = scaled_df['charges']

In [22]:
# Put 70% of the rows in the training split and the rest in the test split;
# the fixed random_state makes the partition repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=0,
)

In [11]:
# LinearRegression is imported in the notebook's top import cell so that all
# dependencies are declared in one place.

# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(train_x, train_y)

# Predict on the held-out split only.
y_pred = model.predict(test_x)

# `test_y` comes from the standardised `charges` column, so this MSE is
# expressed in squared standard-deviation units, not in squared charges.
mse = mean_squared_error(test_y, y_pred)
print("Mean Squared Error: ", mse)

# Convert the error back to the original `charges` scale so it can be read
# directly. StandardScaler divides by the population standard deviation
# (ddof=0), and `train_data['charges']` still holds the unscaled values.
charges_std = train_data['charges'].std(ddof=0)
print("RMSE in original charges units: ", np.sqrt(mse) * charges_std)

Mean Squared Error:  0.22731490662370302
