# INSURANCE COST PREDICTION

## REGRESSION MODEL

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [2]:
# Load the insurance dataset; the relative path is resolved against the current working directory
df = pd.read_csv("insurance.csv")

In [3]:
# Preview the first rows of the raw data
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# (number of rows, number of columns) of the loaded frame
df.shape

(1338, 7)

In [5]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [7]:
# Count missing values per column
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
# Summary statistics for every column, including the categorical (object) ones
df.describe(include ='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


In [9]:
# List the column names; the categorical columns sex, smoker and region
# still need to be converted into numerical columns before modelling
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [10]:
# Show the distinct labels of each categorical column before encoding them
for categorical_column in ('sex', 'smoker', 'region'):
    print(df[categorical_column].unique())


['female' 'male']
['yes' 'no']
['southwest' 'southeast' 'northwest' 'northeast']


In [11]:
# Explicit label -> integer code for each categorical column.
# NOTE: the region codes are arbitrary identifiers assigned by hand; their numeric
# order is not meant to express any ranking between regions.
CATEGORY_CODES = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'southwest': 0, 'southeast': 1, 'northwest': 2, 'northeast': 3},
}

for column_name, codes in CATEGORY_CODES.items():
    # Series.map turns every value that is missing from the mapping into NaN, so
    # mapping an already-encoded (numeric) column would wipe it out. Skipping
    # numeric columns makes this cell safe to run more than once.
    if not pd.api.types.is_numeric_dtype(df[column_name]):
        df[column_name] = df[column_name].map(codes)


In [12]:
# # CAN ALSO BE DONE 
# # ENCODING USING LABEL ENCODING 
# from sklearn.preprocessing import LabelEncoder

# le = LabelEncoder()

# # Encode 'sex'
# df['sex'] = le.fit_transform(df['sex'])

#  for sex:
# 'female' → 0
# 'male' → 1

# # Encode 'smoker'
# df['smoker'] = le.fit_transform(df['smoker'])

#  for smoker:
# 'no' → 0
# 'yes' → 1


# # Encode 'region'
# df['region'] = le.fit_transform(df['region'])

# For region, it will assign values like:
# northeast → 0
# northwest → 1
# southeast → 2
# southwest → 3


In [13]:
# Check that sex, smoker and region now hold integer codes
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [14]:
# Column names available for the feature/target split
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [15]:
# Target: the insurance charges to predict
y = df['charges']
# Features: every column except the target
x = df.drop(columns=['charges'])

In [16]:
# Mean cross-validation score of every evaluated model, keyed by the estimator instance
Model_Score = {}
# The same mean score expressed as a rounded percentage
Model_score_Precentage = {}


def model_val(Model, x, y, test_size=0.2, random_state=42, cv=10):
    """Evaluate an estimator on a hold-out split and with k-fold cross-validation.

    The estimator is fitted on the training part of a single train/test split and
    its hold-out metrics are printed: R2 and RMSE for regressors, accuracy for
    classifiers. It is then cross-validated on the full ``x``/``y`` and the mean
    score is printed and stored in the module-level ``Model_Score`` and
    ``Model_score_Precentage`` dictionaries under the estimator instance.

    Parameters
    ----------
    Model : estimator
        Regressor or classifier exposing ``fit`` and ``predict``. It is fitted
        in place on the training split.
    x : feature matrix passed to ``train_test_split`` and ``cross_val_score``.
    y : target values aligned with ``x``.
    test_size : float, default 0.2
        Fraction of the data held out for the single-split metrics.
    random_state : int, default 42
        Seed of the train/test split.
    cv : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    None
        Results are reported through printing and the two module-level dictionaries.
        Estimators that are neither regressors nor classifiers only trigger a
        message; they are not fitted and nothing is recorded.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    from sklearn.base import is_classifier, is_regressor

    # Use the public helpers instead of reading the private ``_estimator_type``
    # attribute: attribute access raises AttributeError on estimators that do not
    # define it, which would prevent the "not recognized" message from ever showing.
    # Checking before fitting also avoids calling ``predict`` on unsupported objects.
    regressor = is_regressor(Model)
    if not regressor and not is_classifier(Model):
        print(f"{Model} type is not recognized.")
        return

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    Model.fit(x_train, y_train)
    y_pred = Model.predict(x_test)

    if regressor:
        print(f'{Model} \nR2 Score: {r2_score(y_test, y_pred)}')
        print(f'{Model} \nRMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

        # Cross validation with regression scoring
        score = cross_val_score(Model, x, y, cv=cv, scoring='r2')
    else:
        print(f'{Model} \nAccuracy score: {accuracy_score(y_test, y_pred)}')

        # Cross validation with the estimator's default scoring
        score = cross_val_score(Model, x, y, cv=cv)

    # Compute the fold average once and reuse it for printing and bookkeeping
    mean_score = np.mean(score)

    print(f'{Model}\nAverage Cross Validation Score: {mean_score}')
    print("-------------------------------------------------")

    Model_Score[Model] = mean_score
    Model_score_Precentage[Model] = round(mean_score * 100)

In [17]:
# Candidate regressors, all created with the library-default hyperparameters.
# NOTE(review): the features are passed unscaled (StandardScaler is imported but not
# used in the visible cells), and no random_state is given to the tree-based models,
# so their scores may differ between runs.
model1 = LinearRegression()        # Linear Regression
model2 = SVR()                     # Support Vector Regressor
model3 = DecisionTreeRegressor()   # Decision Tree Regressor
model4 = RandomForestRegressor()   # Random Forest Regressor

# Evaluate each candidate in turn; model_val prints the metrics and records the scores
for candidate in (model1, model2, model3, model4):
    model_val(candidate, x, y)

LinearRegression() 
R2 Score: 0.7833463107364539
LinearRegression() 
RMSE: 5799.587091438356
LinearRegression()
Average Cross Validation Score: 0.7448047213193172
-------------------------------------------------
SVR() 
R2 Score: -0.07230841842488811
SVR() 
RMSE: 12902.500599822259
SVR()
Average Cross Validation Score: -0.10491809489828101
-------------------------------------------------
DecisionTreeRegressor() 
R2 Score: 0.7167148334400998
DecisionTreeRegressor() 
RMSE: 6631.716426300229
DecisionTreeRegressor()
Average Cross Validation Score: 0.6988467134273645
-------------------------------------------------
RandomForestRegressor() 
R2 Score: 0.8654385035100158
RandomForestRegressor() 
RMSE: 4570.612829556313
RandomForestRegressor()
Average Cross Validation Score: 0.8344204165104518
-------------------------------------------------
