In [20]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Notebook-wide display configuration: quiet warnings, ggplot look, wide figures.
# NOTE(review): the 'ignore' filter hides every warning category, including
# deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings(action='ignore')
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': [12, 5]})

In [21]:
# Location of the insurance CSV. Set the INSURANCE_CSV environment variable to
# point at a copy elsewhere; when it is unset the original local path is used,
# so existing setups keep working unchanged.
DATA_PATH = os.environ.get("INSURANCE_CSV", r"C:\Users\pulla\Downloads\insurance.csv")
df = pd.read_csv(DATA_PATH)

# Fail early, with a clear message, if the file lacks a column that the later
# cells index by name (the three encoded categoricals and the target).
REQUIRED_COLUMNS = {"sex", "smoker", "region", "charges"}
missing_columns = REQUIRED_COLUMNS - set(df.columns)
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing expected columns: {sorted(missing_columns)}")

In [22]:
# Replace each string-valued categorical column with integer codes, in place.
# A single encoder is refitted per column, so after the loop it only remembers
# the classes of the last column ('region').
# NOTE(review): LabelEncoder is documented for target labels; on a nominal
# feature such as 'region' the integer codes impose an arbitrary ordering that
# a linear model will treat as numeric. Consider one-hot encoding — confirm
# before changing, since it would alter every score reported below.
encoder = LabelEncoder()
for categorical_col in ('sex', 'smoker', 'region'):
    df[categorical_col] = encoder.fit_transform(df[categorical_col])

In [23]:
# Target is the 'charges' column; every other column is used as a feature.
y = df["charges"]
x = df.drop(columns = "charges")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 42,
)

# Standardise using statistics learned from the training rows only, then
# apply that same transform to the held-out rows.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Baseline: ordinary least squares on the standardised features.
lr = LinearRegression()
lr.fit(x_train_scaled, y_train)

In [24]:
# Score the baseline linear model on the rows it was fitted on.
# NOTE(review): for a regressor `score` is the coefficient of determination
# (R^2), so "accuracy" in the printed label is a loose term.
lr_train_score = lr.score(x_train_scaled, y_train)
print(f'model accuracy on training set : {lr_train_score}')

model accuracy on training set : 0.7417049283233981


In [25]:
# Score the baseline linear model on the held-out rows.
# NOTE(review): for a regressor `score` is the coefficient of determination
# (R^2), so "accuracy" in the printed label is a loose term.
lr_test_score = lr.score(x_test_scaled, y_test)
print(f'model accuracy on test set : {lr_test_score}')

model accuracy on test set : 0.7833463107364537


In [27]:
# Expand the standardised features with polynomial terms, then fit a linear
# model on the expanded matrix.
# NOTE(review): the expansion is degree 3 even though the variables carry a
# `_quad` suffix; the names are kept because the following cell reads them.
pf = PolynomialFeatures(degree = 3)
pf.fit(x_train_scaled)
x_train_quad = pf.transform(x_train_scaled)
x_test_quad = pf.transform(x_test_scaled)

lr_pf = LinearRegression()
lr_pf.fit(x_train_quad, y_train)

In [28]:
# Score the polynomial-feature model on both splits.
# NOTE(review): for a regressor `score` is the coefficient of determination
# (R^2), so "accuracy" in the printed labels is a loose term.
poly_train_score = lr_pf.score(x_train_quad, y_train)
poly_test_score = lr_pf.score(x_test_quad, y_test)
print(f'model accuracy on training set : {poly_train_score}')
print(f'model accuracy on test set : {poly_test_score}')

model accuracy on training set : 0.8500275601828835
model accuracy on test set : 0.8614310017069202


In [29]:
# Shallow random forest (100 trees, depth capped at 4) fitted on the same
# standardised training features as the linear models; the seed fixes the
# result and n_jobs = -1 uses every available core.
forest = RandomForestRegressor(
    n_estimators = 100,
    criterion = 'squared_error',
    max_depth = 4,
    random_state = 42,
    n_jobs = -1,
).fit(x_train_scaled, y_train)

# Predictions for both splits, kept for the metric cells that follow.
forest_train_pred, forest_test_pred = (
    forest.predict(split_features)
    for split_features in (x_train_scaled, x_test_scaled)
)

In [30]:
# Mean squared error of the random forest on each split (same units as charges squared).
forest_train_mse = mean_squared_error(y_train, forest_train_pred)
forest_test_mse = mean_squared_error(y_test, forest_test_pred)
print('MSE train data: %.3f, Mse test data: %.3f' % (forest_train_mse, forest_test_mse))

MSE train data: 18381787.853, Mse test data: 20037431.207


In [31]:
# R^2 of the random forest on each split, rounded to three decimals in the output.
forest_train_r2 = r2_score(y_train, forest_train_pred)
forest_test_r2 = r2_score(y_test, forest_test_pred)
print('R2 train data: %.3f, R2 test data: %.3f' % (forest_train_r2, forest_test_r2))

R2 train data: 0.873, R2 test data: 0.871
