In [1]:
import pandas as pd

# Load the dataset from cleaned.csv; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/cleaned.csv')

In [2]:
from sklearn.model_selection import train_test_split

# 'charges' is the regression target; every other column is used as a feature.
y = df['charges']
X = df.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)


In [3]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised; all remaining columns are left untouched.
cols = ['age', 'bmi', 'children', 'smoker_bmi', 'smoker_children']

# Learn mean/variance from the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
scalar = StandardScaler()
scalar.fit(X_train[cols])

X_train_scaled = scalar.transform(X_train[cols])
X_test_scaled = scalar.transform(X_test[cols])


In [4]:
import numpy as np

# Rebuild the full feature matrix for the linear-family models: the scaled
# columns come first, followed by the remaining (unscaled) columns. Note that
# the resulting column order therefore differs from X_train / X_test.
X_train_lin = np.concatenate(
    [X_train_scaled, X_train.drop(columns=cols).to_numpy()],
    axis=1,
)
X_test_lin = np.concatenate(
    [X_test_scaled, X_test.drop(columns=cols).to_numpy()],
    axis=1,
)


In [5]:
# Sanity check: the recombined matrix must have as many columns as X_train.
for _label, _matrix in (
    ("Original X_train shape:", X_train),
    ("X_train_scaled shape :", X_train_scaled),
    ("X_train_lin shape:", X_train_lin),
):
    print(_label, _matrix.shape)

Original X_train shape: (1069, 11)
X_train_scaled shape : (1069, 5)
X_train_lin shape: (1069, 11)


In [6]:
from sklearn.linear_model import LinearRegression,Ridge

# Baseline: ordinary least squares on the scaled + unscaled feature matrix.
# fit() returns the estimator itself, so construction and fitting can be chained.
linear = LinearRegression().fit(X_train_lin, y_train)
linear_pred = linear.predict(X_test_lin)

In [7]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for Ridge.
param_grid_ridge = dict(alpha=[0.1, 1.0, 5, 10])

# 3-fold cross-validated search; after fitting, predict() uses the best
# candidate refitted on the whole training split (GridSearchCV default).
ridge = GridSearchCV(estimator=Ridge(), param_grid=param_grid_ridge, cv=3)
ridge.fit(X_train_lin, y_train)

ridge_pred = ridge.predict(X_test_lin)

In [8]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Polynomial regression = polynomial feature expansion followed by OLS.
# make_pipeline names each step after its lower-cased class, which is why the
# grid key below is prefixed with 'polynomialfeatures__'.
poly_model = make_pipeline(
    PolynomialFeatures(),
    LinearRegression(),
)
param_grid_poly = dict(polynomialfeatures__degree=[2, 3, 4, 5])

# 3-fold cross-validated search over the expansion degree.
poly = GridSearchCV(estimator=poly_model, param_grid=param_grid_poly, cv=3)
poly.fit(X_train_lin, y_train)

poly_pred = poly.predict(X_test_lin)

In [9]:
from sklearn.svm import SVR

# Hyper-parameter candidates for the support vector regressor.
param_grid_svr = dict(
    epsilon=[0.1, 0.3, 0.5],
    C=[0.1, 1, 5],
    kernel=['linear', 'rbf'],
)

# The SVR is trained on a standardised target. The scaler is fitted on the
# training target only and kept so predictions can be mapped back afterwards.
scalr_y = StandardScaler()
y_train_column = y_train.to_numpy().reshape(-1, 1)  # scalers expect a 2-D input
y_train_scaled = scalr_y.fit_transform(y_train_column).ravel()
del y_train_column  # temporary helper only; keep the notebook namespace unchanged

# 2-fold cross-validated search, run in parallel on all available cores.
svr = GridSearchCV(estimator=SVR(), param_grid=param_grid_svr, cv=2, n_jobs=-1)
svr.fit(X_train_lin, y_train_scaled)

# Predictions come out in the standardised target space; undo the scaling so
# they are comparable with y_test.
svr_pred_scaled = svr.predict(X_test_lin)
svr_pred = scalr_y.inverse_transform(svr_pred_scaled.reshape(-1, 1)).ravel()


In [10]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter candidates for the random forest (2 x 2 x 4 = 16 combinations).
param_grid_random={
    'n_estimators':[100,200],
    'min_samples_split':[2,5],
    'max_depth':[None,5,10,20]
}

# 3-fold cross-validated search on the raw (unscaled) features: the forest is
# fitted on X_train, not on X_train_lin. The search is the most expensive in
# this notebook (16 candidates x 3 folds of up to 200 trees each), so it is run
# on all cores like the SVR search. Because the forest's random_state is fixed,
# parallelising the search does not change the selected model or its scores.
# NOTE: the name `random` shadows the standard-library module of the same name;
# it is kept because later cells refer to it.
random=GridSearchCV(
    RandomForestRegressor(random_state=44),
    param_grid=param_grid_random,
    cv=3,
    n_jobs=-1,
)
random.fit(X_train,y_train)
random_pred=random.predict(X_test)

In [11]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

def eval(name,ytest,ypred):
    """Print MAE, MSE and R² for one model's predictions.

    NOTE: this name shadows Python's builtin ``eval`` for the rest of the
    notebook session; it is kept because the cells below call it by this name.

    Parameters
    ----------
    name : label printed as a heading above the metrics.
    ytest : true target values.
    ypred : predicted target values, aligned with ``ytest``.

    Returns
    -------
    None. The metrics are written to standard output only.
    """
    print(name)
    error_metrics = (
        ('mae', mean_absolute_error),
        ('mse', mean_squared_error),
    )
    for label, metric in error_metrics:
        print(label, metric(ytest, ypred))
    # The trailing '\n' argument leaves a blank line between consecutive reports.
    print('r2', r2_score(ytest, ypred), '\n')

# Report test-set metrics for every fitted model, in the order they were trained.
eval('Linear Regression',y_test,linear_pred)
eval('Ridge Regression',y_test,ridge_pred)
eval('Polynomial Regression',y_test,poly_pred)
eval('svr',y_test,svr_pred)
# Label corrected from the misspelt 'Randome Forest'.
eval('Random Forest',y_test,random_pred)

Linear Regression
mae 0.28491006197532587
mse 0.19370497433830597
r2 0.772378560351457 

Ridge Regression
mae 0.2840034355206889
mse 0.19333392061236307
r2 0.7728145831411379 

Polynomial Regression
mae 0.2118870028571226
mse 0.14611604303107043
r2 0.8283000001208354 

svr
mae 0.27676450720889484
mse 0.20311353533847779
r2 0.7613226222827735 

Randome Forest
mae 0.20167093976618664
mse 0.13912137947167613
r2 0.8365193832042346 



In [13]:
# Training-set R² for the random forest and the polynomial model, to compare
# against their test-set scores and gauge how much each one overfits.
# Each model is scored on the same feature representation it was fitted on.
poly_pred_train = poly.predict(X_train_lin)
random_pred_train = random.predict(X_train)

print("random forest train R²:", r2_score(y_train, random_pred_train))
print("polynomial train R²:", r2_score(y_train, poly_pred_train))

random forest train R²: 0.8618106538084896
polynomial train R²: 0.847964780056742


In [14]:
import joblib

# Persist the fitted random-forest search. This stores the whole GridSearchCV
# object (including its refitted best estimator), not just the forest itself.
joblib.dump(value=random, filename='../data/random_forest.joblib')

['../data/random_forest.joblib']