In [22]:
import pandas as pd
import numpy as np
import pickle

In [25]:
# Exploratory check: print the pickle module's docstring to see which
# classes and functions it exposes.
# NOTE(review): `pickle` is already imported in the first cell, so this
# import is redundant; the cell can be removed from a finished notebook.
import pickle
print(pickle.__doc__)


Create portable serialized representations of Python objects.

See module copyreg for a mechanism for registering custom picklers.
See module pickletools source for extensive comments.

Classes:

    Pickler
    Unpickler

Functions:

    dump(object, file)
    dumps(object) -> string
    load(file) -> object
    loads(string) -> object

Misc variables:

    __version__
    format_version
    compatible_formats




In [2]:
# Load the dataset from "final.csv" (path is relative to the working directory).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# looks like a row index that was written into the CSV — confirm.
data = pd.read_csv("final.csv")

In [3]:
# Preview the first rows of the loaded frame to check its columns.
data.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [4]:
# Remove the "sex_female" indicator column; "sex_male" stays in the frame.
# Rebinding `data` to the returned frame has the same effect as dropping in place.
data = data.drop(columns=["sex_female"])
data.head()


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,1,1,0,0,1,0,0


In [5]:
# Remove the "smoker_no" indicator column; "smoker_yes" stays in the frame.
# Rebinding `data` to the returned frame has the same effect as dropping in place.
data = data.drop(columns=["smoker_no"])
data.head()


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,0,1
1,18,33.77,1,1725.5523,1,0,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0,0
4,32,28.88,0,3866.8552,1,0,0,1,0,0


In [6]:
# Split the frame into the feature matrix X and the regression target y.
# "Unnamed: 0" equals the row position in the preview rows, so it looks like a
# row index saved into the CSV rather than a real attribute. Keeping it as a
# feature would let models fit on row order, so it is excluded here.
# errors="ignore" keeps this cell working if the CSV is ever re-exported
# without that column; a missing "charges" still fails on the next line.
# NOTE: scores recorded from earlier runs included this column and will change.
X = data.drop(columns=["charges", "Unnamed: 0"], errors="ignore")
y = data["charges"]

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
from sklearn.model_selection import cross_val_predict

In [9]:
def model_summary(model, model_name, cvn=20):
    """Print train/test and cross-validated scores for a fitted regressor.

    Reads the notebook-level variables ``x_train``, ``x_test``, ``y_train``,
    ``y_test``, ``X`` and ``y``; nothing is returned, all results are printed.

    Parameters
    ----------
    model : estimator
        Already-fitted model exposing ``predict``. It is also handed to
        ``cross_val_predict``, which is given the full ``X`` and ``y``.
    model_name : str
        Heading printed before the metrics.
    cvn : int, default 20
        Value passed as ``cv`` to ``cross_val_predict``.

    Notes
    -----
    The last printed line is labelled "Accuracy", but the value is the R2
    score of the cross-validated predictions against ``y``.
    """
    print(model_name)

    # Predictions on both splits are computed once and reused for every metric.
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    # Coefficient of determination on each split.
    print("Training R2 Score: ", r2_score(y_train, train_pred))
    print("Testing R2 Score: ", r2_score(y_test, test_pred))

    # Root mean squared error on each split.
    print("RMSE for Training Data: ", sqrt(mean_squared_error(y_train, train_pred)))
    print("RMSE for Testing Data: ", sqrt(mean_squared_error(y_test, test_pred)))

    # Cross-validated predictions over the whole dataset, scored with R2.
    cv_pred = cross_val_predict(model, X, y, cv=cvn)
    print("Accuracy for", cvn, "- Fold Cross Predicted: ", r2_score(y, cv_pred))

 -----------------------------------------------------------------Linear regression -----------------------------------------------------------------------------------------------------------

In [10]:
from sklearn.linear_model import LinearRegression
# Create a linear regression model with default settings and fit it on the
# training split produced by train_test_split.
lr = LinearRegression()
lr.fit(x_train, y_train)

In [11]:
# Report R2 / RMSE on both splits plus the 20-fold cross-validated R2
# (model_summary's default cvn) for the linear model.
model_summary(lr, "Multiple_linear_Regression")

Multiple_linear_Regression
Training R2 Score:  0.7370262574551634
Testing R2 Score:  0.7999876970680433
RMSE for Training Data:  6140.157418880165
RMSE for Testing Data:  5641.62655885019
Accuracy for 20 - Fold Cross Predicted:  0.7466820128784701


 -----------------------------------------------------------------DecisionTreeRegressor----------------------------------------------------------------------------------------------------

In [12]:
from sklearn.tree import DecisionTreeRegressor

In [13]:
# Depth-limited regression tree; the fixed random_state keeps the fit repeatable.
decision_tree_reg = DecisionTreeRegressor(
    max_depth=5,
    random_state=13,
)
decision_tree_reg.fit(x_train, y_train)

# Print the same metric report used for the other models.
model_summary(decision_tree_reg, "Decision_Tree_Regression")

Decision_Tree_Regression
Training R2 Score:  0.8731782445850186
Testing R2 Score:  0.8426354825033413
RMSE for Training Data:  4264.029416244485
RMSE for Testing Data:  5004.139260660397
Accuracy for 20 - Fold Cross Predicted:  0.8515260951093238



 -----------------------------------------------------------------Applying Random Forest---------------------------------------------------------------------------------------------------



In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Random forest with default hyper-parameters.
# A fixed random_state is set so that the fitted forest, its scores and the
# model file saved later are reproducible on Restart & Run All; the other
# models and the train/test split in this notebook are already seeded.
# NOTE: scores recorded from earlier unseeded runs will not match exactly.
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(x_train, y_train)

In [16]:
# Report R2 / RMSE on both splits plus the cross-validated R2 for the forest.
model_summary(random_forest_reg,"Random_Forest_Regressor")

Random_Forest_Regressor
Training R2 Score:  0.9738016579831458
Testing R2 Score:  0.8871567054239484
RMSE for Training Data:  1938.0287516409803
RMSE for Testing Data:  4237.540467557992
Accuracy for 20 - Fold Cross Predicted:  0.8399763311400297


In [28]:
import pickle
import joblib
# Persist the fitted random forest twice: once with pickle, once with joblib.
filename = 'random_forest_reg.sav'
# The context manager closes the file handle (and flushes the pickle to disk)
# even if dump() raises; a bare open(...) inside the call left it open.
with open(filename, 'wb') as model_file:
    pickle.dump(random_forest_reg, model_file)
# save the model to disk
filename = 'finalized_model.sav'
# joblib.dump is given a path and is kept as the last expression, so the cell
# still displays the list of written files.
joblib.dump(random_forest_reg, filename)


['finalized_model.sav']


 -----------------------------------------------------------------xgboost ---------------------------------------------------------------------------------------------------



In [18]:
import xgboost as xgb

In [19]:
# Gradient-boosted regressor with squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly spelled
# 'reg:linear'. The old spelling is deprecated (the warning was hidden here by
# verbosity=0) and may be rejected by newer XGBoost releases.
xgb_r = xgb.XGBRegressor(objective='reg:squarederror',
                         n_estimators=10, seed=123, verbosity=0)
# Fitting the model
xgb_r.fit(x_train, y_train)

# Predict on the held-out test split
pred = xgb_r.predict(x_test)

In [20]:
# Report R2 / RMSE on both splits plus the cross-validated R2 for the XGBoost model.
model_summary(xgb_r,"xgboost")

xgboost
Training R2 Score:  0.9137338439628875
Testing R2 Score:  0.8779611351089913
RMSE for Training Data:  3516.766512303415
RMSE for Testing Data:  4406.817442547534
Accuracy for 20 - Fold Cross Predicted:  0.852100274994592
