In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import os
from math import sqrt
import imblearn
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.cross_validation import train_test_split
#from sklearn import svm
from sklearn.metrics import *
import zipfile
from zipfile import ZipFile
from io import BytesIO
import sys
import glob
import datetime
import time
import boto.s3
from boto.s3.key import Key 
import pickle

In [39]:
error_metric = pd.DataFrame({'r2_train': [],
                            'r2_test': [],
                             'rms_train':[], 
                            'rms_test': [],
                            'mae_train': [],
                            'mae_test':[],
                            'mape_train':[],
                            'mape_test':[]})
rmse_dict = {}   

In [40]:
def rmse(correct,estimated):
    rmse_val = np.sqrt(mean_squared_error(correct,estimated)) 
    return rmse_val

def calc_error_metric(modelname, model, X_train_scale, y_train, X_test_scale, y_test):
    global error_metric
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)
        
    #MAE, RMS, MAPE, R2
    
    r2_train = r2_score(y_train, y_train_predicted)
    r2_test = r2_score(y_test, y_test_predicted)
    
    rms_train = sqrt(mean_squared_error(y_train, y_train_predicted))
    rms_test = sqrt(mean_squared_error(y_test, y_test_predicted))
        
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    mae_test = mean_absolute_error(y_test, y_test_predicted)
        
    mape_train = np.mean(np.abs((y_train - y_train_predicted) / y_train)) * 100
    mape_test = np.mean(np.abs((y_test - y_test_predicted) / y_test)) * 100
        
    rmse_dict[modelname] = rms_test
        
    df_local = pd.DataFrame({'Model':[modelname],
                            'r2_train': [r2_train],
                            'r2_test': [r2_test],
                            'rms_train':[rms_train], 
                            'rms_test': [rms_test],
                            'mae_train': [mae_train],
                            'mae_test': [mae_test],
                            'mape_train':[mape_train],
                            'mape_test':[mape_test]})
        
    error_metric = pd.concat([error_metric, df_local])
    return error_metric

## Fitting Random Forest Model & Pickle for Clustor 0

In [41]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
dataset = pd.read_csv('Insurance_c0.csv')
dataset.drop(["Unnamed: 0"], axis=1, inplace=True)
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,60,0,25.84,0,0,2,28923.13692
1,62,0,26.29,0,1,3,27808.7251
2,56,0,39.82,0,0,3,11090.7178
3,52,0,30.78,1,0,1,10797.3362
4,56,1,40.3,0,0,4,10602.385


In [43]:
X = dataset.drop(['charges'], axis =1)
Y = dataset['charges']

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=np.random)

In [45]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train,Y_train);
filename = 'randforest_c0.pckl'
pickle.dump(rf,open(filename,'wb'))
calc_error_metric('RandomForestRegression C0', rf, X_train, Y_train, X_test, Y_test)
print('RandomForestRegression completed!')

RandomForestRegression completed!


In [46]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train,Y_train)
filename = 'linreg_c0.pckl'
pickle.dump(linreg,open(filename,'wb'))
calc_error_metric('Linear Regression', linreg, X_train, Y_train, X_test, Y_test)
print('LinearRegression completed!')

LinearRegression completed!


In [47]:
from sklearn.ensemble import GradientBoostingRegressor 

gb = GradientBoostingRegressor(n_estimators=300,learning_rate= 0.1,max_features=1.0,random_state=42)
gb.fit(X_train,Y_train);
filename = 'gradboost_c0.pckl'
pickle.dump(gb,open(filename,'wb'))
calc_error_metric('Gradient Boosting Regression', rf, X_train, Y_train, X_test, Y_test)
print('GradientBoostingRegression completed!')

GradientBoostingRegression completed!


## Fitting Random Forest Model & Pickle for Clustor 1

In [48]:
dataset = pd.read_csv('Insurance_c1.csv')
dataset.drop(["Unnamed: 0"], axis=1, inplace=True)
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,4,16884.924
1,18,1,33.77,1,0,3,1725.5523
2,28,1,33.0,3,0,3,4449.462
3,31,0,25.74,0,0,3,3756.6216
4,25,1,26.22,0,0,1,2721.3208


In [49]:
X = dataset.drop(['charges'], axis =1)
Y = dataset['charges']

In [50]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=np.random)

In [51]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train,Y_train);
filename = 'randforest_c1.pckl'
pickle.dump(rf,open(filename,'wb'))
calc_error_metric('RandomForestRegression C1', rf, X_train, Y_train, X_test, Y_test)
print('RandomForestRegression completed!')

RandomForestRegression completed!


In [52]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train,Y_train)
filename = 'linreg_c1.pckl'
pickle.dump(linreg,open(filename,'wb'))
calc_error_metric('Linear Regression', linreg, X_train, Y_train, X_test, Y_test)
print('LinearRegression completed!')

LinearRegression completed!


In [53]:
from sklearn.ensemble import GradientBoostingRegressor 

gb = GradientBoostingRegressor(n_estimators=300,learning_rate= 0.1,max_features=1.0,random_state=42)
gb.fit(X_train,Y_train);
filename = 'gradboost_c1.pckl'
pickle.dump(gb,open(filename,'wb'))
calc_error_metric('Gradient Boosting Regression', rf, X_train, Y_train, X_test, Y_test)
print('GradientBoostingRegression completed!')

GradientBoostingRegression completed!


## Fitting Random Forest Model & Pickle for Clustor 2

In [54]:
dataset = pd.read_csv('Insurance_c2.csv')
dataset.drop(["Unnamed: 0"], axis=1, inplace=True)
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,33,1,22.705,0,0,2,21984.47061
1,32,1,28.88,0,0,2,3866.8552
2,46,0,33.44,1,0,3,8240.5896
3,37,0,27.74,3,0,2,7281.5056
4,37,1,29.83,2,0,1,6406.4107


In [55]:
X = dataset.drop(['charges'], axis =1)
Y = dataset['charges']

In [56]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=np.random)

In [57]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train,Y_train);
filename = 'randforest_c2.pckl'
pickle.dump(rf,open(filename,'wb'))
calc_error_metric('RandomForestRegression C2', rf, X_train, Y_train, X_test, Y_test)
print('RandomForestRegression completed!')

RandomForestRegression completed!


In [58]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train,Y_train)
filename = 'linreg_c2.pckl'
pickle.dump(linreg,open(filename,'wb'))
calc_error_metric('Linear Regression', linreg, X_train, Y_train, X_test, Y_test)
print('LinearRegression completed!')

LinearRegression completed!


In [59]:
from sklearn.ensemble import GradientBoostingRegressor 

gb = GradientBoostingRegressor(n_estimators=300,learning_rate= 0.1,max_features=1.0,random_state=42)
gb.fit(X_train,Y_train);
filename = 'gradboost_c2.pckl'
pickle.dump(gb,open(filename,'wb'))
calc_error_metric('Gradient Boosting Regression', rf, X_train, Y_train, X_test, Y_test)
print('GradientBoostingRegression completed!')

GradientBoostingRegression completed!


# Exporting model metrics csv file

In [60]:
error_metric.to_csv('Clustor_Error_metrics.csv', index = "False")

In [61]:
readerr = pd.read_csv('Clustor_Error_metrics.csv')
readerr

Unnamed: 0.1,Unnamed: 0,Model,mae_test,mae_train,mape_test,mape_train,r2_test,r2_train,rms_test,rms_train
0,0,RandomForestRegression C0,2474.048744,1141.207198,15.429439,7.011788,0.819666,0.970964,4706.456668,1956.265431
1,0,Linear Regression,3969.757807,4133.724142,21.702814,21.683874,0.728637,0.7236,5773.386623,6035.743841
2,0,Gradient Boosting Regression,2474.048744,1141.207198,15.429439,7.011788,0.819666,0.970964,4706.456668,1956.265431
3,0,RandomForestRegression C1,2503.135912,1110.672796,62.046136,21.319986,0.867441,0.971936,4233.429017,1952.566894
4,0,Linear Regression,3755.186305,4499.267973,64.964523,74.496346,0.811104,0.69645,5053.577528,6421.623552
5,0,Gradient Boosting Regression,2503.135912,1110.672796,62.046136,21.319986,0.867441,0.971936,4233.429017,1952.566894
6,0,RandomForestRegression C2,2416.576756,941.499942,23.818654,7.700176,0.849968,0.976902,4345.754136,1840.641546
7,0,Linear Regression,3841.286625,4053.578114,29.636042,32.178944,0.749253,0.765502,5618.115695,5864.837122
8,0,Gradient Boosting Regression,2416.576756,941.499942,23.818654,7.700176,0.849968,0.976902,4345.754136,1840.641546
