In [1]:
import datetime
import datetime as dt
import glob
import os
import pickle
import sys
import time
import zipfile
from io import BytesIO
from math import sqrt
from zipfile import ZipFile

import boto.s3
import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from boto.s3.key import Key 
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
#from sklearn import svm
from sklearn.metrics import *

# train_test_split lives in sklearn.model_selection on current scikit-learn;
# sklearn.cross_validation only exists on old releases, so keep it as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [2]:
# Read the insurance data from the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='insurance_revised.csv')
# Preview the first five rows (head()'s default row count, spelled out).
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,0,19,0,27.9,0,1,4,16884.924
1,1,18,1,33.77,1,0,3,1725.5523
2,2,28,1,33.0,3,0,3,4449.462
3,3,33,1,22.705,0,0,2,21984.47061
4,4,32,1,28.88,0,0,2,3866.8552


In [3]:
# The CSV carries two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0'),
# visible in the dataset preview. They are row numbers rather than predictors,
# so drop them together with the target. errors='ignore' makes the drop a
# no-op for a column that is absent (e.g. if the CSV is later re-exported
# without its index).
X = dataset.drop(['charges', 'Unnamed: 0.1', 'Unnamed: 0'], axis=1, errors='ignore')
# Regression target: the insurance charges column.
Y = dataset['charges']

In [4]:
# Hold out 20% of the rows for testing. A fixed integer seed (the same 42 used
# for the tree models) makes the split, and every metric computed from it,
# reproducible; random_state=np.random drew from the global RNG and produced a
# different split on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:
# Empty metrics table: one train/test column pair per metric, filled in by
# calc_error_metric as each model is evaluated.
error_metric = pd.DataFrame({
    column: []
    for column in (
        'r2_train', 'r2_test',
        'rms_train', 'rms_test',
        'mae_train', 'mae_test',
        'mape_train', 'mape_test',
    )
})
# Maps model name -> test-set RMSE.
rmse_dict = {}

In [6]:
def rmse(correct, estimated):
    """Return the root-mean-squared error between `correct` and `estimated`.

    Computed as the square root of sklearn's `mean_squared_error`.
    """
    return np.sqrt(mean_squared_error(correct, estimated))

def calc_error_metric(modelname, model, X_train_scale, y_train, X_test_scale, y_test):
    """Score a fitted regressor on both splits and append the row to `error_metric`.

    Parameters
    ----------
    modelname : str
        Label stored in the 'Model' column and used as the key in `rmse_dict`.
    model : fitted estimator
        Only its `predict` method is called.
    X_train_scale, X_test_scale : array-like
        Feature matrices passed to `model.predict`.
    y_train, y_test : array-like
        True targets matching the two feature matrices.

    Returns
    -------
    pandas.DataFrame
        The module-level `error_metric` table with this model's row appended.

    Side effects
    ------------
    Rebinds the global `error_metric` and stores the test RMSE in the
    module-level `rmse_dict` under `modelname`.
    """
    global error_metric
    # Predict from the arguments that were passed in. The previous version read
    # the notebook-level X_train / X_test and silently ignored these parameters.
    y_train_predicted = model.predict(X_train_scale)
    y_test_predicted = model.predict(X_test_scale)

    # Coefficient of determination.
    r2_train = r2_score(y_train, y_train_predicted)
    r2_test = r2_score(y_test, y_test_predicted)

    # Root-mean-squared error.
    rms_train = sqrt(mean_squared_error(y_train, y_train_predicted))
    rms_test = sqrt(mean_squared_error(y_test, y_test_predicted))

    # Mean absolute error.
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    mae_test = mean_absolute_error(y_test, y_test_predicted)

    # Mean absolute percentage error, in percent.
    # NOTE: a target value of 0 makes the ratio inf/nan; no guard is applied here.
    mape_train = np.mean(np.abs((y_train - y_train_predicted) / y_train)) * 100
    mape_test = np.mean(np.abs((y_test - y_test_predicted) / y_test)) * 100

    rmse_dict[modelname] = rms_test

    df_local = pd.DataFrame({'Model':[modelname],
                            'r2_train': [r2_train],
                            'r2_test': [r2_test],
                            'rms_train':[rms_train], 
                            'rms_test': [rms_test],
                            'mae_train': [mae_train],
                            'mae_test': [mae_test],
                            'mape_train':[mape_train],
                            'mape_test':[mape_test]})

    # ignore_index gives each model its own row label (0, 1, 2, ...) instead of
    # every appended row carrying index 0.
    error_metric = pd.concat([error_metric, df_local], ignore_index=True)
    return error_metric

# Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
# Fit a linear-regression baseline on the training split.
linreg = LinearRegression()
linreg.fit(X_train,Y_train)
# Persist the fitted model. The context manager closes (and therefore flushes)
# the file deterministically, so the pickle is complete on disk before it is
# reloaded and zipped later in the notebook.
filename = 'linreg_model.pckl'
with open(filename, 'wb') as model_file:
    pickle.dump(linreg, model_file)
# Record train/test metrics for this model in the shared error_metric table.
calc_error_metric('Linear Regression', linreg, X_train, Y_train, X_test, Y_test)
print('LinearRegression completed!')

LinearRegression completed!


# Random Forest Regression

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
# Fit a 100-tree random forest with a fixed seed for repeatable results.
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
rf.fit(X_train,Y_train);
# Persist the fitted model. The context manager closes (and therefore flushes)
# the file deterministically, so the pickle is complete on disk before it is
# reloaded and zipped later in the notebook.
filename = 'randforest_model.pckl'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)
# Record train/test metrics for this model in the shared error_metric table.
calc_error_metric('Random Forest Regression', rf, X_train, Y_train, X_test, Y_test)
print('RandomForestRegression completed!')

RandomForestRegression completed!


# GradientBoostingRegressor

In [11]:
from sklearn.ensemble import GradientBoostingRegressor  

In [12]:
# Fit a 300-stage gradient-boosting regressor with a fixed seed.
gb = GradientBoostingRegressor(n_estimators=300,learning_rate= 0.1,max_features=1.0,random_state=42)
gb.fit(X_train,Y_train);
# Persist the fitted model. The context manager closes (and therefore flushes)
# the file deterministically, so the pickle is complete on disk before it is
# reloaded and zipped later in the notebook.
filename = 'gradboost_model.pckl'
with open(filename, 'wb') as model_file:
    pickle.dump(gb, model_file)
# Score the gradient-boosting model itself. Passing `rf` here previously made
# the 'Gradient Boosting Regression' row a copy of the random-forest metrics.
calc_error_metric('Gradient Boosting Regression', gb, X_train, Y_train, X_test, Y_test)
print('GradientBoostingRegression completed!')

GradientBoostingRegression completed!


# Exporting model metrics csv file

In [13]:
# Write the metrics table without the row index. The index flag must be the
# boolean False: the string "False" is truthy, so the index was still written.
error_metric.to_csv('Final_Error_metrics.csv', index=False)

# Checking pickled models

In [14]:
# Reload each pickled model to confirm the files deserialize. Context managers
# close every handle as soon as its model has been read.
# NOTE: pickle.load executes code embedded in the file; only load pickles
# produced by this notebook or another trusted source.
with open('linreg_model.pckl', 'rb') as model_file:
    Linear_reg_model = pickle.load(model_file)
with open('randforest_model.pckl', 'rb') as model_file:
    Random_forest_reg_model = pickle.load(model_file)
with open('gradboost_model.pckl', 'rb') as model_file:
    Gradient_boost_reg_model = pickle.load(model_file)

# Compressing pickled files

In [15]:
def zipping(path, ziph):
    """Add the three pickled model files to the open archive `ziph`.

    The files are looked up relative to the current working directory and are
    stored under their bare file names. `path` is accepted for interface
    compatibility but is not used.
    """
    for model_file in ('linreg_model.pckl',
                       'randforest_model.pckl',
                       'gradboost_model.pckl'):
        ziph.write(model_file)

In [16]:
# Bundle the pickled models into a single archive.
# - ZIP_DEFLATED actually compresses the members; ZipFile's default
#   (ZIP_STORED) only stores them uncompressed.
# - The with-block closes the archive, writing its central directory, even if
#   adding a file raises.
with zipfile.ZipFile('ModelComp.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
    zipping('/', zf)

# Uploading compressed zip to S3 bucket

In [17]:
from boto.s3.key import Key

# Take the AWS credentials from the environment so that secrets are never
# typed into (and saved with) the notebook.
accessKey = os.environ.get('AWS_ACCESS_KEY_ID', '')
secretAccessKey = os.environ.get('AWS_SECRET_ACCESS_KEY', '')

if not accessKey or not secretAccessKey:
    print('Access Key and Secret Access Key not provided!!')
    # Raise SystemExit to stop this cell; exit() is an interactive-shell helper.
    raise SystemExit(1)

AWS_ACCESS_KEY_ID = accessKey
AWS_SECRET_ACCESS_KEY = secretAccessKey
try:
    conn = boto.connect_s3(AWS_ACCESS_KEY_ID,
            AWS_SECRET_ACCESS_KEY)

    print("Connected to S3")

except Exception as connect_error:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed; also show the underlying cause.
    print("Amazon keys are invalid!!")
    print(connect_error)
    raise SystemExit(1)

# NOTE(review): connect_s3 may not contact AWS until the first request, in which
# case bad credentials would surface at create_bucket below rather than above;
# confirm against the boto documentation.
bucket_name = 'insuranceprediction'
bucket = conn.create_bucket(bucket_name)

# Local archive to upload.
filename = ('ModelComp.zip')
# Not used in this cell; kept in case a later cell reads it. The leading '/'
# looks unintended for a file in the working directory; verify before use.
filename_csv = ("/Final_Error_metrics.csv")
print(filename)
print ("Created S3 bucket successfully")

def percent_cb(complete, total):
    """Progress callback: emit a single '.' on stdout and flush immediately.

    `complete` and `total` are accepted to match the callback signature but are
    not used.
    """
    stream = sys.stdout
    stream.write('.')
    stream.flush()

# Create a key object on the bucket and name the remote object.
k = Key(bucket)
k.key = 'ModelComp'
# Upload the local archive; percent_cb prints a dot per progress callback
# (num_cb=10 requests ten of them).
k.set_contents_from_filename(
    filename,
    cb=percent_cb,
    num_cb=10,
)

print("File successfully uploaded to S3")


Connected to S3
ModelComp.zip
Created S3 bucket successfully
..........File successfully uploaded to S3
