#### import libraries

In [108]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### import dataset

In [4]:
insurance_df = pd.read_csv('insurance.csv')
insurance_df.head(1) #visualize 1st row 
insurance_df.tail(5) #visualize last 5 rows 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


# data analysis

##### check if there are null variables 

#### method 1: 

In [5]:
sns.heatmap(insurance_df.isnull(), yticklabels= False, cbar = False, cmap="Blues")

<matplotlib.axes._subplots.AxesSubplot at 0x7f2760a2b210>

#### method 2:

In [6]:
insurance_df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
df_region = insurance_df.groupby(by='region').mean()
df_region

Unnamed: 0_level_0,age,bmi,children,charges
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
northeast,39.268519,29.173503,1.046296,13406.384516
northwest,39.196923,29.199785,1.147692,12417.575374
southeast,38.93956,33.355989,1.049451,14735.411438
southwest,39.455385,30.596615,1.141538,12346.937377


In [8]:
df_age = insurance_df.groupby(by='age').mean()
df_age

Unnamed: 0_level_0,bmi,children,charges
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18,31.326159,0.449275,7086.217556
19,28.596912,0.426471,9747.909335
20,30.632759,0.862069,10159.697736
21,28.185714,0.785714,4730.46433
22,31.087679,0.714286,10012.932802
23,31.454464,1.0,12419.82004
24,29.142679,0.464286,10648.015962
25,29.693929,1.285714,9838.365311
26,29.428929,1.071429,6133.825309
27,29.333571,0.964286,12184.701721


#### transform categorical values into numeric values

#### 1) for smokers 

In [11]:
insurance_df['smoker'].unique() # look for unique values in smoker column

array(['yes', 'no'], dtype=object)

In [12]:
insurance_df['smoker']= insurance_df['smoker'].apply(lambda x: 0 if x=='no' else 1)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,1,southwest,16884.924


#### 2) for sex 

In [14]:
insurance_df['sex'].unique() 

array(['female', 'male'], dtype=object)

In [15]:
insurance_df['sex']= insurance_df['sex'].apply(lambda x: 0 if x=='female' else 1)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924


#### 3) for regions: 

In [18]:
region_dummies = pd.get_dummies(insurance_df['region'], drop_first=True)

KeyError: 'region'

In [19]:
insurance_df= pd.concat([insurance_df,region_dummies],axis=1)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,1


In [21]:
insurance_df.drop(['region'],axis=1, inplace=True)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,1


# create training and testing datasets

In [70]:
X = insurance_df.drop(columns=['charges'])
y= insurance_df['charges']

In [35]:
X

Unnamed: 0,age,sex,bmi,children,smoker,northwest,southeast,southwest
0,19,0,27.900,0,1,0,0,1
1,18,1,33.770,1,0,0,1,0
2,28,1,33.000,3,0,0,1,0
3,33,1,22.705,0,0,1,0,0
4,32,1,28.880,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,0,0
1334,18,0,31.920,0,0,0,0,0
1335,18,0,36.850,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,1


In [63]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [71]:
y.shape

(1338,)

In [72]:
# convert to np array
X= np.array(X).astype('float32')
y= np.array(y).astype('float32')

In [73]:
y = y.reshape(-1,1)

In [74]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)

scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)

In [75]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2)
X_train.shape

(1070, 8)

In [76]:
X_test.shape

(268, 8)

# train model

### 1. train model locally 

In [77]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

regression_model_sklearn = LinearRegression()
regression_model_sklearn.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [78]:
regression_model_sklearn_accuracy = regression_model_sklearn.score(X_test, y_test)
regression_model_sklearn_accuracy

0.7381829951630141

In [79]:
y_predict = regression_model_sklearn.predict(X_test)

In [80]:
y_predict

array([[-2.50021547e-01],
       [-2.53894925e-01],
       [ 1.46815372e+00],
       [-2.46394686e-02],
       [-1.17366302e+00],
       [-1.29000023e-01],
       [-3.66376460e-01],
       [-3.68861169e-01],
       [ 6.34180009e-02],
       [ 1.11613834e+00],
       [-6.26666188e-01],
       [-2.52857059e-01],
       [-1.04669261e+00],
       [-3.55574079e-02],
       [-2.55950421e-01],
       [-3.70558113e-01],
       [-6.17642045e-01],
       [-4.65301663e-01],
       [-2.69515216e-01],
       [-3.30879927e-01],
       [-7.97425210e-01],
       [-9.26934004e-01],
       [ 1.12139904e+00],
       [ 2.02622318e+00],
       [-1.22671455e-01],
       [-3.73358876e-01],
       [-9.72388506e-01],
       [-7.55786419e-01],
       [-4.60379928e-01],
       [-2.50657558e-01],
       [-1.95484683e-01],
       [ 1.40265495e-01],
       [-1.97214872e-01],
       [-3.72041494e-01],
       [-1.00930482e-01],
       [ 1.30079651e+00],
       [-9.14133549e-01],
       [-2.97471844e-02],
       [-4.1

In [81]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt 

RMSE = float(format(np.sqrt(mean_squared_error(y_test,y_predict)),'.3f'))
MSE= mean_squared_error(y_test,y_predict)
print ('RMSE:',RMSE)
print ('MSE:',MSE)

RMSE: 0.484
MSE: 0.23394296


In [82]:
MAE = mean_absolute_error(y_test,y_predict)
r2= r2_score(y_test, y_predict)
print ('MAE:',MAE,'R2:',r2)

MAE: 0.33258393 R2: 0.7381829951630141


### 2. train linear learner model on sagemaker

In [83]:
import sagemaker
import boto3

sess= sagemaker.Session()
bucket = 'medicalinsuranceprediction'
prefix ='linear_learner'
role= sagemaker.get_execution_role()

In [84]:
X_train.shape

(1070, 8)

In [85]:
y_train.shape

(1070, 1)

In [86]:
# prepare buffer to upload to s3
import io
import numpy as np
import sagemaker.amazon.common as smac 

y_train = y_train[:,0]
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, X_train, y_train)
buffer.seek(0) 

0

In [88]:
import os 

key = 'train-data'
path = 's3://{}/{}/train/{}'.format(bucket,prefix,key)
# uploading buffer content to S3 
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix,'train',key)).upload_fileobj(buffer)
print ('successfully uploaded to : {}'.format(path))

successfully uploaded to : s3://medicalinsuranceprediction/linear_learner/train/train-data


In [89]:
output_path ='s3://{}/{}/result'.format(bucket,prefix)

#### -- get algorithm container for linear learner algorithm: 

In [90]:
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(boto3.Session().region_name, 'linear-learner')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


#### -- train model:

In [92]:
linear = sagemaker.estimator.Estimator( container, role, instance_count=1,instance_type='ml.m5.large', output_path= output_path , sagemaker_session=sess)
linear.set_hyperparameters(feature_dim = 8, predictor_type = 'regressor', mini_batch_size = 100, epochs = 100, num_models =32, loss= 'absolute_loss')
linear.fit({'train': path})

INFO:sagemaker:Creating training-job with name: linear-learner-2023-04-26-13-44-01-452


2023-04-26 13:44:02 Starting - Starting the training job...
2023-04-26 13:44:17 Starting - Preparing the instances for training...
2023-04-26 13:45:02 Downloading - Downloading input data......
2023-04-26 13:45:43 Training - Downloading the training image......
2023-04-26 13:47:09 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/26/2023 13:46:56 INFO 140510060754752] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 

#### --deploy model:

In [93]:
linear_regressor = linear.deploy(initial_instance_count=1, instance_type='ml.m5.large')

INFO:sagemaker:Creating model with name: linear-learner-2023-04-26-13-53-44-290
INFO:sagemaker:Creating endpoint-config with name linear-learner-2023-04-26-13-53-44-290
INFO:sagemaker:Creating endpoint with name linear-learner-2023-04-26-13-53-44-290


------!

#### -- prepare test environment and predict on test data:

In [101]:
from sagemaker.predictor import csv_serializer, json_deserializer

linear_regressor.serializer= sagemaker.serializers.CSVSerializer()
linear_regressor.deserializer= sagemaker.deserializers.JSONDeserializer()

In [102]:
result = linear_regressor.predict(X_test)

In [103]:
result

{'predictions': [{'score': -0.711105465888977},
  {'score': -0.4264349639415741},
  {'score': 2.1784417629241943},
  {'score': -0.09387718141078949},
  {'score': -1.139939785003662},
  {'score': -0.3526690900325775},
  {'score': -0.547627866268158},
  {'score': -0.6151759624481201},
  {'score': -0.21982760727405548},
  {'score': 1.5254321098327637},
  {'score': -0.8674352765083313},
  {'score': -0.3637911081314087},
  {'score': -0.9912424087524414},
  {'score': -0.2685876190662384},
  {'score': -0.8852575421333313},
  {'score': -0.22418595850467682},
  {'score': -0.9384983777999878},
  {'score': -0.43587812781333923},
  {'score': -0.4446893334388733},
  {'score': -0.6693238615989685},
  {'score': -0.8512455821037292},
  {'score': -0.7814111709594727},
  {'score': 1.7435634136199951},
  {'score': 2.1763839721679688},
  {'score': -0.26842400431632996},
  {'score': -0.3374113440513611},
  {'score': -0.9059420228004456},
  {'score': -0.94321209192276},
  {'score': -0.6498516201972961},
  {

In [105]:
predictions = np.array([i['score']for i in result['predictions']])
predictions

array([-0.71110547, -0.42643496,  2.17844176, -0.09387718, -1.13993979,
       -0.35266909, -0.54762787, -0.61517596, -0.21982761,  1.52543211,
       -0.86743528, -0.36379111, -0.99124241, -0.26858762, -0.88525754,
       -0.22418596, -0.93849838, -0.43587813, -0.44468933, -0.66932386,
       -0.85124558, -0.78141117,  1.74356341,  2.17638397, -0.268424  ,
       -0.33741134, -0.90594202, -0.94321209, -0.64985162, -0.46828061,
       -0.39912954, -0.08852671, -0.53284526, -0.35944328, -0.31674507,
        1.84772909, -1.02832401, -0.11377002, -0.55825257, -0.299631  ,
        1.93243587, -0.54242909, -0.53458065,  0.03206675, -0.20041759,
       -0.5054599 ,  1.53514528, -0.57696062, -0.43128309, -0.59832013,
       -0.5700959 ,  1.7315073 , -0.30475155, -0.49159971, -0.63488656,
       -0.39979014, -0.70382214, -0.23747809, -0.55165172, -0.78748071,
       -0.59709507,  2.42409897, -1.10703611, -0.67172116,  1.69947493,
       -0.3462784 , -0.77790707, -0.30259314, -0.11791392, -0.68