#### import libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### import dataset

In [4]:
insurance_df = pd.read_csv('insurance.csv')
insurance_df.head(1) #visualize 1st row 
insurance_df.tail(5) #visualize last 5 rows 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


# data analysis

##### check if there are null variables 

#### method 1: 

In [5]:
sns.heatmap(insurance_df.isnull(), yticklabels= False, cbar = False, cmap="Blues")

<matplotlib.axes._subplots.AxesSubplot at 0x7f2760a2b210>

#### method 2:

In [6]:
insurance_df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
df_region = insurance_df.groupby(by='region').mean()
df_region

Unnamed: 0_level_0,age,bmi,children,charges
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
northeast,39.268519,29.173503,1.046296,13406.384516
northwest,39.196923,29.199785,1.147692,12417.575374
southeast,38.93956,33.355989,1.049451,14735.411438
southwest,39.455385,30.596615,1.141538,12346.937377


In [8]:
df_age = insurance_df.groupby(by='age').mean()
df_age

Unnamed: 0_level_0,bmi,children,charges
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18,31.326159,0.449275,7086.217556
19,28.596912,0.426471,9747.909335
20,30.632759,0.862069,10159.697736
21,28.185714,0.785714,4730.46433
22,31.087679,0.714286,10012.932802
23,31.454464,1.0,12419.82004
24,29.142679,0.464286,10648.015962
25,29.693929,1.285714,9838.365311
26,29.428929,1.071429,6133.825309
27,29.333571,0.964286,12184.701721


#### transform categorical values into numeric values

#### 1) for smokers 

In [11]:
insurance_df['smoker'].unique() # look for unique values in smoker column

array(['yes', 'no'], dtype=object)

In [12]:
insurance_df['smoker']= insurance_df['smoker'].apply(lambda x: 0 if x=='no' else 1)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,1,southwest,16884.924


#### 2) for sex 

In [14]:
insurance_df['sex'].unique() 

array(['female', 'male'], dtype=object)

In [15]:
insurance_df['sex']= insurance_df['sex'].apply(lambda x: 0 if x=='female' else 1)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924


#### 3) for regions: 

In [18]:
region_dummies = pd.get_dummies(insurance_df['region'], drop_first=True)

KeyError: 'region'

In [19]:
insurance_df= pd.concat([insurance_df,region_dummies],axis=1)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,1


In [21]:
insurance_df.drop(['region'],axis=1, inplace=True)
insurance_df.head(1)

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,1


# data visualization

# create training and testing datasets

# train model

### 1. train model locally 

### 2. train linear learner model on sagemaker