In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import thinkstats2
import thinkplot
import scipy.stats as ss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix, precision_score, recall_score, mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sm

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import binarize
import math


from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [35]:
# we want to know how our demographics tell us about our health. 
# is the insurance charge the correct amount for smokers?
df = pd.read_csv("insurance.csv")
df.dropna()
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [36]:
sns.pairplot(df, hue = 'smoker')

<seaborn.axisgrid.PairGrid at 0x1a8fa915460>

In [37]:
sns.boxplot(x="smoker", y="charges", data=df)

<AxesSubplot:xlabel='smoker', ylabel='charges'>

In [38]:
sns.boxplot(x="region", y="charges", data=df)

<AxesSubplot:xlabel='region', ylabel='charges'>

In [39]:
sns.countplot(x="region", data=df)

<AxesSubplot:xlabel='region', ylabel='count'>

In [40]:
sns.boxplot(x="sex", y="charges", data=df)

<AxesSubplot:xlabel='sex', ylabel='charges'>

#sns.boxplot(x="region_northwest", y="charges", data=temp,ax=ax[0,1])
#sns.boxplot(x="region_southeast", y="charges", data=temp,ax=ax[1,0])
#sns.boxplot(x="region_southwest", y="charges", data=temp,ax=ax[1,1])

In [41]:
temp = pd.get_dummies(df)
temp = pd.get_dummies(df, drop_first=True)
temp.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [42]:
corr = temp.corr()
corr.style.background_gradient()
corr.style.background_gradient().set_precision(2)

  corr.style.background_gradient().set_precision(2)


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
age,1.0,0.11,0.04,0.3,-0.02,-0.03,-0.0,-0.01,0.01
bmi,0.11,1.0,0.01,0.2,0.05,0.0,-0.14,0.27,-0.01
children,0.04,0.01,1.0,0.07,0.02,0.01,0.02,-0.02,0.02
charges,0.3,0.2,0.07,1.0,0.06,0.79,-0.04,0.07,-0.04
sex_male,-0.02,0.05,0.02,0.06,1.0,0.08,-0.01,0.02,-0.0
smoker_yes,-0.03,0.0,0.01,0.79,0.08,1.0,-0.04,0.07,-0.04
region_northwest,-0.0,-0.14,0.02,-0.04,-0.01,-0.04,1.0,-0.35,-0.32
region_southeast,-0.01,0.27,-0.02,0.07,0.02,0.07,-0.35,1.0,-0.35
region_southwest,0.01,-0.01,0.02,-0.04,-0.0,-0.04,-0.32,-0.35,1.0


In [43]:
sns.scatterplot(data=temp, x='charges',y='smoker_yes')

<AxesSubplot:xlabel='sex', ylabel='charges'>

In [44]:
def convert_prob_to_label(prob, cutoff = 0.5):
    label = []
    for i in range(len(prob)):
        if prob[i] > cutoff:
            label.append(1)
        else:
            label.append(0)
    return label

In [45]:
# sampler 1
sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
#sampler = SMOTE(n_jobs=-1)
#sampler = ADASYN(n_jobs=-1)
#sampler = SVMSMOTE(n_jobs=-1)

#Build Log Reg model
#Split Data x/y
y4 = np.array(temp['smoker_yes']).reshape(-1,1)
x4 = np.array(temp.drop(columns={"smoker_yes"}))

X_train4, X_test4, y_train4, y_test4 = train_test_split(x4, y4, test_size=0.33)

#Look at relative imbalance after split
X_train4Samp, y_train4Samp = sampler.fit_resample(X_train4, y_train4)
print('Resampled dataset shape %s' % Counter(y_train4Samp))

#Normalize data
#scaler = MinMaxScaler()
#scler is used when the ranges of the variable are very different 
scaler = StandardScaler()
X_train4_norm = scaler.fit_transform(X_train4Samp)
X_test4_norm = scaler.transform(X_test4)

#Train model
model4 = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train4_norm,y_train4Samp.ravel())

#Make predictions
preds4 = model4.predict(X_test4_norm)

#Generate 1/0 results and show results
labels4 = binarize(np.array(preds4).reshape(-1,1), threshold=.5)
conf_matrix4 = confusion_matrix(y_test4,labels4)
sns.heatmap(conf_matrix4, annot=True)

print("F1:",f1_score(y_test4, labels4))
print("Acc:",accuracy_score(y_test4, labels4))
print("Recall:",recall_score(y_test4, labels4))
print("Precision:",precision_score(y_test4, labels4))
print("RMSE test:", mean_squared_error(preds4,y_test4,squared=False))

Resampled dataset shape Counter({0: 709, 1: 162})
F1: 0.8700564971751412
Acc: 0.9479638009049773
Recall: 0.8850574712643678
Precision: 0.8555555555555555
RMSE test: 0.22811444297769184


In [46]:
# sampler 2
#sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
sampler = SMOTE(n_jobs=-1)
#sampler = ADASYN(n_jobs=-1)
#sampler = SVMSMOTE(n_jobs=-1)

#Build Log Reg model
#Split Data x/y
y5 = np.array(temp['smoker_yes']).reshape(-1,1)
x5 = np.array(temp.drop(columns={"smoker_yes"}))

X_train5, X_test5, y_train5, y_test5 = train_test_split(x5, y5, test_size=0.33)

#Look at relative imbalance after split
X_train5Samp, y_train5Samp = sampler.fit_resample(X_train5, y_train5)
print('Resampled dataset shape %s' % Counter(y_train5Samp))

#Normalize data
#scaler = MinMaxScaler()
#scler is used when the ranges of the variable are very different 
scaler = StandardScaler()
X_train5_norm = scaler.fit_transform(X_train5Samp)
X_test5_norm = scaler.transform(X_test5)

#Train model
model5 = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train5_norm,y_train5Samp.ravel())

#Make predictions
preds5 = model5.predict(X_test5_norm)

#Generate 1/0 results and show results
labels5 = binarize(np.array(preds5).reshape(-1,1), threshold=.5)
conf_matrix5 = confusion_matrix(y_test5,labels5)
sns.heatmap(conf_matrix5, annot=True)

print("F1:",f1_score(y_test5, labels5))
print("Acc:",accuracy_score(y_test5, labels5))
print("Recall:",recall_score(y_test5, labels5))
print("Precision:",precision_score(y_test5, labels5))
print("RMSE test:", mean_squared_error(preds5,y_test5,squared=False))

Resampled dataset shape Counter({1: 722, 0: 722})
F1: 0.91324200913242
Acc: 0.9570135746606335
Recall: 1.0
Precision: 0.8403361344537815
RMSE test: 0.2073316795363567


In [47]:
# sampler 3
#sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
#sampler = SMOTE(n_jobs=-1)
sampler = ADASYN(n_jobs=-1)
#sampler = SVMSMOTE(n_jobs=-1)

#Build Log Reg model
#Split Data x/y
y6 = np.array(temp['smoker_yes']).reshape(-1,1)
x6 = np.array(temp.drop(columns={"smoker_yes"}))

X_train6, X_test6, y_train6, y_test6 = train_test_split(x6, y6, test_size=0.33)

#Look at relative imbalance after split
X_train6Samp, y_train6Samp = sampler.fit_resample(X_train6, y_train6)
print('Resampled dataset shape %s' % Counter(y_train6Samp))

#Normalize data
#scaler = MinMaxScaler()
#scler is used when the ranges of the variable are very different 
scaler = StandardScaler()
X_train6_norm = scaler.fit_transform(X_train6Samp)
X_test6_norm = scaler.transform(X_test6)

#Train model
model6 = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train6_norm,y_train6Samp.ravel())

#Make predictions
preds6 = model6.predict(X_test6_norm)

#Generate 1/0 results and show results
labels6 = binarize(np.array(preds6).reshape(-1,1), threshold=.5)
conf_matrix6 = confusion_matrix(y_test6,labels6)
sns.heatmap(conf_matrix6, annot=True)

print("F1:",f1_score(y_test6, labels6))
print("Acc:",accuracy_score(y_test6, labels6))
print("Recall:",recall_score(y_test6, labels6))
print("Precision:",precision_score(y_test6, labels6))
print("RMSE test:", mean_squared_error(preds6,y_test6,squared=False))

Resampled dataset shape Counter({1: 706, 0: 705})
F1: 0.8783068783068783
Acc: 0.9479638009049773
Recall: 1.0
Precision: 0.7830188679245284
RMSE test: 0.22811444297769184


In [48]:
# sampler 4
#sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
#sampler = SMOTE(n_jobs=-1)
#sampler = ADASYN(n_jobs=-1)
sampler = SVMSMOTE(n_jobs=-1)

#Build Log Reg model
#Split Data x/y
y7 = np.array(temp['smoker_yes']).reshape(-1,1)
x7 = np.array(temp.drop(columns={"smoker_yes"}))

X_train7, X_test7, y_train7, y_test7 = train_test_split(x7, y7, test_size=0.33)

#Look at relative imbalance after split
X_train7Samp, y_train7Samp = sampler.fit_resample(X_train7, y_train7)
print('Resampled dataset shape %s' % Counter(y_train7Samp))

#Normalize data
#scaler = MinMaxScaler()
#scler is used when the ranges of the variable are very different 
scaler = StandardScaler()
X_train7_norm = scaler.fit_transform(X_train7Samp)
X_test7_norm = scaler.transform(X_test7)

#Train model
model7 = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train7_norm,y_train7Samp.ravel())

#Make predictions
preds7 = model7.predict(X_test7_norm)

#Generate 1/0 results and show results
labels7 = binarize(np.array(preds7).reshape(-1,1), threshold=.5)
conf_matrix7 = confusion_matrix(y_test7,labels7)
sns.heatmap(conf_matrix7, annot=True)

print("F1:",f1_score(y_test7, labels7))
print("Acc:",accuracy_score(y_test7, labels7))
print("Recall:",recall_score(y_test7, labels7))
print("Precision:",precision_score(y_test7, labels7))
print("RMSE test:", mean_squared_error(preds7,y_test7,squared=False))

Resampled dataset shape Counter({1: 719, 0: 719})
F1: 0.8981481481481481
Acc: 0.9502262443438914
Recall: 1.0
Precision: 0.8151260504201681
RMSE test: 0.2231003264365801


In [49]:
# sampler 5
sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
#sampler = SMOTE(n_jobs=-1)
#sampler = ADASYN(n_jobs=-1)
#sampler = SVMSMOTE(n_jobs=-1)

#Split Data x/y
y8 = np.array(temp['smoker_yes']).reshape(-1,1)
x8 = np.array(temp.drop(columns={"smoker_yes"}))

X_train8, X_test8, y_train8, y_test8 = train_test_split(x8, y8, test_size=0.33)

#Look at relative imbalance after split
X_train8Samp, y_train8Samp = sampler.fit_resample(X_train8, y_train8)
print('Resampled dataset shape %s' % Counter(y_train8Samp))

#Normalize data
#scaler = MinMaxScaler()
scaler = StandardScaler()
X_train8_norm = scaler.fit_transform(X_train8Samp)
X_test8_norm = scaler.transform(X_test8)

#Train model
model8 = RandomForestClassifier(n_jobs=-1).fit(X_train8_norm,y_train8Samp.ravel())
#Make predictions
preds8 = model8.predict(X_test8_norm)

#Generate 1/0 results and show results
labels8 = binarize(np.array(preds8).reshape(-1,1), threshold=.5)
conf_matrix8 = confusion_matrix(y_test8,labels8)
sns.heatmap(conf_matrix8, annot=True)

print("F1:",f1_score(y_test8, labels8))
print("Acc:",accuracy_score(y_test8, labels8))
print("Recall:",recall_score(y_test8, labels8))
print("Precision:",precision_score(y_test8, labels8))
print("RMSE test:", mean_squared_error(preds8,y_test8,squared=False))

Resampled dataset shape Counter({0: 695, 1: 171})
F1: 0.8613138686131386
Acc: 0.9570135746606335
Recall: 0.8082191780821918
Precision: 0.921875
RMSE test: 0.2073316795363567


In [50]:
# sampler 6
#sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
sampler = SMOTE(n_jobs=-1)
#sampler = ADASYN(n_jobs=-1)
#sampler = SVMSMOTE(n_jobs=-1)

#Split Data x/y
y9 = np.array(temp['smoker_yes']).reshape(-1,1)
x9 = np.array(temp.drop(columns={"smoker_yes"}))

X_train9, X_test9, y_train9, y_test9 = train_test_split(x9, y9, test_size=0.33)

#Look at relative imbalance after split
X_train9Samp, y_train9Samp = sampler.fit_resample(X_train9, y_train9)
print('Resampled dataset shape %s' % Counter(y_train9Samp))

#Normalize data
#scaler = MinMaxScaler()
scaler = StandardScaler()
X_train9_norm = scaler.fit_transform(X_train9Samp)
X_test9_norm = scaler.transform(X_test9)

#Train model
model9 = RandomForestClassifier(n_jobs=-1).fit(X_train9_norm,y_train9Samp.ravel())
#Make predictions
preds9 = model9.predict(X_test9_norm)

#Generate 1/0 results and show results
labels9 = binarize(np.array(preds9).reshape(-1,1), threshold=.5)
conf_matrix9 = confusion_matrix(y_test9,labels9)
sns.heatmap(conf_matrix9, annot=True)

print("F1:",f1_score(y_test9, labels9))
print("Acc:",accuracy_score(y_test9, labels9))
print("Recall:",recall_score(y_test9, labels9))
print("Precision:",precision_score(y_test9, labels9))
print("RMSE test:", mean_squared_error(preds9,y_test9,squared=False))

Resampled dataset shape Counter({0: 715, 1: 715})
F1: 0.9025641025641027
Acc: 0.9570135746606335
Recall: 0.946236559139785
Precision: 0.8627450980392157
RMSE test: 0.2073316795363567


In [56]:
# sampler 7
#sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
#sampler = SMOTE(n_jobs=-1)
sampler = ADASYN(n_jobs=-1)
#sampler = SVMSMOTE(n_jobs=-1)

#Split Data x/y
y91 = np.array(temp['smoker_yes']).reshape(-1,1)
x91 = np.array(temp.drop(columns={"smoker_yes"}))

X_train91, X_test91, y_train91, y_test91 = train_test_split(x91, y91, test_size=0.33)

#Look at relative imbalance after split
X_train91Samp, y_train91Samp = sampler.fit_resample(X_train91, y_train91)
print('Resampled dataset shape %s' % Counter(y_train91Samp))

#Normalize data
#scaler = MinMaxScaler()
scaler = StandardScaler()
X_train91_norm = scaler.fit_transform(X_train91Samp)
X_test91_norm = scaler.transform(X_test91)

#Train model
model91 = RandomForestClassifier(n_jobs=-1).fit(X_train91_norm,y_train91Samp.ravel())
#Make predictions
preds91 = model91.predict(X_test91_norm)

#Generate 1/0 results and show results
labels91 = binarize(np.array(preds91).reshape(-1,1), threshold=.5)
conf_matrix91 = confusion_matrix(y_test91,labels91)
sns.heatmap(conf_matrix91, annot=True)

print("F1:",f1_score(y_test91, labels91))
print("Acc:",accuracy_score(y_test91, labels91))
print("Recall:",recall_score(y_test91, labels91))
print("Precision:",precision_score(y_test91, labels91))
print("RMSE test:", mean_squared_error(preds91,y_test91,squared=False))

Resampled dataset shape Counter({1: 726, 0: 717})
F1: 0.92
Acc: 0.9638009049773756
Recall: 0.968421052631579
Precision: 0.8761904761904762
RMSE test: 0.19026059766179765


In [52]:
# sampler 8
#sampler = TomekLinks(sampling_strategy='not majority', n_jobs=-1)
#sampler = SMOTE(n_jobs=-1)
#sampler = ADASYN(n_jobs=-1)
sampler = SVMSMOTE(n_jobs=-1)

#Split Data x/y
y92 = np.array(temp['smoker_yes']).reshape(-1,1)
x92 = np.array(temp.drop(columns={"smoker_yes"}))

X_train92, X_test92, y_train92, y_test92 = train_test_split(x92, y92, test_size=0.33)

#Look at relative imbalance after split
X_train92Samp, y_train92Samp = sampler.fit_resample(X_train92, y_train92)
print('Resampled dataset shape %s' % Counter(y_train92Samp))

#Normalize data
#scaler = MinMaxScaler()
scaler = StandardScaler()
X_train92_norm = scaler.fit_transform(X_train92Samp)
X_test92_norm = scaler.transform(X_test92)

#Train model
model92 = RandomForestClassifier(n_jobs=-1).fit(X_train92_norm,y_train92Samp.ravel())
#Make predictions
preds92 = model92.predict(X_test92_norm)

#Generate 1/0 results and show results
labels92 = binarize(np.array(preds92).reshape(-1,1), threshold=.5)
conf_matrix92 = confusion_matrix(y_test92,labels92)
sns.heatmap(conf_matrix92, annot=True)

print("F1:",f1_score(y_test92, labels92))
print("Acc:",accuracy_score(y_test92, labels92))
print("Recall:",recall_score(y_test92, labels92))
print("Precision:",precision_score(y_test92, labels92))
print("RMSE test:", mean_squared_error(preds92,y_test92,squared=False))

Resampled dataset shape Counter({0: 705, 1: 705})
F1: 0.8749999999999999
Acc: 0.9502262443438914
Recall: 0.927710843373494
Precision: 0.8279569892473119
RMSE test: 0.2231003264365801
