**PREPROCESSING**

1.1 LOADING DATA

In [None]:
import warnings
warnings.simplefilter(action='ignore', 
                      category=FutureWarning)      # suppress warnings
import numpy as np                                 # linear algebra
import pandas as pd                                # data analysis
import matplotlib.pyplot as plt                    # visualization
import seaborn as sns                              # visualization
import scipy.stats as scipystats                   # statistics  
import statsmodels.api as sm                       # statistics (array-based OLS)
import statsmodels.formula.api as smf              # statistics
from statsmodels.api import add_constant           # statistics
from sklearn.feature_selection import SelectKBest  # feature selection
from sklearn.feature_selection import f_regression # feature selection

pd.set_option('display.float_format', lambda x: '%.1f' % x) # format decimals
sns.set(font_scale=1.5) # increse font size for seaborn charts
%matplotlib inline

MEDICAL_DATA = pd.read_csv('../input/insurance.csv')

In [None]:
# Keep only rows with a positive age and renumber the index from zero.
has_positive_age = MEDICAL_DATA['age'] > 0
OLDER = MEDICAL_DATA[has_positive_age].reset_index(drop=True)

# Report the size of the working frame.
n_rows, n_cols = OLDER.shape
print("Rows: ", n_rows, "   Variables: ", n_cols)

1.2 READING DATA

In [None]:
# Preview the first five rows of the working frame.
OLDER.head(5)

In [None]:
# Collect the names of the columns stored with the generic 'object' dtype;
# these are treated as the categorical variables.
object_cols = [name for name, dtype in OLDER.dtypes.items() if dtype == 'object']

print("Categorical variables:")
print(object_cols)

In [None]:
# Encoding note: one-hot encoding would suit `region` best, because it is a
# nominal (not an ordinal) variable. A OneHotEncoder-based version was tried
# and left disabled; the label encoding below is what the notebook uses.

# Label Encoding: replace every categorical column with integer codes.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# The same encoder object is re-fitted on each column in turn.
OLDER = OLDER.assign(
    **{name: label_encoder.fit_transform(OLDER[name]) for name in object_cols}
)

In [None]:
# Correlation of every column with `charges`, listed from lowest to highest.
charges_corr = OLDER.corr()['charges']
charges_corr.sort_values()

In [None]:
# Histogram of charges for the whole sample.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(OLDER['charges'])  # distribution of charges
ax.set_title('Distribution of charges');

There seems to be a large proportion of people paying under $15,000 for insurance, but there are also smaller clusters around $27,500-30,000 and $47,500-50,000.

In [None]:
# Count the missing values in each column.
OLDER.isna().sum()

In [None]:
#Take out non-numeric Variables [NOT NECESSARY]
#OLDER = OLDER.select_dtypes(include=[np.number]) # drop non-numeric fields
#print ("Rows: ",OLDER.shape[0],"   Variables: ", OLDER.shape[1])

REGRESSION

In [None]:
# Simple OLS with `age` as the response and `charges` as the single predictor.
# The array-based OLS class is taken from statsmodels.api: recent statsmodels
# releases no longer expose it through statsmodels.formula.api, where only the
# lowercase formula helper `ols` is available.
X = add_constant(OLDER[['charges']])  # prepend the intercept column
Y = OLDER['age']
regr = sm.OLS(Y, X).fit()
regr.summary()

In [None]:
# Scatter plot of age against charges with a fitted regression line.
# seaborn renamed the figure-size keyword from `size` to `height` (0.9);
# later releases reject `size`, so `height` is used here.
sns_plot = sns.lmplot(x='charges', y='age', data=OLDER, height=10)
plt.title('Relationship between charges and age');

It's clear that most people have health insurance charges under $10,000, with a 'long tail' of people whose charges are considerably higher.

In [None]:
# Testing homoscedasticity: plot residuals against the fitted values.
# When the cells run in order, `regr` here is the model whose response is
# `age`, so the fitted values on the x-axis are ages, not charges.
plt.figure(figsize=(10,10))
plt.scatter(regr.predict(), regr.resid)
plt.xlabel('Predicted age')
plt.ylabel('Residual')
plt.title('Residuals versus Predicted age');

The spread of data points is fairly consistent over the different charges. Still, let's check if there are any outliers that are particularly skewing the trend.

In [None]:
# Find outliers: attach each row's residual from the current model, sort by it,
# and list the rows with the largest positive residuals.
RESID_THRESHOLD = 10  # rows with a residual above this value are listed

# assign() overwrites an existing 'resid' column instead of appending a second
# one, so this cell can be re-run safely (pd.concat duplicated the column and
# the filter below then failed on the resulting two-column selection).
OLDER = OLDER.assign(resid=regr.resid)
OLDER = OLDER.sort_values(by='resid', ascending=False)
OLDER.loc[OLDER['resid'] > RESID_THRESHOLD]

First let's look specifically at Teenagers, and work out if teenagers follow the same pattern as others.

In [None]:
# Teenagers: rows with age 18 or below.
is_teenager = OLDER['age'].le(18)
Teenagers = OLDER[is_teenager]

fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(Teenagers['charges'])  # distribution of charges
ax.set_title('Distribution of Teenagers charges');

 

In [None]:
# Middle aged: every row with age above 40 (this also includes the over-60 rows).
is_over_40 = OLDER['age'].gt(40)
Middle_Aged = OLDER[is_over_40]

fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(Middle_Aged['charges'])  # distribution of charges
ax.set_title('Distribution of People over 40');



We can see that these charges are even more concentrated below $5000.

In [None]:
# Elderly: rows with age above 60.
is_over_60 = OLDER['age'].gt(60)
Elderly = OLDER[is_over_60]

fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(Elderly['charges'])  # distribution of charges
ax.set_title('Distribution of Elderly charges');


The charges distributions of elderly people and teenagers look remarkably similar.

In [None]:
# Annotated heat map of the pairwise correlations between all columns of OLDER.
corr_matrix = OLDER.corr()
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    corr_matrix,
    ax=ax,
    linewidths=0.1,
    cbar=True,
    annot=True,
    square=True,
    fmt='.1f',
)
ax.set_title('Correlation between Variables');

In [None]:
# Regression 1 again: this time `charges` is the response and `age` the predictor.
# The array-based OLS class is taken from statsmodels.api: recent statsmodels
# releases no longer expose it through statsmodels.formula.api.
X = add_constant(OLDER[['age']])  # prepend the intercept column
Y = OLDER['charges']
regr = sm.OLS(Y, X).fit()
regr.summary()

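This is a fairly poor fit: the R-squared is quite low, so age on its own explains only a small share of the variation in charges. The F-statistic should be read together with its p-value in the summary above to judge whether the relationship is nevertheless significant.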

In [None]:
# Regression 2: `charges` explained by `age` and `bmi`.
# The array-based OLS class is taken from statsmodels.api: recent statsmodels
# releases no longer expose it through statsmodels.formula.api.
X = add_constant(OLDER[['age','bmi']])  # prepend the intercept column
Y = OLDER['charges']
regr = sm.OLS(Y, X).fit()
regr.summary()

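Adding BMI does not help much: the fit is still poor. (R-squared can only stay the same or rise when a regressor is added, so compare the adjusted R-squared of the two models rather than the raw R-squared.)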

In [None]:
# Testing linearity: scatter plot of charges against age with a fitted line.
# seaborn renamed the figure-size keyword from `size` to `height` (0.9);
# later releases reject `size`, so `height` is used here.
sns_plot = sns.lmplot(x='age', y='charges', data=OLDER, height=10)
plt.title('Relationship age and charges');

In [None]:
# Residuals of the current `regr` model plotted against its fitted values.
fitted_values = regr.predict()
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(fitted_values, regr.resid)
ax.set_title('Residuals versus Predicted Charges');

In [None]:
# Regression 3: `charges` explained by age, bmi, smoker, children and region.
# `region` is a nominal variable, so feeding its integer label codes into a
# linear model would impose an arbitrary ordering on the regions. It is
# expanded into dummy columns here instead; the first level is dropped so the
# dummies are not collinear with the intercept.
numeric_predictors = OLDER[['age', 'bmi', 'smoker', 'children']]
region_dummies = pd.get_dummies(
    OLDER['region'], prefix='region', drop_first=True, dtype=float
)
X = add_constant(pd.concat([numeric_predictors, region_dummies], axis=1))
Y = OLDER['charges']
# The array-based OLS class is taken from statsmodels.api: recent statsmodels
# releases no longer expose it through statsmodels.formula.api.
regr = sm.OLS(Y, X).fit()
regr.summary()

In [None]:
# Testing homoscedasticity: residuals of the current `regr` model against its fitted values.
fitted_values = regr.predict()
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(fitted_values, regr.resid)
ax.set_title('Residuals versus Predicted charges for Regression 3');