In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so the figures below share a consistent look
sns.set()

In [None]:
# Read the insurance premium dataset from the Kaggle input directory
data_path = '/kaggle/input/insurance-premium-prediction/insurance.csv'
df = pd.read_csv(data_path)

In [None]:
# Preview the first rows to see which columns exist and what typical values look like
df.head()

In [None]:
# Summary statistics (count, mean, spread, quartiles) to spot odd ranges or outliers
df.describe()

Initial observations and actions:  
 - Check for missing values and duplicate values
 - We have 4 numerical columns and 3 categorical columns. Map sex, smoker, and region into numbers.
 - Explore distribution of the expenses  
 - Explore correlations of expenses with the independent variables  
 - Explore specific relationships between individual variables (such as age, BMI, and smoking status) and expenses


In [None]:
# Count missing values per column.
# isnull().sum() gives the number of NaN entries in each column; the previous
# notnull().count() counted every row whether or not it held a NaN, so it
# could never reveal missing data.
missing_counts = df.isnull().sum()

# Only claim the data is complete when the check actually confirms it
if missing_counts.sum() == 0:
    print("There are no missing values in this dataframe")
else:
    print("Missing values found in {} column(s)".format((missing_counts > 0).sum()))
missing_counts


In [None]:
# Show every row that exactly repeats an earlier row
duplicate_mask = df.duplicated()
df[duplicate_mask]

In [None]:
# Drop the repeated record found above, keeping its first occurrence
df = df.drop_duplicates(keep="first")

# Sanity check: the count of remaining duplicate rows should be zero
remaining_duplicates = df.duplicated().sum()
remaining_duplicates

In [None]:
# Histogram of the target variable to see how expenses are distributed
df["expenses"].plot.hist()

In [None]:
# Expenses against age, coloured by smoking status
sns.scatterplot(data=df, x="age", y="expenses", hue="smoker")

Unsurprisingly, expenses tend to increase with age. It also appears there are almost three clusters for expenses here. We may inspect this later, after reviewing how the other factors interact with expenses.

In [None]:
# Histogram of BMI, mostly out of curiosity about its distribution
df["bmi"].plot.hist()

In [None]:
# Expenses against BMI to look for a visible relationship
sns.scatterplot(data=df, x="bmi", y="expenses")

In [None]:
# Box plot comparing the expense distribution of smokers and non-smokers
print("Smokers tend to have higher costs")
sns.boxplot(x="smoker", y="expenses", data=df)


In [None]:
# Pairwise plots of the numeric columns, coloured by smoking status,
# to catch any patterns the individual plots above did not show
sns.pairplot(df, hue="smoker")


Observations:   
- Smoking has a very large impact on expenses (bmi vs expenses)
- Age also has a positive correlation with expenses (age vs expenses)
- Number of children seems to have very little effect (children vs expenses)



After looking at the data, it's time to clean it and start to quantify the different relationships

In [None]:
# Encode the two binary categorical columns as 0/1 integers.
# Series.map yields NaN for any value missing from the mapping.
binary_encodings = {
    'smoker': {"no": 0, "yes": 1},
    'sex': {"female": 0, "male": 1},
}
for column_name, encoding in binary_encodings.items():
    df[column_name] = df[column_name].map(encoding)
df.head()

In [None]:
# List the distinct region labels present in the data
df['region'].unique()

In [None]:
# One-hot encode the region column: one indicator column per region.
# dtype=int forces 0/1 integers. Recent pandas versions return booleans by
# default, and a frame mixing bool with int/float columns converts to an
# object array that statsmodels' OLS refuses to fit.
one_hot = pd.get_dummies(df['region'], dtype=int)
one_hot

In [None]:
# Append the region indicator columns to the original frame (aligned on the row index)
frames_to_join = [df, one_hot]
data = pd.concat(frames_to_join, axis="columns")
data.head()

In [None]:
# Mean, median and row count of expenses per region, to see whether costs differ by region
df.groupby("region")["expenses"].agg(["mean", "median", "count"])



Eastern premium costs seem higher than western ones -- let's see how age, BMI, and smoking factor into each category

In [None]:
# Profile each region on expenses, age, BMI and smoking rate
# to see whether the regions differ in who their clients are
region_groups = df.groupby("region")
full_stats = ["mean", "median", "count"]
reports = [
    ("EXPENSES BY REGION", "expenses", full_stats),
    ("AGE VS REGION", "age", full_stats),
    ("BMI VS REGION", "bmi", full_stats),
    # smoker is 0/1 at this point, so its mean is the share of smokers
    ("SMOKING VS REGION", "smoker", ['mean']),
]
for heading, column_name, stats in reports:
    print(heading)
    print(region_groups[column_name].agg(stats))



Southeast has a higher BMI and percentage of clients who smoke, which are both correlated to higher costs.

In [None]:
# Correlation heat map to visualize correlations among all numeric variables.
# 'region' is still a string column in df, so restrict to numeric columns
# first: recent pandas versions raise on non-numeric columns in corr()
# instead of silently dropping them.
plt.figure(figsize=(10,8))
corr = df.select_dtypes(include="number").corr()
sns.heatmap(corr, annot=True)


The correlations with expenses are: smoker (0.79), age (0.30), and bmi (0.20). Smoking is the only strong correlation, while the other relationships are comparatively weak.

NEXT, CREATE A REGRESSION

In [None]:
# Re-inspect the combined frame (original columns plus region indicators) before modelling
data.head()

In [None]:
# Target and predictors for the first regression.
# age, bmi and smoker look like the only important variables so far,
# but sex and the region indicators are included to start with.
initial_features = ['age', 'bmi', 'smoker', 'sex', 'northeast', 'northwest', 'southeast', 'southwest']
y = data['expenses']
x1 = data[initial_features]
x1.head()

In [None]:
import statsmodels.api as sm

# Fit ordinary least squares of expenses on the full predictor set
initial_model = sm.OLS(endog=y, exog=x1)
results = initial_model.fit()

In [None]:
# Show the regression table: coefficients, standard errors, p-values and fit statistics
results.summary()

Looking at the initial results:
- R-squared of 0.75 is okay, but has room for improvement
- All of the variables except for sex have a seemingly significant p-value, mostly showing 0.000. Sex is clearly not significant, with a p-value of 0.745.
- Inspecting further, each region has a similar coefficient that only ranges by a couple hundred dollars. In addition, the standard error is large as well.
- We should run this again, but drop the regions and sex since they aren't adding value.

In [None]:
# Refit using only the three predictors that looked important.
# NOTE(review): no intercept column is added here (sm.add_constant is not
# used), so statsmodels reports the uncentered R-squared for this fit --
# confirm before comparing it with the R-squared of other models.
reduced_features = ['age', 'bmi', 'smoker']
y = data['expenses']
X = data[reduced_features]
results = sm.OLS(endog=y, exog=X).fit()

With an R-squared of 0.87, this model is mostly accurate. The three most correlated factors were used as inputs, and all three are significant: the p-value shows 0.000 for age and smoker, and 0.029 < 0.05 for bmi.

In [None]:
# Show the regression table for the reduced (age, bmi, smoker) model
results.summary()

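These results look much better. The R-squared is now 0.87, and all three variables seem to have predictive power. Note, however, that this model is fit without an intercept term, so the reported R-squared is the uncentered version and should be compared with the earlier 0.75 with caution. Next, we will split the data into training and test sets and check our accuracy.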

In addition, the F-statistic is much higher now (3029 vs 565). The F-statistic tests the overall significance of the model; a higher value is stronger evidence that the predictors jointly explain expenses. We will use this model going forward.

Looking at the coefficients, it is also eye opening how large an impact smoking makes. There is a 23,320 estimated increase in annual premiums for smokers! 


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold



In [None]:
# Hold out a test set (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Linear model: fit on the training rows, predict both partitions
reg = LinearRegression()
reg.fit(X_train, y_train)
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)

# Random forest: same procedure, seeded so the fitted trees are reproducible
forest = RandomForestRegressor(random_state=1)
forest.fit(X_train, y_train)
y_train_forest = forest.predict(X_train)
forest_preds = forest.predict(X_test)
mae_forest = mean_absolute_error(y_test, forest_preds)

# Report the test-set score() and mean absolute error for each model
print("Linear regression info:")
print(reg.score(X_test, y_test))
print("MAE: {}".format(mae))

print("Forest model info")
print(forest.score(X_test, y_test))
print("MAE: {}".format(mae_forest))



In [None]:
# Cross-validation gives a more reliable estimate of out-of-sample performance
# than the single train/test split above (it does not itself prevent overfitting).
# Score each model on 5 folds and compare the means before making predictions.

from sklearn.model_selection import cross_val_score

# Seed the forest (same seed as the forest fitted above) so the fold scores
# are reproducible from run to run; cv=5 makes the number of folds explicit.
forest_score = cross_val_score(RandomForestRegressor(random_state=1), X, y, cv=5)
print("Each Forest score: ", forest_score)
print("Mean Random Forest Score: ",forest_score.mean())

linear_score = cross_val_score(LinearRegression(), X, y, cv=5)
print("Each Linear score: ", linear_score)
print("Mean Regression Scores: ", linear_score.mean())


As we can see, the mean Random Forest Score is 0.80, which is higher than the Linear Model's score of 0.74. 

In [None]:
# Show a few specific predictions next to the true values to see how close we were.
# Use held-out rows: X.head() is mostly rows the forest was trained on, which
# would make the predictions look more accurate than they are on unseen data.
print("Predictions: {}".format(forest.predict(X_test.head())))
y_test.head()


