## Introduction

This is the first notebook I ever wrote on Kaggle. I wanted to apply some of the data manipulation and machine learning techniques that I have learned.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

In [None]:
# Read the insurance records into the DataFrame used by the rest of the notebook.
insurance_csv_path = '/kaggle/input/ushealthinsurancedataset/insurance.csv'
df = pd.read_csv(insurance_csv_path)

## Preprocessing

First, check whether the data set contains any null values.

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# One boolean per column: True if that column holds at least one missing value.
df.isna().any()

## Data Exploration

### Create categories for age and bmi

I wanted to create categories to get a better idea of the clients' age and bmi.

The BMI interpretation was obtained from the CDC website: https://www.cdc.gov/healthyweight/assessing/bmi/adult_bmi/index.html

In [None]:
# Work on a copy so the raw frame `df` is left untouched.
df_age = df.copy()

# Bin edges are the first age of each decade. With right=False every interval
# is closed on the left and open on the right: [13, 20) -> 'teens',
# [20, 30) -> '20s', ..., [60, 120) -> '60+'. pd.cut's default right-closed
# intervals would instead put age 20 in 'teens', age 30 in '20s', and so on.
age_bins = [13, 20, 30, 40, 50, 60, 120]
age_labels = ['teens', '20s', '30s', '40s', '50s', '60+']
df_age['age_range'] = pd.cut(df_age.age, age_bins, labels=age_labels, right=False)

# NOTE: the per-category counts depend on these edges, so any code that relies
# on the frequency ORDER of `age_range` should read the labels from
# value_counts().index rather than hard-coding them.

In [None]:
# Extend the age-annotated frame with a BMI weight category.
df_groups = df_age.copy()

# CDC adult BMI ranges: underweight < 18.5, healthy 18.5 to < 25,
# overweight 25 to < 30, obese >= 30. right=False makes each interval
# [lower, upper), so a BMI of exactly 18.5, 25 or 30 lands in the higher
# category and values such as 24.95 or 29.95 are no longer pushed up a class.
# The open upper edge keeps very large BMI values in 'obese' instead of NaN.
bmi_bins = [0, 18.5, 25, 30, float('inf')]
bmi_labels = ['underweight', 'healthy', 'overweight', 'obese']
df_groups['weight_category'] = pd.cut(df_groups.bmi, bmi_bins, labels=bmi_labels, right=False)

### Graph the categories

In [None]:
# Draw one count plot per categorical feature.
feature = ['age_range', 'sex', 'weight_category', 'children', 'smoker', 'region']

# Size the grid from the number of features (three plots per row) instead of
# declaring a 6 x 3 grid on a 15 x 30 inch canvas of which only six cells are used.
n_cols = 3
n_rows = -(-len(feature) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)

for ax, column in zip(axes.ravel(), feature):
    sns.countplot(x=column, palette='dark', data=df_groups, ax=ax)

# Hide any grid cells left over when len(feature) is not a multiple of n_cols.
for ax in axes.ravel()[len(feature):]:
    ax.set_visible(False)

plt.show()

### Pie Chart

I wanted to get the actual percentages of each category so I can input them into a pie chart.

In [None]:
# Fraction of rows in each weight category, age range and region
# (normalize=True turns the counts into shares).
w_percent, a_percent, r_percent = (
    df_groups[column].value_counts(normalize=True)
    for column in ('weight_category', 'age_range', 'region')
)

for shares in (w_percent, a_percent, r_percent):
    print(shares)

Let's pair these values with the labels for the pie charts.

In [None]:
# Slice labels are read from the index of each value_counts() result, so every
# label is guaranteed to line up with its own percentage. Hard-coding the label
# order silently mislabels slices whenever the frequency order changes or two
# categories tie.

#labels for weight category
w_data = w_percent
w_labels = w_data.index.tolist()
# only "explode" the 1st (largest) slice by setting its offset to 0.1
w_explode = tuple(0.1 if i == 0 else 0 for i in range(len(w_labels)))

#labels for age_range
a_data = a_percent
a_labels = a_data.index.tolist()
a_explode = tuple(0.1 if i == 0 else 0 for i in range(len(a_labels)))

#labels for region
r_data = r_percent
r_labels = r_data.index.tolist()
r_explode = tuple(0.1 if i == 0 else 0 for i in range(len(r_labels)))

Now the actual graph.

In [None]:
#I wanted to assign different colors to the second pie chart. I found it confusing having it all with the same color scheme.
colors = ['royalblue', 'orange', 'springgreen', 'lightcoral', 'mediumpurple', 'burlywood']

f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize = (18,6))

ax1.pie(w_data, explode=w_explode, labels=w_labels, autopct='%1.1f%%',
        shadow=True, startangle=45, textprops={'fontsize': 13, 'weight': 'bold'})
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax1.set_title("Weight Category", fontdict = {'fontsize': 24, 'fontweight': 'bold',
                                             'color': '#0000A0'})

ax2.pie(a_data, explode= a_explode, labels=a_labels, autopct='%1.1f%%',
        shadow=True, startangle=25, textprops={'fontsize': 13, 'weight': 'bold'}, 
        colors = colors)
ax2.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax2.set_title("Age Range", fontdict = {'fontsize': 24, 'fontweight': 'bold',
                                       'color': '#0000A0'})

ax3.pie(r_data, explode= r_explode, labels=r_labels, autopct='%1.1f%%',
        shadow=True, startangle=5, textprops={'fontsize': 13, 'weight': 'bold'})
ax3.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
ax3.set_title("Region", fontdict = {'fontsize': 24, 'fontweight': 'bold',
                                       'color': '#0000A0'})
plt.show()

### Change the strings into numbers

In [None]:
# Replace the text categories in `df` with integer codes so the columns can be
# used in numeric computations (correlation, regression).
category_codes = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4},
}

for column, codes in category_codes.items():
    # Re-running this cell must be a no-op: once a column is numeric, looking
    # its values up in the text->code table again would turn every entry
    # into a missing value.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue

    encoded = df[column].map(codes)

    # Fail loudly on categories missing from the table instead of silently
    # writing missing values into the frame.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(set(df.loc[unmapped, column].astype(str)))
        raise ValueError(f"Unexpected values in column '{column}': {unknown}")

    df[column] = encoded.astype(int)

In [None]:
# Summary statistics, transposed so each column of `df` becomes a row.
df.describe().T

### Correlation

In [None]:
# Pairwise correlation between the columns of `df`, drawn as an annotated heatmap.
corr = df.corr()
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='viridis', ax=corr_ax)

Sex, children, and region have the lowest correlation with charges.

In [None]:
# Strip plot of charges against number of children, one panel per region.
strip_options = dict(kind="strip", height=4, aspect=.8, palette="Set2")
sns.catplot(data=df_groups, x="children", y="charges", col="region", **strip_options)

Region has a very low correlation to charges. We see that the charges are very similar for each region.

## Machine Learning

We'll use sklearn and its packages to see if we can predict charges for new clients.

In [None]:
#We omit the children and region variable. I made the choice to keep the sex variable for this notebook.
X = df[['age', 'sex', 'bmi', 'smoker']]
y = df['charges']

### Split the data into train and test

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator itself),
# then display the fitted estimator as the cell output.
model = LinearRegression().fit(X_train, y_train)
model

### Predict charges for new clients.

In [None]:
# Feature values for the first hypothetical client, using the same columns
# (and the same numeric codes for sex and smoker) as the model inputs.
data1 = dict(age=37, sex=0, bmi=30, smoker=0)

# Single-row frame; the row is labelled 1.
index = [1]
new_client1_df = pd.DataFrame(data=data1, index=index)
new_client1_df

In [None]:
# predict() returns an array with one entry per input row; the frame has a
# single row, so element 0 is this client's estimated charge.
prediction_new_client1 = model.predict(new_client1_df)

# Format the scalar directly as currency. Slicing the brackets off the array's
# string form left a space after "$" and an unstable number of decimals.
print(f"Estimated charge for Client 1 is: ${prediction_new_client1[0]:,.2f}")

In [None]:
# Feature values for the second hypothetical client, using the same columns
# (and the same numeric codes for sex and smoker) as the model inputs.
data2 = dict(age=33, sex=1, bmi=20, smoker=1)

# Single-row frame; the row is labelled 1.
index = [1]
new_client2_df = pd.DataFrame(data=data2, index=index)
new_client2_df

In [None]:
# predict() returns an array with one entry per input row; the frame has a
# single row, so element 0 is this client's estimated charge.
prediction_new_client2 = model.predict(new_client2_df)

# Format the scalar directly as currency. Slicing the brackets off the array's
# string form left a space after "$" and an unstable number of decimals.
print(f"Estimated charge for Client 2 is: ${prediction_new_client2[0]:,.2f}")