# import packages

In [None]:
# data cleaning
import numpy as np 
import pandas as pd 

# data visualization
from matplotlib import pyplot as plt 
import seaborn as sns

# machine learning
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.cluster import KMeans



In [None]:
# Read the insurance CSV from the relative input directory and preview the first rows.
DATA_PATH = '../input/insurance/insurance.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

## View the number of rows and columns of data

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

## Count the missing values in each column

In [None]:
# Per-column count of missing entries.
df.isna().sum()

In [None]:
## View regional distribution

In [None]:
# Distinct values present in the region column.
df['region'].unique()

## Summary statistics of the data

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

# Visual overview

In [None]:
# One 50-bin histogram per selected column; the trailing ';' hides the axes-array repr.
hist_columns = ['age', 'bmi', 'children', 'charges']
df[hist_columns].hist(
    bins=50,
    figsize=(12, 6),
    color="c",
    alpha=0.8,
);

## Group by bmi 

In [None]:
# Bucket bmi into four 10-unit-wide bins and store the bin label in `bmi_new`.
bins = [15, 25, 35, 45, 55]
labels = ['15-25', '25-35', '35-45', '45-55']
# `labels` must be passed by keyword: the third positional parameter of pd.cut
# is `right`, so passing the list positionally silently discards the labels and
# leaves interval categories such as (15, 25] instead.
df['bmi_new'] = pd.cut(df['bmi'], bins=bins, labels=labels)
df.head()

In [None]:
# Number of rows that fall into each bmi bin.
df['bmi_new'].value_counts()

In [None]:
# Mean medical charges within each bmi bin.
df.groupby('bmi_new')['charges'].mean()

In [None]:
# Bar chart of mean charges per bmi bin.
fig, ax = plt.subplots(figsize=(10, 6))
bmi_bars = sns.barplot(x=df.bmi_new, y=df.charges, palette='rainbow')
bmi_bars.set(title='bmi Vs Charges')

# Regroup by age

In [None]:
# Bucket age into four named groups and store the group label in `ages`.
bins = [0, 20, 40, 60, 100]
df['ages'] = pd.cut(
    df['age'],
    bins,
    labels=('teenager', 'young', 'middle', 'old'),
)
df.head()

In [None]:
# Number of rows in each age group.
df.ages.value_counts()

In [None]:
# Mean medical charges within each age group.
df.groupby('ages')['charges'].mean()

In [None]:
# Bar chart of mean charges per age group (ci=0: no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
age_bars = sns.barplot(x=df.ages, y=df.charges, palette='rainbow', ci=0)
age_bars.set(title='Age Vs Charges')

In [None]:
# Mean charges for every (age group, bmi bin) pair, with bmi bins spread across columns.
df.pivot_table(
    values='charges',
    index=['ages', 'bmi_new'],
    aggfunc='mean',
).unstack()

In [None]:
# Mean charges per bmi bin, split by age group (no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    data=df,
    x='bmi_new',
    y='charges',
    hue='ages',
    palette='rainbow',
    ci=0,
    ax=ax,
)

## Data distribution in the four regions

In [None]:
# Number of rows per region.
df.region.value_counts()

In [None]:
# Mean charges by (sex, smoker) rows and number-of-children columns.
df.pivot_table(
    values='charges',
    index=['sex', 'smoker'],
    columns='children',
    aggfunc='mean',
)

In [None]:
# Row counts by (sex, smoker) rows and number-of-children columns.
df.pivot_table(
    values='charges',
    index=['sex', 'smoker'],
    columns='children',
    aggfunc='count',
)

In [None]:
# Bar chart of mean charges per region (no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
region_bars = sns.barplot(x=df.region, y=df.charges, palette='rainbow', ci=0)
region_bars.set(title='region Vs Charges')

In [None]:
# Mean charges per region, split by sex (no error bars).
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    data=df,
    x='region',
    y='charges',
    hue='sex',
    palette='rainbow',
    ci=0,
    ax=ax,
)

In [None]:
# Mean charges per region, split by smoker status (no error bars).
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
ax = sns.barplot(
    data=df,
    x='region',
    y='charges',
    hue='smoker',
    palette='rainbow',
    ci=0,
    ax=ax,
)

In [None]:
# Mean charges per region, split by number of children (default error bars kept).
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
sns.barplot(
    data=df,
    x='region',
    y='charges',
    hue='children',
    palette='rainbow',
    ax=ax,
)

# Correlation coefficient 

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
# At this point df still contains string columns (sex, smoker, region) and the
# categorical bin columns; DataFrame.corr() raises on those in pandas >= 2.0,
# so restrict to numeric columns explicitly (works on older versions too).
numeric_corr = df.select_dtypes(include='number').corr()
ax = sns.heatmap(numeric_corr, annot=True, cmap='cool', ax=ax)

In [None]:
# Switch seaborn to the "ticks" style; this is a global setting, so it also
# affects every plot drawn after this cell.
sns.set(style="ticks")
# Grid of pairwise plots for the frame; the trailing ';' hides the returned object's repr.
sns.pairplot(df,  height=2);

In [None]:
# Regression plots of charges against age and against bmi, one fit per smoker group.
# Note: lmplot is figure-level, so `ax` ends up holding the last returned grid object.
for feature in ('age', 'bmi'):
    ax = sns.lmplot(data=df, x=feature, y='charges', hue='smoker', palette='rainbow')

In [None]:
# Box plot of charges by number of children, split by sex.
# catplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only produces an extra empty figure and the requested
# size is ignored. Size the plot through height/aspect instead (6 x 14/6 ~= 14x6).
grid = sns.catplot(
    data=df,
    x='children',
    y='charges',
    hue='sex',
    palette='rainbow',
    kind="box",
    height=6,
    aspect=14 / 6,
)
# Set the title on the grid's own axes rather than on whatever pyplot considers current.
grid.set(title='Box plot of charges vs children');

In [None]:
# Mean / min / max of charges for each number of children.
# Select `charges` before aggregating: aggregating the whole frame first also
# tries to take the mean of the string/categorical columns, which raises on
# pandas >= 2.0 and is wasted work otherwise. The resulting table is the same.
df.groupby('children')['charges'].agg(['mean', 'min', 'max'])

In [None]:
# Box plot of charges by region, split by sex.
# catplot is figure-level and creates its own figure, so the size is given via
# height/aspect rather than a preceding plt.figure() (which only emitted an
# empty figure).
grid = sns.catplot(
    data=df,
    x='region',
    y='charges',
    hue='sex',
    palette='rainbow',
    kind="box",
    height=6,
    aspect=14 / 6,
)
# The x-axis is `region`; the previous title ("... vs children") was a copy-paste slip.
grid.set(title='Box plot of charges vs region');

In [None]:
# Side-by-side scatter plots of charges vs age (left) and charges vs bmi (right),
# coloured by smoker status.
fig = plt.figure(figsize=(14,6))
ax = fig.add_subplot(121)
sns.scatterplot(x='age',y='charges',data=df,palette='rainbow',hue='smoker',ax=ax)
ax.set_title('Scatter plot of Charges vs age')

ax = fig.add_subplot(122)
# Pass ax explicitly, as the left panel does, instead of relying on pyplot's
# implicit "current axes" happening to be the subplot that was just added.
sns.scatterplot(x='bmi',y='charges',data=df,palette='rainbow',hue='smoker',ax=ax)
ax.set_title('Scatter plot of Charges vs bmi')

# Linear regression

In [None]:
# Replace each string column with its integer category code so it can be fed to the model.
# NOTE(review): `region` becomes a single integer-coded column, which a linear model
# treats as ordered — confirm this is intended rather than one-hot encoding.
for column in ('sex', 'smoker', 'region'):
    df[column] = df[column].astype('category').cat.codes
df.dtypes

In [None]:
# Feature matrix: everything except the target and the two derived bin columns.
X_charges = df.drop(columns=['charges', 'ages', 'bmi_new'])
X_charges.shape

In [None]:
# Regression target: the charges column.
y_charges = df['charges']
y_charges.shape

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split; random_state is fixed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_charges,
    y_charges,
    train_size=0.7,
    random_state=0,
)

# Shapes of the four resulting pieces, in the same order as above.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator itself).
model = LinearRegression().fit(x_train, y_train)

# Report the fitted intercept and one coefficient per feature column.
print('intercept_ ', model.intercept_)
print('coef_ ',model.coef_)

## Evaluate the model's $R^2$ score on the test data

$$
R^2 = 1 - \frac{RSS}{TSS}, \qquad RSS = \sum_i (y_i - \hat{y}_i)^2, \qquad TSS = \sum_i (y_i - \bar{y})^2
$$


In [None]:
from sklearn.metrics import r2_score

# Predict on the held-out split and compare against the true charges.
y_predict = model.predict(x_test)
r2_score(y_true=y_test, y_pred=y_predict)

# KMeans

In [None]:
# Two-column view (bmi, charges) used as the clustering input below.
# NOTE(review): the columns are used unscaled; if charges spans a much larger
# range than bmi it will dominate the distance computation — confirm with
# df.describe() and consider standardizing first.
df2 = df[['bmi', 'charges']]

In [None]:
from sklearn.cluster import KMeans

# Cluster the (bmi, charges) points into three groups.
# random_state pins the centroid initialisation so labels and centres are the
# same on every Restart & Run All; n_init is set explicitly because its default
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(df2)

In [None]:
# Fitted cluster centres (one row per cluster; columns follow df2: bmi, charges).
print(kmeans.cluster_centers_)
# Cluster index assigned to each row of df2.
print(kmeans.labels_)

In [None]:
# Scatter the clustered points coloured by assigned cluster, then overlay the centres.
points = df2.values
centers = kmeans.cluster_centers_

plt.figure(figsize=(10, 6))
plt.scatter(points[:, 0], points[:, 1], c=kmeans.labels_, cmap="rainbow", s=25)
plt.scatter(centers[:, 0], centers[:, 1], color='black', marker="x", s=300)
plt.title("Kmeans Clustering on insurance", fontsize=16)
plt.xlabel('bmi')
plt.ylabel('charges')
plt.show()