In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the insurance dataset from its relative input path into a DataFrame.
INSURANCE_CSV = '../input/insurance/insurance.csv'
df = pd.read_csv(INSURANCE_CSV)

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# Summary statistics of the DataFrame's columns, using pandas' default describe() settings.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Keep only the rows whose "sex" column equals "male".
is_male = df["sex"] == "male"
df_male = df[is_male]

In [None]:
# Keep only the rows whose "sex" column equals "female".
is_female = df["sex"] == "female"
df_female = df[is_female]

In [None]:
# Histograms of insurance charges for the male subset, one grid row per smoker value.
# FacetGrid.map returns the grid itself, so g ends up bound to the FacetGrid.
g = sns.FacetGrid(df_male, row="smoker").map(plt.hist, "charges")

In [None]:
# Histograms of insurance charges for the female subset, one grid row per smoker value.
# FacetGrid.map returns the grid itself, so g ends up bound to the FacetGrid.
g = sns.FacetGrid(df_female, row="smoker").map(plt.hist, "charges")

In [None]:
# Age distribution by sex, split by smoker status (horizontal box plots: age on x, sex on y).
sns.boxplot(data=df, x='age', y='sex', hue='smoker')

In [None]:
# BMI against charges, with one regression fit per smoker value.
# Author's reading of the plot: smokers with a BMI between 30 and 50 have high charges.
# NOTE(review): charges is on the x-axis, so the fitted lines model bmi from charges -
# confirm that this orientation is intended.
sns.lmplot(data=df, x='charges', y='bmi', hue='smoker', palette='coolwarm')

In [None]:
# Age histograms in a grid: one column per number of children, one row per smoker value.
# FacetGrid.map returns the grid itself, so g ends up bound to the FacetGrid.
g = sns.FacetGrid(df, col="children", row="smoker").map(plt.hist, "age")

In [None]:
# List the distinct values found in the region column.
region_values = df['region']
region_values.unique()

In [None]:
# Charges vs. age scatter plots: one column per region, one row per smoker value, coloured by sex.
region_grid = sns.FacetGrid(df, col="region", row="smoker", hue='sex')
# Note how the column names are passed after the plotting function in the map call.
region_grid.map(plt.scatter, "charges", "age")
# add_legend returns the grid, so g is bound to the finished FacetGrid.
g = region_grid.add_legend()

In [None]:
# The author judged that region does not play a role in insurance redemption, so the column is dropped.
# errors='ignore' keeps this cell re-runnable: without it a second run raises KeyError
# because 'region' has already been removed from df.
df = df.drop(columns=['region'], errors='ignore')

In [None]:
# Preview the first five rows of the current frame.
df.head(n=5)

In [None]:
# Replace the sex, smoker and children columns with integer codes from a LabelEncoder.
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
# The same encoder instance is refitted on each column in turn, so after the loop
# it only holds the classes of the last column processed.
for column_name in ('sex', 'smoker', 'children'):
    df[column_name] = label_encoder.fit_transform(df[column_name])

In [None]:
# Feature matrix (the five predictor columns) and regression target (charges).
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker']
X = df[feature_columns]
y = df['charges']

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)

In [None]:
# Create a linear regression model and fit it on the training split.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
# fit() is left as the cell's last expression so the fitted estimator is displayed.
lm.fit(X=X_train, y=y_train)

In [None]:
# Predict charges for the held-out test features.
predictions = lm.predict(X=X_test)

In [None]:
# Distribution of the test-set residuals (actual minus predicted charges), drawn with 50 bins.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the environment's seaborn version is confirmed.
residuals = y_test - predictions
sns.distplot(residuals, bins=50);

In [None]:
from sklearn import metrics

In [None]:
# Report mean absolute error, mean squared error and its square root on the test set.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Coefficient of determination on the test set: R^2 = 1 - SS_Residual / SS_Total.
# The sums are computed with vectorized NumPy operations instead of Python's built-in
# sum(), which would iterate over the Series one element at a time.
actual_values = np.asarray(y_test, dtype=float)
prediction_errors = actual_values - np.asarray(predictions, dtype=float)
# Sum of squared differences between actual and predicted values.
SS_Residual = np.sum(prediction_errors ** 2)
# Sum of squared differences between actual values and their mean.
SS_Total = np.sum((actual_values - actual_values.mean()) ** 2)
r_squared = 1 - SS_Residual / SS_Total
print('R Squared:', r_squared)

In [None]:
# Regression plot of the actual test values (x-axis) against the predicted values (y-axis).
# x and y are passed by keyword: newer seaborn releases accept only `data` positionally,
# so regplot(y_test, predictions) raises a TypeError there.

plt.figure(figsize=(16,8))
sns.regplot(x=y_test, y=predictions)
# Labels follow the plotted data: y_test is on the x-axis, predictions on the y-axis.
plt.xlabel('Actual')
plt.ylabel('Predictions')
plt.title("Linear Model Predictions")
plt.grid(False)
plt.show()