In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme globally so every figure below shares one style.
sns.set()

## Data preprocessing

In [None]:
# Load the insurance dataset and print its column dtypes and non-null counts.
insurance_csv = '../input/insurance/insurance.csv'
data = pd.read_csv(insurance_csv)
data.info()

In [None]:
# Keep an untouched snapshot in raw_data; `data` is the working copy used for preprocessing.
raw_data = data.copy()
# List the distinct region labels before encoding them.
data['region'].unique()

In [None]:
# One-hot encode region. drop_first=True drops the first category so it serves
# as the baseline level. dtype=int keeps the dummies as 0/1 integers on every
# pandas version (pandas >= 2.0 would otherwise return booleans).
#
# Both frames are derived from raw_data rather than from `data` itself, so this
# cell can be re-run: previously `data` was rebound without 'region', and a
# second run failed with KeyError on data['region'].
region_dummies = pd.get_dummies(raw_data['region'], drop_first=True, dtype=int)
data = raw_data.drop(columns=['region'])
data_1 = pd.concat([data, region_dummies], axis=1)
data_1.columns.values

In [None]:
# northeast is the benchmark region (its dummy column was dropped by drop_first).
# Rename the remaining region dummies by name instead of overwriting the whole
# column list positionally, so a change in column order can never silently
# attach the wrong label to a column.
region_labels = {
    'northwest': 'northwest region',
    'southeast': 'southeast region',
    'southwest': 'southwest region',
}
data_1 = data_1.rename(columns=region_labels)
# Fail loudly if the region dummies are not the ones expected above.
assert set(region_labels.values()) <= set(data_1.columns), "unexpected region dummy columns"
data_1.head()

In [None]:
# Encode the binary categoricals as 0/1.
# The encoding is written to a NEW frame and data_1 keeps its original labels,
# so re-running this cell is safe. Mapping in place would turn the already
# encoded 0/1 values into NaN on a second run.
data_processed = data_1.assign(
    sex=data_1['sex'].map({'female': 0, 'male': 1}),
    smoker=data_1['smoker'].map({'no': 0, 'yes': 1}),
)
# .map() yields NaN for any label missing from the dictionaries; catch that here.
assert data_processed[['sex', 'smoker']].notna().all().all(), "unmapped label in sex/smoker"
data_processed.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Correlation heatmap of the numeric columns.
# raw_data still holds the string columns (sex, smoker, region); pandas >= 2.0
# raises when .corr() meets non-numeric data, so restrict to numeric columns.
plt.figure(figsize=(8,6))
numeric_corr = raw_data.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, cmap="winter_r")
plt.show()

In [None]:
# Plotting categorical variables: one row per variable, with a horizontal bar
# chart of the counts on the left and a pie chart of the shares on the right.
# Each row gets its own axes title; a figure has only one suptitle, so calling
# fig.suptitle once per variable left just the last title visible.
categorical_panels = [
    ("sex", "Gender Frequency"),
    ("smoker", "Smoking Frequency"),
    ("region", "Region Frequency"),
]
fig, ax = plt.subplots(len(categorical_panels), 2, figsize=(14,8))
for row, (column, title) in enumerate(categorical_panels):
    counts = raw_data[column].value_counts()
    bar_ax, pie_ax = ax[row, 0], ax[row, 1]
    counts.plot.barh(color="purple", ax=bar_ax)
    counts.plot.pie(autopct='%1.1f%%', shadow=True, textprops={"fontsize": 10}, ax=pie_ax)
    bar_ax.set_title(title, fontsize=15)
    # Set tick rotation on the bar axes explicitly instead of on "the current axes".
    bar_ax.tick_params(axis="x", labelrotation=0)
    bar_ax.tick_params(axis="y", labelrotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Plotting numerical variables: histogram + KDE, box plot and violin plot of age.
fig, ax = plt.subplots(1, 3, figsize=(14,5))
fig.suptitle("Age Distribution", fontsize=15)
# histplot(kde=True, stat="density") replaces the deprecated distplot.
sns.histplot(x=raw_data["age"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[0])
# Pass the data as x= so the orientation does not depend on the seaborn version.
sns.boxplot(x=raw_data["age"], ax=ax[1])
sns.violinplot(x=raw_data["age"], ax=ax[2])
# The trailing plt.figure(figsize=(20,50)) only created an empty extra figure; show the panel instead.
plt.show()


In [None]:
# Histogram + KDE, box plot and violin plot of BMI.
fig, ax = plt.subplots(1, 3, figsize=(14,5))
fig.suptitle("BMI Distribution", fontsize=15)
# histplot(kde=True, stat="density") replaces the deprecated distplot.
sns.histplot(x=raw_data["bmi"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[0])
# Pass the data as x= so the orientation does not depend on the seaborn version.
sns.boxplot(x=raw_data["bmi"], ax=ax[1])
sns.violinplot(x=raw_data["bmi"], ax=ax[2])
plt.show()

In [None]:
# Histogram + KDE, box plot and violin plot of the number of children.
fig, ax = plt.subplots(1, 3, figsize=(14,5))
fig.suptitle("Children Distribution", fontsize=15)
# histplot(kde=True, stat="density") replaces the deprecated distplot.
sns.histplot(x=raw_data["children"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[0])
# Pass the data as x= so the orientation does not depend on the seaborn version.
sns.boxplot(x=raw_data["children"], ax=ax[1])
sns.violinplot(x=raw_data["children"], ax=ax[2])
plt.show()

In [None]:
# Histogram + KDE, box plot and violin plot of the charges.
fig, ax = plt.subplots(1, 3, figsize=(14,5))
fig.suptitle("Charges Distribution", fontsize=15)
# histplot(kde=True, stat="density") replaces the deprecated distplot.
sns.histplot(x=raw_data["charges"], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[0])
# Pass the data as x= so the orientation does not depend on the seaborn version.
sns.boxplot(x=raw_data["charges"], ax=ax[1])
sns.violinplot(x=raw_data["charges"], ax=ax[2])
plt.show()


In [None]:
# Removing outliers considering the charges pdf: drop rows at or above the cut-off.
# (Combine several boolean masks with & if more conditions are needed.)
charges_cutoff = 50000
is_outlier = data_processed['charges'] >= charges_cutoff
# Filter into a fresh copy instead of drop(inplace=True), so no other frame that
# shares this object is modified behind the reader's back. The original row
# index is kept on purpose.
data_processed = data_processed.loc[~is_outlier].copy()
# histplot(kde=True, stat="density") replaces the deprecated distplot.
sns.histplot(x=data_processed['charges'], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Age vs smoker: number of smokers / non-smokers at each age.
plt.figure(figsize=(12,8))
# Refer to both columns by name within one frame so every age is paired with
# the smoker flag of the same row. A separately sorted age Series next to an
# unsorted hue Series relies on seaborn re-aligning them by index. countplot
# already orders numeric categories, so no manual sort is needed.
sns.countplot(x="age", hue="smoker", data=raw_data, palette="plasma")

In [None]:
# Charges against age and against BMI, coloured by smoking status.
for feature in ("age", "bmi"):
    sns.relplot(data=data, x=feature, y="charges", hue="smoker")
plt.show()

In [None]:
# Same scatter plots, with the marker style additionally encoding sex.
shared_encoding = dict(y="charges", hue="smoker", style="sex", data=data)
sns.relplot(x="age", **shared_encoding)
sns.relplot(x="bmi", **shared_encoding)

In [None]:
# Pairwise relationships between the numeric columns, coloured by sex.
sns.pairplot(data=raw_data, hue="sex")

In [None]:
# Pairwise relationships between the numeric columns, coloured by region.
sns.pairplot(data=raw_data, hue="region")

## Checking and Relaxing the OLS assumptions

### Linearity

In [None]:
# In the relplots above, charges vs age and charges vs bmi are widely scattered,
# so the target is modelled on a natural-log scale.
log_charges = data_processed['charges'].pipe(np.log)
data_processed['log_charges'] = log_charges

In [None]:
# Log-charges against age and against BMI, coloured by smoking status.
for feature in ("age", "bmi"):
    sns.relplot(data=data_processed, x=feature, y="log_charges", hue="smoker")

### Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factors for the continuous predictors.
variables = data_processed[['bmi','age']]
# variance_inflation_factor regresses each column on the remaining columns of
# the matrix it is given and does not add an intercept itself. Without a
# constant column the VIFs are "uncentred" and inflated, so add one here.
vif_design = sm.add_constant(variables)
vif = pd.DataFrame()
# Column 0 of vif_design is the constant; report VIFs for the real features only.
vif["VIF"] = [
    variance_inflation_factor(vif_design.values, i)
    for i in range(1, vif_design.shape[1])
]
vif["features"] = variables.columns
vif

## Multiple linear regression model

In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix by column NAME, not by position.
# 'log_charges' was appended after the region dummies, so the positional slice
# iloc[:, :-2] kept the raw 'charges' column inside X (leaking the target into
# the features) and dropped 'southwest region'.
features = data_processed.drop(columns=['charges', 'log_charges'])
# Cast to float so X is a plain numeric array even when the dummies are bool/uint8.
X = features.astype(float).values
# The model is trained on the log-transformed target.
y = data_processed['log_charges'].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler

# Positions of the continuous columns (age, bmi) in the feature matrix.
scaled_cols = [0, 2]
sc = StandardScaler()
# Learn mean/std from the training split only, then apply them to both splits.
sc.fit(X_train[:, scaled_cols])
for split in (X_train, X_test):
    split[:, scaled_cols] = sc.transform(split[:, scaled_cols])

In [None]:
# Fit the multiple linear regression model on the training split.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test split and print predictions (left) next to targets (right).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Model performance: coefficient of determination on the test split (log scale).
from sklearn.metrics import r2_score

r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Summarise the fitted model: one weight per input feature, plus the intercept.
# NOTE(review): the names below are only correct if X was built from exactly
# these columns, in this order - confirm against the cell that creates X.
inputs = data_processed.drop(['charges', 'log_charges'], axis=1)
assert len(inputs.columns) == len(regressor.coef_), \
    "number of feature names does not match the number of fitted coefficients"
regressor_summary = pd.DataFrame({
    'Features': inputs.columns.values,
    'Weights': regressor.coef_,
})
coff = pd.DataFrame([['intercept', regressor.intercept_]], columns=['Features','Weights'])
# ignore_index avoids a duplicate row label 0 for the appended intercept row.
regressor_summary = pd.concat([regressor_summary, coff], ignore_index=True)
regressor_summary

In [None]:
# Compare predictions and targets on the original charge scale (undo the log).
dataframe_performance = pd.DataFrame({
    'predictions': np.exp(y_pred),
    'target': np.exp(y_test),
})
residual = dataframe_performance['target'] - dataframe_performance['predictions']
dataframe_performance['Residual'] = residual
# Absolute error as a percentage of the true charge.
dataframe_performance['Difference%'] = (residual / dataframe_performance['target'] * 100).abs()

# Show every row, with two decimals, ordered from best to worst prediction.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
dataframe_performance.sort_values(by=['Difference%'])

In [None]:
# Mean of the absolute percentage differences on the test split.
# Despite the variable name, this is a mean absolute percentage error, not an MSE.
testing_MSE_perentage = dataframe_performance['Difference%'].agg('mean')
testing_MSE_perentage

In [None]:
# Predicted vs actual log-charges; a good fit clusters along the diagonal.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.2)
ax.set_xlabel('Targets (y_test)', size=18)
ax.set_ylabel('Predictions (y_pred)', size=18)
# Same range on both axes so the diagonal is the line of perfect prediction.
ax.set_xlim(6, 13)
ax.set_ylim(6, 13)
plt.show()