# Import Libs

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

# Show every column when a wide frame is displayed. The option is addressed by its
# canonical name instead of relying on regex matching of 'display.max.columns'.
pd.set_option('display.max_columns', None)

# Data loading and overview

In [None]:
# Read the insurance CSV into a DataFrame and show it as the cell output.
DATA_PATH = '../input/insurance/insurance.csv'
df = pd.read_csv(DATA_PATH)
df

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.describe()

In [None]:
df.head()

# EDA
### Distplot
We look at the distribution of each numeric column.

In [None]:
# distplot is deprecated in seaborn; histplot with a KDE overlay is its replacement.
sns.histplot(df['age'], kde=True, stat='density')

In [None]:
# distplot is deprecated in seaborn; histplot with a KDE overlay is its replacement.
sns.histplot(df['bmi'], kde=True, stat='density')

In [None]:
# distplot is deprecated in seaborn; histplot with a KDE overlay is its replacement.
sns.histplot(df['children'], kde=True, stat='density')

In [None]:
# distplot is deprecated in seaborn; histplot with a KDE overlay is its replacement.
sns.histplot(df['charges'], kde=True, stat='density')

The feature distributions are not normal.

### Barplot

In [None]:
sns.barplot(x = 'sex', y = 'charges', data = df)

In [None]:
sns.barplot(x = 'region', y = 'charges', data = df)

In [None]:
sns.barplot(x = 'smoker', y = 'charges', data = df)

### Countplot

In [None]:
sns.countplot(x = 'smoker', data = df)

In [None]:
sns.countplot(x = 'sex', data = df)

In [None]:
sns.countplot(x = 'region', data = df)

### Boxplot
We look for outliers and for differences between groups.

In [None]:
sns.boxplot(x = 'smoker', y = 'bmi', hue = 'sex', data = df)

In [None]:
sns.boxplot(x = 'smoker', y = 'age', hue = 'sex', data = df)

In [None]:
sns.boxplot(x = 'smoker', y = 'charges', hue = 'sex',data = df)

Smokers have much higher charges than non-smokers.

# Preprocessing
## Grouping data
### Age groups

* Under 30s
* 30 - 40
* 40 - 50
* 50 - 60
* Over 60s

### bmi groups

* 5 - 10
* 10 - 15
* 15 - 20
* 20 - 25
* 25 - 30
* 30 - 35
* 35 +

### Children groups

* 0 - 1
* 1 - 2
* 2 - 3
* 3 - 4
* 4 +


In [None]:
# Bucket ages into right-closed intervals: <=30, (30, 40], (40, 50], (50, 60], >60.
# pd.cut yields exactly one label per row (NaN for a missing age), so the result
# always stays aligned with df, unlike appending to a list inside a loop.
age_bin_edges = [-np.inf, 30, 40, 50, 60, np.inf]
age_bin_labels = ['0-30', '30-40', '40-50', '50-60', '60+']
age_groups = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

# Store plain labels rather than a Categorical so that get_dummies only creates
# columns for groups that actually occur in the data.
df['age_group'] = age_groups.astype(object)

In [None]:
sns.countplot(x = 'age_group', data = df)

In [None]:
# Bucket BMI into right-closed intervals of width 5; the first bin takes everything
# <= 10 and the last everything > 35.
# pd.cut yields exactly one label per row (NaN for a missing value), so the result
# always stays aligned with df, unlike appending to a list inside a loop.
bmi_bin_edges = [-np.inf, 10, 15, 20, 25, 30, 35, np.inf]
bmi_bin_labels = ['5-10', '10-15', '15-20', '20-25', '25-30', '30-35', '35+']
bmi_groups = pd.cut(df['bmi'], bins=bmi_bin_edges, labels=bmi_bin_labels)

# Store plain labels rather than a Categorical so that get_dummies only creates
# columns for groups that actually occur in the data.
df['bmi_group'] = bmi_groups.astype(object)

In [None]:
sns.countplot(x = 'bmi_group', data = df)

In [None]:
# Bucket the number of children into right-closed intervals: <=1, (1, 2], (2, 3],
# (3, 4], >4. The binning is unchanged; only the first label is corrected from '0'
# to '0-1', because that bin also contains rows with one child.
# pd.cut yields exactly one label per row (NaN for a missing value), so the result
# always stays aligned with df, unlike appending to a list inside a loop.
children_bin_edges = [-np.inf, 1, 2, 3, 4, np.inf]
children_bin_labels = ['0-1', '1-2', '2-3', '3-4', '4+']
children_groups = pd.cut(df['children'], bins=children_bin_edges, labels=children_bin_labels)

# Store plain labels rather than a Categorical so that get_dummies only creates
# columns for groups that actually occur in the data.
df['children_group'] = children_groups.astype(object)

In [None]:
sns.countplot(x = 'children_group', data = df)

In [None]:
# NOTE(review): this import sits mid-notebook; the other sklearn imports are in the first cell.
from sklearn.preprocessing import LabelEncoder
# One encoder instance, reused (and refit) for each column encoded in the next cell.
le = LabelEncoder()


In [None]:
# Label-encode 'sex' and 'smoker' in place; the shared encoder is refit per column.
for encoded_col in ('sex', 'smoker'):
    df[encoded_col] = le.fit_transform(df[encoded_col])

In [None]:
# One-hot encode the three group columns and 'region' in a single pass; the dummy
# columns are appended in the order the source columns are listed here.
df = pd.get_dummies(
    df,
    columns=['age_group', 'bmi_group', 'children_group', 'region'],
)


# Modeling

In [None]:
df.corr()['charges'].sort_values()

In [None]:
# Remove the raw numeric columns now that their grouped dummies are in the frame.
df = df.drop(columns=['age', 'bmi', 'children'])

In [None]:
# Min-max scale every column of df and keep the original column names.
# NOTE(review): df_mm is not referenced by any later visible cell.
mm = MinMaxScaler()
df_mm = pd.DataFrame(mm.fit_transform(df), columns = df.columns)

We use scaling

In [None]:
# Standardize every column of df and keep the original column names.
# NOTE(review): the scaler is fit on the whole frame ('charges' included) before the
# train/test split, so test rows contribute to the scaling statistics.
std = StandardScaler()
standardized_values = std.fit_transform(df)
df_std = pd.DataFrame(data = standardized_values, columns = df.columns)

In [None]:
df_std.head()

In [None]:
# Target is the standardized 'charges' column; every other column is a feature.
y = df_std['charges']
X = df_std.drop(columns = 'charges')

### Train/test split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

# Print the shape of each piece, one per line.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Working copy of the training features; the elimination loop below shrinks it.
X_train_dropped = X_train.copy()

# Fit OLS on all training features (no constant column is added) and print the report.
ols_model = sm.OLS(endog = y_train, exog = X_train)
regressor = ols_model.fit()
print(regressor.summary())

### The loop removes features with a high p-value

In [None]:
# Backward elimination: repeatedly drop the feature with the largest p-value and
# refit, until every remaining p-value is at most the threshold.
p_value_threshold = 0.05

# Rebuild the starting point here so this cell gives the same result when re-run,
# even after `regressor` has been rebound to a different model further down.
X_train_dropped = X_train.copy()
regressor = sm.OLS(y_train, X_train_dropped).fit()

while True:
    # idxmax returns the label of the largest p-value, which avoids positional
    # indexing (series[0]) on a label-indexed Series.
    worst_feature = regressor.pvalues.idxmax()
    worst_pvalue = regressor.pvalues.loc[worst_feature]

    if not worst_pvalue > p_value_threshold:
        print("All p values less than 0.05")
        break

    if X_train_dropped.shape[1] == 1:
        # Dropping the last column would leave nothing to regress on.
        print("Stopping: " + worst_feature + " is the only feature left, pvalue is: " + str(worst_pvalue))
        break

    print("Dropping " + worst_feature + " and running regression again because pvalue is: " + str(worst_pvalue))
    X_train_dropped = X_train_dropped.drop(columns = [worst_feature])
    regressor = sm.OLS(y_train, X_train_dropped).fit()


In [None]:
print(regressor.summary())


In [None]:
X_train_dropped.shape

### Feature selection with SelectKBest

In [None]:
# Sweep k for SelectKBest: for each k, keep the k best features by f_regression,
# fit a linear regression and record R^2 on the train and test sets.
column_names = df.drop(columns = ['charges']).columns

no_of_features = []
r_squared_train = []
r_squared_test = []

# Original note: "Look at X_train_dropped shape" -- the upper bound of 12 was
# presumably read off that shape (TODO confirm). It is clamped so that k never
# exceeds the number of available features, which SelectKBest rejects.
max_k = min(12, X_train.shape[1])

for k in range(1, max_k + 1):
    selector = SelectKBest(f_regression, k = k)
    X_train_transformed = selector.fit_transform(X_train, y_train)
    X_test_transformed = selector.transform(X_test)
    regressor = LinearRegression()
    regressor.fit(X_train_transformed, y_train)
    no_of_features.append(k)
    r_squared_train.append(regressor.score(X_train_transformed, y_train))
    r_squared_test.append(regressor.score(X_test_transformed, y_test))

# Label both curves so the legend tells train from test (train is drawn first).
ax = sns.lineplot(x = no_of_features, y = r_squared_train, label = 'train')
sns.lineplot(x = no_of_features, y = r_squared_test, label = 'test', ax = ax)
ax.set(xlabel = 'k (number of selected features)', ylabel = 'R squared');

In [None]:
# k = 8 gave the best score on the orange line of the plot above.
selector = SelectKBest(score_func = f_regression, k = 8)
selector.fit(X_train, y_train)

# Reduce both splits to the selected columns using the selector fitted on train.
X_train_transformed = selector.transform(X_train)
X_test_transformed = selector.transform(X_test)

# Names of the features that were kept.
column_names[selector.get_support()]


In [None]:
def regression_model(model, train_X=None, train_y=None, test_X=None, test_y=None):
    """
    Fit the regression model passed and return the fitted model and its test score.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        The model is fitted in place.
    train_X, train_y, test_X, test_y : optional
        Data to fit and score on. Any argument left as ``None`` falls back to the
        notebook-level ``X_train_transformed``, ``y_train``, ``X_test_transformed``
        and ``y_test`` respectively, so ``regression_model(model)`` keeps working.

    Returns
    -------
    tuple
        ``(model, score)`` where ``model`` is the same object that was passed in and
        ``score`` is ``model.score(test_X, test_y)``.
    """
    # Resolve defaults lazily so the globals are only required when actually used.
    if train_X is None:
        train_X = X_train_transformed
    if train_y is None:
        train_y = y_train
    if test_X is None:
        test_X = X_test_transformed
    if test_y is None:
        test_y = y_test

    model.fit(train_X, train_y)
    score = model.score(test_X, test_y)
    return model, score

In [None]:
# A fixed seed for the stochastic models keeps the score table repeatable; 42
# matches the random_state used for the train/test split.
models_to_evaluate = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    SVR(),
    RandomForestRegressor(random_state = 42),
    MLPRegressor(random_state = 42),
]

# Collect one record per model and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
performance_records = []
for model in models_to_evaluate:
    regressor, score = regression_model(model)
    performance_records.append({
        "Features": "Linear",
        # Store the class name rather than the estimator object so the table
        # shows a short, readable label for every model.
        "Model": type(model).__name__,
        "Score": round(score, 2),
    })

model_performance = pd.DataFrame(performance_records, columns = ["Features", "Model", "Score"])

model_performance

# Thanks for watching!
## If you liked this notebook, please upvote it or share your opinion