Hi,

I'd appreciate your comments and suggestions on this simple attempt at Regression & KNN :)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance dataset from the Kaggle input directory and preview the first rows
INSURANCE_CSV = "/kaggle/input/insurance/insurance.csv"
data = pd.read_csv(INSURANCE_CSV)
data.head()

In [None]:
# Summary statistics for every column, whatever its dtype
data.describe(include="all")

# Step 1 - Missing Values Handling

In [None]:
# Per-column flag: True when the column holds at least one missing value
data.isna().any()

# Step 2- Data Exploration



In [None]:
# Correlation is only defined for numeric columns. `data` still contains the raw
# categorical columns at this point (sex / smoker / region are label-encoded later),
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so restrict
# the frame to numeric dtypes before computing the correlation matrix.
numeric_corr = data.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, fmt='.2f', linewidths=.5)

Observations:
1. Each numerical variable has only a low correlation with the charges.
2. No multicollinearity (so the regression coefficients can be analyzed).
3. Among the numerical variables, age affects the charges the most.

In [None]:
# Pairwise scatter plots / histograms of the columns of the raw data
sns.pairplot(data=data)

Similar behaviour can be observed in the pairplot: age shows the highest correlation.
Looking at the above features, polynomial features could be somewhat helpful in the final model creation.

In [None]:
# Minimum value for age is 18 and maximum is 64
bins = [0, 30, 40, 50, 60, np.inf]
names = ['18-30', '30-40', '40-50', '50-60', '60-64']
# pd.cut builds right-closed intervals by default, so e.g. age 30 falls in '18-30'
data['AgeRange'] = pd.cut(data['age'], bins, labels=names)

# AgeRange is categorical: pass `observed` explicitly so the grouping does not depend
# on the installed pandas version's default and no FutureWarning is emitted.
mean_charges_by_age = (
    data[['AgeRange', 'charges']]
    .groupby('AgeRange', observed=False)
    .mean()
    .reset_index()
)
sns.barplot(data=mean_charges_by_age, x='AgeRange', y='charges')


Age - validates the obvious assumption that charges increase with age.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
count_ax, charges_ax = axes

# Left panel: number of smokers / non-smokers, split by sex
sns.countplot(data=data, x='smoker', hue='sex', ax=count_ax)

# Right panel: mean charges by smoker status, split by sex
mean_charges_plot = sns.barplot(
    data=data, x='smoker', hue='sex', y='charges', estimator=np.mean, ax=charges_ax
)
mean_charges_plot.set(xlabel='Smoker', ylabel='Mean of Charges if the person is smoker')

# Descriptive statistics: mean / median / count of charges per (smoker, sex) group
print("Descriptive Statistics based on Smoker, Sex & Charges")
charges_by_smoker_sex = data[['smoker', 'sex', 'charges']].groupby(['smoker', 'sex'])
charges_by_smoker_sex.agg({'charges': ['mean', 'median', 'count']})

Some Observations:
    About 17% of the population (it's a sample though :P) in the data are smokers.
    We can clearly observe that smoking increases your medical charges by 80%.

In [None]:
# Smoking effects combined with BMI, age and children on charges

fig, axes = plt.subplots(2, 2, figsize=(18, 15))
(bmi_ax, age_bar_ax), (age_box_ax, children_ax) = axes

# Top-left: BMI vs charges, coloured by smoker status
sns.scatterplot(data=data, x='bmi', y='charges', hue='smoker', ax=bmi_ax)

# Top-right: median charges per age range, split by smoker status
sns.barplot(data=data, x='AgeRange', y='charges', hue='smoker', estimator=np.median, ax=age_bar_ax)

# Bottom-left: distribution of charges per age range, split by smoker status
sns.boxplot(data=data, x='AgeRange', y='charges', hue='smoker', ax=age_box_ax)

# Bottom-right: mean number of children for smokers vs non-smokers
sns.barplot(data=data, x='smoker', y='children', estimator=np.mean, ax=children_ax)

In [None]:
# Age vs charges, coloured by smoker status, drawn on the freshly created axes
fig, axes = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=data, x='age', y='charges', hue='smoker', ax=axes)

# Mean / median charges per (age range, smoker) group
print("Smoker Average Cost")
charges_by_age_smoker = data[['AgeRange', 'smoker', 'charges']].groupby(by=['AgeRange', 'smoker'])
charges_by_age_smoker.agg({'charges': ['mean', 'median']})

Observations from above graphs:
Charges increase with BMI (obesity has a strong effect) if the person smokes. Growing old doesn't necessarily mean exponentially higher medical costs, but for smokers the costs do increase exponentially.

The average charge for smokers in the 18-30 bracket is almost 50% higher than the average charge for non-smokers in the 60-64 bracket. Better not to smoke, then.

Smoking KILLS (not sure from this data)!!! But it definitely weighs heavily on the wallet.
Rejects the assumption that smokers have fewer children.

# Step 3 - Feature Engineering

In [None]:
#Required Libraries
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures

In [None]:
def print_sample_categorical(data):
    """Print a preview of ``data`` and the value counts of its categorical columns.

    Prints the first rows of the frame, then the value counts of the
    'sex', 'smoker' and 'region' columns, in that order. Returns nothing.
    """
    print(data.head())
    for column in ('sex', 'smoker', 'region'):
        print(data[column].value_counts())
    
# Preview the categorical columns of the data before they are label-encoded
print_sample_categorical(data=data)

Normalizing and Encoding the Categorical Features

In [None]:
# Build the modelling frame: everything in `data` except the exploration-only AgeRange column
X = data.drop(columns=['AgeRange'])

# Label-encode each categorical column (the encoder is refitted for every column)
print("Transforming Categorical Values:\n")
le = LabelEncoder()
for categorical_col in ('sex', 'smoker', 'region'):
    X[categorical_col] = le.fit_transform(X[categorical_col])
print_sample_categorical(X)

# Standard scaler, because KNN will be applied to the data
print("Scaling Each of the Values using standard Scaler Cause of KNN")
sscaler = StandardScaler()
def scale_Features(data):
    """Return a standardised copy of ``data`` (per-column z-scores).

    Every column is centred on its mean and divided by its population
    standard deviation (``ddof=0``), which is the same transformation that
    scikit-learn's ``StandardScaler`` applies with its default settings.
    Constant columns are only centred (their divisor is set to 1), so they
    come out as zeros instead of NaN.

    Unlike the previous implementation, the input frame is NOT modified in
    place and no module-level scaler object is needed; callers must use the
    returned frame (``X = scale_Features(X)``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``data``.
    """
    column_mean = data.mean()
    column_std = data.std(ddof=0)
    # A column with a single distinct value has no spread: divide by 1 rather than 0
    column_std = column_std.mask(data.nunique() <= 1, 1.0)
    return (data - column_mean) / column_std
    
# Standardise every column of X (this still includes the 'charges' target) and preview it
X = scale_Features(data=X)
print(X.head(5))

Feature Selection - Based on Pearson Correlation

In [None]:
# Pearson correlations between the encoded, standardised columns
encoded_corr = X.corr()
sns.heatmap(encoded_corr, annot=True, fmt='.2f', linewidths=0.5)

Removing region and sex from the dataset, as they have very low correlation with the charges and would add noise to both the Regression and the KNN models.

In [None]:
# Pairwise plots of the original (unscaled) data frame
sns.pairplot(data=data)

In [None]:
# Target: the (standardised) charges column
y = X['charges']
# Features: drop the target plus the two weakly correlated columns
dropped_columns = ['charges', 'region', 'sex']
X = X.drop(columns=dropped_columns)
print(X.head(), y.head(), sep='\n')

In [None]:
# Adding polynomial (degree 2) features to the dataset.
# include_bias=False: the default constant column of ones is redundant here, because
# LinearRegression fits its own intercept and the column turns into all zeros once
# standardised, so it carries no information for KNN distances either.
sqr = PolynomialFeatures(degree=2, include_bias=False)
X_sqr = sqr.fit_transform(X)
X_sqr = scale_Features(pd.DataFrame(X_sqr))
# Preview only the first rows instead of dumping the whole frame
print(X_sqr.head())
print(X.head())

# Step 4 - Modelling

In [None]:
# Mean 5-fold cross-validation score for each (model, feature set) candidate
cv_candidates = (
    ("Cross Validation Score LR: ", LinearRegression(), X),
    ("Cross Validation Score Polynomial Regression: ", LinearRegression(), X_sqr),
    ("Cross Validation Score KNN: ", KNeighborsRegressor(), X_sqr),
)
for label, estimator, features in cv_candidates:
    print(label, cross_val_score(estimator=estimator, X=features, y=y, cv=5).mean())

In [None]:
def model_validation(model, X, y, test_size=0.2, random_state=0, cv=5):
    """Fit ``model`` on a train split, print train/test metrics and cross-validate it.

    The data is split with ``train_test_split``, the model is fitted on the
    training part, and R2 (``model.score``) and mean absolute error are printed
    for both parts. The mean ``cross_val_score`` over the full ``X`` / ``y`` is
    then computed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.
    X, y : features and target passed to ``train_test_split`` / ``cross_val_score``.
    test_size : float, default 0.2
        Fraction of the data held out for the test split.
    random_state : int, default 0
        Seed for the train/test split.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    (model, cv_error)
        ``model`` as fitted on the training split, and the mean cross-validation
        score. NOTE: despite its name, ``cv_error`` is the mean of the values
        returned by ``cross_val_score`` (a score, not an error).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    print("R2 Train :", model.score(X_train, y_train))
    print("R2 Test :", model.score(X_test, y_test))
    # mean_absolute_error is documented as (y_true, y_pred); the value is symmetric,
    # but keep the conventional order so other metrics can be swapped in safely
    print("MAE Train:", mean_absolute_error(y_train, model.predict(X_train)))
    print("MAE Test:", mean_absolute_error(y_test, model.predict(X_test)))
    cv_error = cross_val_score(estimator=model, X=X, y=y, cv=cv).mean()
    return model, cv_error
    

In [None]:
# Fit and validate a plain linear regression, then show its parameters per feature
lr_model, cv_error = model_validation(LinearRegression(), X, y)
coefficients = {feature: weight for feature, weight in zip(X.columns, lr_model.coef_)}
print("Regression Coefficients :", coefficients)
print("Regression Intercept :", lr_model.intercept_)

Tuning KNN:

In [None]:
# Tuning KNN: baseline with the default number of neighbours
knn_model, cv_error = model_validation(KNeighborsRegressor(), X, y)

# Changing K (number of neighbours): mean 5-fold cross-validation score for each K.
# A single K range drives the scores, the x values and the ticks, so changing
# n_iter keeps the plot consistent.
fig, axes = plt.subplots(figsize=(20, 5))
n_iter = 50
k_values = list(range(1, n_iter))
cv_list = [
    cross_val_score(estimator=KNeighborsRegressor(n_neighbors=k), X=X, y=y, cv=5).mean()
    for k in k_values
]
# NOTE: the y axis shows the mean cross_val_score value (the estimator's default score)
sns.lineplot(x=k_values, y=cv_list, markers=True, dashes=True, marker='o', ax=axes).set(
    xlabel='K Value',
    ylabel='Accuracy',
    xticks=k_values,
)

Based on the above graph, it can be observed that the accuracy increases up to about K = 10 and remains nearly constant after that. Choosing 15 as the number of neighbours.

In [None]:
# Validate KNN with the chosen number of neighbours (K = 15)
knn_model, cv_error = model_validation(model=KNeighborsRegressor(n_neighbors=15), X=X, y=y)

Training KNN & Polynomial Regression on all the data, as they yield better accuracy.

In [None]:
# KNN fitted on the full dataset
knn = KNeighborsRegressor(n_neighbors=15).fit(X, y)
print("R2 score KNN: ", knn.score(X, y))

# Polynomial regression, fitted on the squared dataset X_sqr created earlier
plr = LinearRegression().fit(X_sqr, y)
print("R2 score Polynomial Regression", plr.score(X_sqr, y))