In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the insurance dataset from the Kaggle input directory and preview the first rows.
insurance_csv_path = "/kaggle/input/insurance/insurance.csv"
df = pd.read_csv(insurance_csv_path)
df.head()

# Explore the Data

In [None]:
# Summary statistics; by default describe() reports only the numeric columns of a mixed-dtype frame.
df.describe()

The mean and median values of the variables 'age', 'bmi' and 'children' are close to each other, which suggests that these variables are roughly symmetrically (approximately normally) distributed.

In [None]:
# Draw one box plot per numeric column, each on its own independently scaled axes.
df.plot.box(
    subplots=True,
    layout=(5, 5),
    sharex=False,
    sharey=False,
    figsize=(20, 10),
)
plt.show()

In [None]:
# Print column dtypes and non-null counts; used here to check for missing values.
df.info()

There are no null values in our data set.

In [None]:
# Store the nominal/ordinal text columns as pandas categoricals.
categorical_columns = ['sex', 'smoker', 'region']
df = df.astype({name: 'category' for name in categorical_columns})

In [None]:
# this function plots the frequency distribution of a categorical feature
def show_frequency_distribution(feature_name, feature_values=None):
    """Plot a bar chart and a count table for one categorical feature.

    Parameters
    ----------
    feature_name : pandas.Series
        The feature column itself (despite the parameter name). Its ``name``
        attribute is used in the x-axis label and in the title.
    feature_values : sequence, optional
        Tick labels for the bars and row labels for the table, one per
        distinct value in sorted order. Defaults to the sorted distinct
        values of the feature.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Count each distinct value once and reuse the result for bars and table.
    freq = feature_name.value_counts().sort_index()
    if feature_values is None:
        feature_values = list(freq.index)
    # Local name chosen so the notebook-level ``df`` is not shadowed.
    counts_table = freq.to_frame()

    fig = plt.figure(figsize=(5, 3))

    # Left panel: bar chart of the counts.
    ax_bar = fig.add_subplot(1, 2, 1)
    ax_bar.bar(freq.index, freq, color='salmon')
    ax_bar.set_xticks(freq.index)
    ax_bar.set_xticklabels(feature_values, rotation=45)
    ax_bar.set_ylabel("Frequency count")
    ax_bar.set_xlabel(str(feature_name.name) + " type")
    ax_bar.set_title("Frequency Distribution of " + str(feature_name.name))

    # Right panel: the same counts as a table. The axes frame is hidden and the
    # bbox places the table to the right of the (invisible) axes.
    ax_table = fig.add_subplot(1, 2, 2)
    ax_table.axis('off')
    table = ax_table.table(
        cellText=counts_table.values,
        rowLabels=feature_values,
        bbox=[1, 0, 1, 1],
        colLabels=counts_table.columns,
    )
    # Fix the font size explicitly instead of letting matplotlib shrink it to fit.
    table.auto_set_font_size(False)
    table.set_fontsize(14)
    plt.show()

In [None]:
# Plot the frequency distribution of every discrete feature.
for column in ['sex', 'smoker', 'region', 'children']:
    feature = df[column]
    sorted_values = list(feature.value_counts().sort_index().index)
    show_frequency_distribution(feature, sorted_values)

In [None]:
# seaborn >= 0.12 accepts only `data` positionally, so x and y are passed by keyword
# (positional x/y raises a TypeError there).
sns.scatterplot(data=df, x='age', y='charges', hue='sex', alpha=0.5)
plt.title("Exploring relation between 'age' and 'charges'")
plt.show()

In [None]:
# seaborn >= 0.12 accepts only `data` positionally, so x and y are passed by keyword
# (positional x/y raises a TypeError there).
sns.scatterplot(data=df, x='bmi', y='charges', hue='sex', alpha=0.5)
plt.title("Exploring relation between 'bmi' and 'charges'")
plt.show()

**Data Modelling**

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder per column and keep each one, so the integer codes can
# later be mapped back to the original labels. A single shared encoder only
# remembers the classes of the last column it was fitted on.
# NOTE(review): the codes follow the sorted order of the labels, which carries no
# meaning for nominal columns; consider one-hot encoding for linear models.
label_encoders = {}
for column in ['sex', 'smoker', 'region']:
    labelencoder = LabelEncoder()
    df[column] = labelencoder.fit_transform(df[column])
    label_encoders[column] = labelencoder

df.head()

In [None]:
# Separate the target ('charges') from the predictor columns.
target_column = 'charges'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range and keep the original column names.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test-set minima/maxima influence the scaling. Fitting on
# the training split only would avoid this leakage.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_scaled = scaler.transform(X)
X = pd.DataFrame(data=X_scaled, columns=X.columns)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Looking for the most significant features to predict medical charges

In [None]:
from sklearn.linear_model import Ridge

# Fit a ridge regression on the training split and plot its coefficients,
# one horizontal bar per feature, as a rough feature ranking.
model = Ridge().fit(X_train, y_train)
importance = model.coef_
feat_importances = pd.Series(importance, index=X.columns)
feat_importances.plot.barh()
plt.show()

In [None]:
# Score every feature against the target with SelectKBest and f_regression (k=6).
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

bestfeatures = SelectKBest(score_func=f_regression, k=6)
fit = bestfeatures.fit(X_train, y_train)

# One row per feature: its name ('Specs') and its score ('Score').
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})

# Print the six highest-scoring features.
print(featureScores.nlargest(6, 'Score'))

The two feature-ranking methods above show that 'sex' and 'region' are not important for predicting the medical charges. Therefore, we will exclude them before building the final model.

In [None]:
# Remove the two low-ranked features from both splits.
excluded_features = ['sex', 'region']
X_train = X_train.drop(columns=excluded_features)
X_test = X_test.drop(columns=excluded_features)

# Building Regressor

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the selected features.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Evaluate on the held-out split.
mae, r2_value = (
    mean_absolute_error(y_test, y_pred_lr),
    r2_score(y_test, y_pred_lr),
)

print("*** Multiple Linear Regression ***")
print("Mean Absolute Error:", mae)
print("R^2 Value:", r2_value)

In [None]:
# Fit the data to a degree-3 polynomial linear regression model and check the accuracy
from sklearn.preprocessing import PolynomialFeatures
polyFeat = PolynomialFeatures(degree=3, include_bias=True)
polyTrainX = polyFeat.fit_transform(X_train)
# The test split is only transformed with the expansion fitted on the training
# data; a transformer must never be re-fitted on held-out data.
polyTestX = polyFeat.transform(X_test)
pr = LinearRegression()
pr.fit(polyTrainX, y_train)
y_pred_pr = pr.predict(polyTestX)

# Evaluate on the held-out split.
mae = mean_absolute_error(y_test, y_pred_pr)
r2_value = r2_score(y_test, y_pred_pr)

print("*** Polynomial Linear Regression ***")
print("Mean Absolute Error:", mae)
print("R^2 Value:", r2_value)

With the Polynomial Regressor, the R^2 score increases by around 10% compared with Multiple Linear Regression. Therefore, we select the Polynomial Regressor as the final model for this problem.

# Learning curve for both the models

In [None]:
from sklearn.model_selection import validation_curve, learning_curve

def draw_learning_curve(model, x, y, train_sizes=(50, 100, 300, 500, 700, 900), cv=10):
    """Plot mean train and cross-validation scores against training-set size.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``learning_curve``.
    x, y : array-like
        Features and target the curve is computed on.
    train_sizes : sequence, optional
        Training-set sizes forwarded to ``learning_curve``. The default keeps
        the sizes this notebook originally hard-coded.
    cv : int or cross-validation splitter, optional
        Forwarded to ``learning_curve``; defaults to 10.

    Returns
    -------
    None
        The curves are drawn on the current matplotlib axes. The caller adds
        the title and shows the figure.
    """
    sizes_used, train_scores, test_scores = learning_curve(
        model, x, y, train_sizes=list(train_sizes), cv=cv
    )
    # Scores come back with one row per training size; average across the folds.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.plot(sizes_used, train_scores_mean, color='blue', label='Train score')
    plt.plot(sizes_used, test_scores_mean, color='red', label='Cross-validation score')

    plt.legend(loc='best')
    plt.xlabel('Training size')
    plt.ylabel('score')

In [None]:
draw_learning_curve(lr, X_train, y_train)
plt.title("Learning curve for Multiple Linear Regressor")
# End with plt.show(), like the other plotting cells, so the cell does not echo the Text repr.
plt.show()

In [None]:
draw_learning_curve(pr, polyTrainX, y_train)
plt.title("Learning curve for Polynomial Regressor")
# End with plt.show(), like the other plotting cells, so the cell does not echo the Text repr.
plt.show()

In [None]:
# Compare the polynomial model's predictions with the observed charges on the test split.
predTest = pd.DataFrame({"prediction": y_pred_pr, "observed": y_test})
predicted, observed = predTest['prediction'], predTest['observed']

plt.scatter(predicted, observed)
plt.xlabel("Predicted Medical Charges")
plt.ylabel("Observed Medical Charges")
plt.title("Polynomial Regressor: Prediction Vs Actual Data")
plt.show()