In [47]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures

In [48]:
# Load the insurance data from 'insurance.csv' (path relative to the working
# directory) and preview the first five rows.
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [49]:
# Remove exact duplicate rows. The name is rebound to the de-duplicated frame
# rather than mutated with `inplace=True`, so the cell has no hidden in-place
# side effect on the object created by read_csv.
df = df.drop_duplicates()

In [50]:
# IQR-based outlier check for a single column.

def outliers(df, col):
    """Report 1.5 * IQR whisker statistics for one column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to inspect.
    col : str
        Label of the column to inspect.

    Returns
    -------
    tuple
        ``(lower_whisker, upper_whisker, num_of_outliers)``: the whiskers are
        ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; the count is the number of
        values lying strictly below the lower or strictly above the upper one.

    The three values are also printed, one per line.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - (1.5 * spread)
    high_fence = third_quartile + (1.5 * spread)

    # Missing values compare False on both sides, so they are never counted.
    beyond_fences = (values < low_fence) | (values > high_fence)
    outlier_count = int(beyond_fences.sum())

    print(f'Lower Whisker = {low_fence}')
    print(f'Upper Whisker = {high_fence}')
    print(f'Number of Outliers = {outlier_count}')
    return (low_fence, high_fence, outlier_count)

In [51]:
# IQR whiskers and outlier count for 'bmi'; lw / uw are the bounds used to
# filter the frame afterwards.
lw, uw, n_outliers = outliers(df, 'bmi')

Lower Whisker = 13.674999999999994
Upper Whisker = 47.31500000000001
Number of Outliers = 9


In [52]:
# Keep only the rows whose bmi lies within [lw, uw] (both bounds included).
# Rows with a missing bmi fail the check and are dropped as well.
bmi_within_whiskers = df['bmi'].between(lw, uw)
df = df[bmi_within_whiskers]

In [53]:
# Target is the 'charges' column; every other column is a feature.
y = df['charges']
X = df.drop(columns='charges')

In [54]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible. train_test_split is already imported in the notebook's first
# cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

In [55]:
# Sanity-check the split sizes; the '\n' after y_train leaves a blank line
# between the train and test figures.
print(
    f'Shape of X_train: {X_train.shape}',
    f'Shape of y_train: {y_train.shape}\n',
    f'Shape of X_test: {X_test.shape}',
    f'Shape of y_test: {y_test.shape}',
    sep='\n',
)

Shape of X_train: (1062, 6)
Shape of y_train: (1062,)

Shape of X_test: (266, 6)
Shape of y_test: (266,)


In [56]:
# Natural-log transform of both targets; the model is trained and scored on
# this log scale.
y_train_log, y_test_log = (np.log(target) for target in (y_train, y_test))

In [57]:
# Display the training features (pandas shows a truncated view of the frame).
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
569,48,male,40.565,2,yes,northwest
133,19,male,25.555,0,no,northwest
867,57,male,43.700,1,no,southwest
564,18,female,32.120,2,no,southeast
335,64,male,34.500,0,no,southwest
...,...,...,...,...,...,...
146,46,male,30.495,3,yes,northwest
345,34,female,29.260,3,no,southeast
193,56,female,26.600,1,no,northwest
906,27,male,32.585,3,no,northeast


# Defining Column Transformers

* Filling null values in 'age', 'children' using median.
* Filling null values in 'bmi' using mean.
* Filling null values in 'sex', 'smoker', 'region' using mode.

In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Step 1 Handling Missing values

In [59]:
# Column groups, one per imputation strategy, listed in the same order in
# which the imputation ColumnTransformer consumes them.
mode_col = ['smoker', 'sex', 'region']   # categorical -> most frequent value
median_col = ['children', 'age']         # numeric -> median
mean_col = ['bmi']                       # numeric -> mean

In [60]:
# One SimpleImputer per strategy. The output columns follow the order of this
# list: mode columns first, then median columns, then mean columns.
imputation_steps = [
    ('mode_imputer', SimpleImputer(strategy='most_frequent'), mode_col),
    ('median_imputer', SimpleImputer(strategy='median'), median_col),
    ('mean_imputer', SimpleImputer(strategy='mean'), mean_col),
]
ct_imputation = ColumnTransformer(transformers=imputation_steps, remainder='passthrough')

In [61]:
# Preview the imputed training array. Columns come out in transformer order:
# smoker, sex, region, children, age, bmi.
ct_imputation.fit_transform(X_train)

array([['yes', 'male', 'northwest', 2.0, 48.0, 40.565],
       ['no', 'male', 'northwest', 0.0, 19.0, 25.555],
       ['no', 'male', 'southwest', 1.0, 57.0, 43.7],
       ...,
       ['no', 'female', 'northwest', 1.0, 56.0, 26.6],
       ['no', 'male', 'northeast', 3.0, 27.0, 32.585],
       ['yes', 'male', 'southeast', 0.0, 61.0, 35.86]], dtype=object)

### Step 2 Handling Categorical Data

In [62]:
# Column positions refer to the imputed array: 0 = smoker, 1 = sex, 2 = region.
encoding_steps = [
    # smoker / sex -> integer codes; a category not seen during fit becomes -1.
    ('ord_enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [0, 1]),
    # region -> dense one-hot columns; a category not seen during fit is ignored.
    ('ohe_enc', OneHotEncoder(handle_unknown='ignore', sparse_output=False), [2]),
]
ct_encoding = ColumnTransformer(transformers=encoding_steps, remainder='passthrough')

In [63]:
# Preview the encoded training array (imputation first, then encoding).
imputed_train = ct_imputation.fit_transform(X_train)
ct_encoding.fit_transform(imputed_train)

array([[1.0, 1.0, 0.0, ..., 2.0, 48.0, 40.565],
       [0.0, 1.0, 0.0, ..., 0.0, 19.0, 25.555],
       [0.0, 1.0, 0.0, ..., 1.0, 57.0, 43.7],
       ...,
       [0.0, 0.0, 0.0, ..., 1.0, 56.0, 26.6],
       [0.0, 1.0, 1.0, ..., 3.0, 27.0, 32.585],
       [1.0, 1.0, 0.0, ..., 0.0, 61.0, 35.86]], dtype=object)

### Step 3 Introducing Polynomial Features
* We shall add polynomial features of only 3 columns ['smoker', 'age', 'bmi']
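* In the encoded array these columns sit at indexes [0, 7, 8] (0 = smoker, 1 = sex, 2–5 = one-hot region, 6 = children, 7 = age, 8 = bmi).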

In [64]:
# Expand only positions 0, 7 and 8 of the encoded array into polynomial terms
# of degree 1 through 4 with no constant (bias) column; every other column is
# passed through unchanged.
poly_expander = PolynomialFeatures(degree=(1, 4), include_bias=False)
ct_poly = ColumnTransformer(
    transformers=[('poly_features', poly_expander, [0, 7, 8])],
    remainder='passthrough',
)

In [65]:
# Preview the array after all three preprocessing stages, applied in order.
imputed_train = ct_imputation.fit_transform(X_train)
encoded_train = ct_encoding.fit_transform(imputed_train)
ct_poly.fit_transform(encoded_train)

array([[1.0, 48.0, 40.565, ..., 0.0, 0.0, 2.0],
       [0.0, 19.0, 25.555, ..., 0.0, 0.0, 0.0],
       [0.0, 57.0, 43.7, ..., 0.0, 1.0, 1.0],
       ...,
       [0.0, 56.0, 26.6, ..., 0.0, 0.0, 1.0],
       [0.0, 27.0, 32.585, ..., 0.0, 0.0, 3.0],
       [1.0, 61.0, 35.86, ..., 1.0, 0.0, 0.0]], dtype=object)

In [66]:
# Shape of the fully preprocessed training array (rows, features).
imputed_train = ct_imputation.fit_transform(X_train)
encoded_train = ct_encoding.fit_transform(imputed_train)
poly_train = ct_poly.fit_transform(encoded_train)
poly_train.shape

(1062, 40)

### Step 4 Defining ML Model

In [67]:
# Ordinary least-squares regressor with default settings; used as the final
# estimator of the pipeline.
model = LinearRegression()

# Creating Pipeline

In [68]:
# Chain every preprocessing stage and the regressor so that a single
# fit / predict call runs them in this order.
pipeline_steps = [
    ('imputation', ct_imputation),
    ('encoding', ct_encoding),
    ('poly', ct_poly),
    ('scaler', StandardScaler()),
    ('model_deploy', model),
]
pipe = Pipeline(steps=pipeline_steps)

In [69]:
# Fit the whole pipeline on the raw training features. The target is the
# log-transformed charges, so the model learns and predicts on the log scale.
pipe.fit(X_train,y_train_log)

# Predicting Values Using Model

In [70]:
# Predictions for both splits; values are on the log scale the pipeline was
# fitted with.
y_pred_train, y_pred_test = (pipe.predict(features) for features in (X_train, X_test))

In [71]:
# R2 of the predictions against the log-transformed targets, i.e. the scores
# are measured on the log scale.
r2_score_train = r2_score(y_true=y_train_log, y_pred=y_pred_train)
r2_score_test = r2_score(y_true=y_test_log, y_pred=y_pred_test)
print(
    f'Train R2 Score = {r2_score_train}',
    f'Test R2 Score = {r2_score_test}',
    sep='\n',
)

Train R2 Score = 0.8243187373582224
Test R2 Score = 0.8523639868157293


# Exporting Our Pipeline Model as a Joblib File

In [72]:
# Persist the fitted pipeline (preprocessing + regressor) to a joblib file.
# joblib is imported with the other dependencies in the notebook's first cell
# instead of mid-notebook.
joblib.dump(pipe, 'insurance_prediction_1.joblib')

['insurance_prediction_1.joblib']

In [73]:
# Reload the pipeline from the joblib file written above into a new object.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files from a trusted source.
new_model = joblib.load('insurance_prediction_1.joblib')

In [74]:
# Predict with the reloaded pipeline. The pipeline was fitted on
# log-transformed charges, so these values are log-charges.
new_model.predict(X_test)

array([ 9.23404255,  8.69620076,  9.60220661,  8.69864216, 10.21060505,
        8.46695271, 10.26626911,  9.45865193,  9.58340779,  7.82950154,
        7.97940388,  7.88321247,  9.56924763, 10.86685505,  7.85855427,
        7.83877888,  9.44327107,  9.33072224,  7.79971638,  8.51871052,
        8.32998982,  8.47964802,  9.58438435, 10.58365193, 10.02017536,
        9.25577107,  9.19302693, 10.66470661,  9.16031208,  9.09537068,
        9.11783161,  7.68472615,  9.04776326, 10.0973238 ,  8.54703083,
        8.93619099,  9.98892536, 10.59927693,  9.73892536,  8.50796833,
        8.80997029,  9.81216755,  8.51309529,  9.5714449 ,  9.78189411,
        9.45035115,  9.47134724,  9.97818318, 10.83072224,  8.11612263,
        8.53580036,  8.06509724,  9.42178669,  8.47549763,  9.39615193,
        9.51480427,  9.50088826,  9.32803669,  9.58975544, 10.38443318,
       10.42349568,  7.75552693,  8.45181599, 10.02017536,  9.13199177,
        8.70059529,  8.55142536,  8.97671833,  7.82876911, 10.33

In [75]:
# One hand-made record with bmi and region left missing, to check that the
# reloaded pipeline fills the gaps itself. Values follow the column order of
# X_train: age, sex, bmi, children, smoker, region.
sample_values = [44, 'male', np.nan, 2, 'no', np.nan]
temp_df = pd.DataFrame([sample_values], columns=X_train.columns)
new_model.predict(temp_df)

array([9.01858768])