In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('Dummy-Data.csv')

## Problem 1 
### Create Training dataset

In [3]:
def height_in_meters(val):
    
    val = str(val)
    feet = int(val[0])
    inches = int(val[1:])
    meters = feet * 0.3048
    meters += inches * 0.0254
    
    return meters

def BMI(ht, wt):

    return wt/ht

def quote(age, gender, BMI):

    if (age >= 18 and age <= 39) and (BMI < 17.49 or BMI > 38.5):
        if gender == 'Male':
            quote = '750 USD'
        else:
            quote = '675 USD'
        reason = 'Age is between 18 to 39 and BMI is either less than 17.49 or greater than 38.5'
        
        return pd.Series([quote, reason])
    
    elif (age >= 40 and age <= 59) and (BMI < 18.49 or BMI > 38.5):
        if gender == 'Male':
            quote = '1000 USD'
        else:
            quote = '900 USD'
        reason = 'Age is between 40 to 59 and BMI is either less than 18.49 or greater then 38.5'
        
        return pd.Series([quote, reason])
    
    elif (age >= 60) and (BMI <= 18.49 or BMI >= 45.5):
        if gender == 'Male':
            quote = '2000 USD'
        else:
            quote = '1800 USD'
        reason = 'Age is greater than 60 and BMI is either less than 18.49 or greater than 38.5'
        
        return pd.Series([quote, reason])
    
    else:
        if gender == 'Male':
            quote = '500 USD'
        else:
            quote = '450 USD'
        reason = 'BMI is in right range'
        
        return pd.Series([quote, reason])
        
df['Ht_m'] = df['Ht'].apply(lambda x : height_in_meters(x)) #Converting height to meters
df['Wt_Kg'] = df['Wt'].apply(lambda x : x/2.2046) #Converting lb to kg
df['BMI'] = df.apply(lambda x : BMI(x['Ht_m'], x['Wt_Kg']), axis = 1)
df[['quote', 'reason']] = df.apply(lambda x : quote(x['Ins_Age'], x['Ins_Gender'], x['BMI']), axis = 1)

In [4]:
df

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate,Ht_m,Wt_Kg,BMI,quote,reason
0,56372,31,Male,510,185,,1.7780,83.915450,47.196541,750 USD,Age is between 18 to 39 and BMI is either less...
1,34565,35,Male,510,205,,1.7780,92.987390,52.298870,750 USD,Age is between 18 to 39 and BMI is either less...
2,57732,45,Female,510,125,,1.7780,56.699628,31.889555,450 USD,BMI is in right range
3,87324,38,Male,503,175,,1.6002,79.379479,49.605974,750 USD,Age is between 18 to 39 and BMI is either less...
4,12323,39,Female,600,252,,1.8288,114.306450,62.503527,675 USD,Age is between 18 to 39 and BMI is either less...
...,...,...,...,...,...,...,...,...,...,...,...
95,99511,35,Male,510,275,,1.7780,124.739182,70.157020,750 USD,Age is between 18 to 39 and BMI is either less...
96,23781,27,Male,604,145,,1.9304,65.771569,34.071471,500 USD,BMI is in right range
97,99517,35,Female,507,190,,1.7018,86.183435,50.642517,675 USD,Age is between 18 to 39 and BMI is either less...
98,99520,38,Female,510,144,,1.7780,65.317972,36.736767,450 USD,BMI is in right range


## Problem 2

In [5]:
model_df = df[['Ins_Age', 'Ins_Gender', 'Ht', 'Wt', 'BMI']]

In [6]:
model_df

Unnamed: 0,Ins_Age,Ins_Gender,Ht,Wt,BMI
0,31,Male,510,185,47.196541
1,35,Male,510,205,52.298870
2,45,Female,510,125,31.889555
3,38,Male,503,175,49.605974
4,39,Female,600,252,62.503527
...,...,...,...,...,...
95,35,Male,510,275,70.157020
96,27,Male,604,145,34.071471
97,35,Female,507,190,50.642517
98,38,Female,510,144,36.736767


In [7]:
categorical_variables = ['Ins_Gender']
numerical_variables = ['Ins_Age', 'Ht', 'Wt']
target_varabile = 'BMI'

In [8]:
#Generating train set and test set
X = model_df.drop(columns=['BMI'])
y = model_df['BMI']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## ElasticNet Regressor

In [9]:
#Preprocessing data
#Scaling numerical features using standard scalar
numeric_transformer = StandardScaler()
#One hot encoding categorical features
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical_variables),
                                               ("cat", categorical_transformer, categorical_variables)])

regressor = ElasticNet()

pipe_elastic = Pipeline(steps=[("preprocessor", preprocessor),
                       ("regressor", regressor)])

alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
l1_ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid_elastic = dict(regressor__alpha=alpha,
                          regressor__l1_ratio=l1_ratio)

grid_elastic = GridSearchCV(pipe_elastic,
                    param_grid=param_grid_elastic,
                    scoring='r2',
                    cv=4,
                    verbose=1,
                    n_jobs=-1,
                    refit=True)

grid_elastic.fit(x_train, y_train)

Fitting 4 folds for each of 63 candidates, totalling 252 fits


In [10]:
train_pred = grid_elastic.predict(x_train)
test_pred = grid_elastic.predict(x_test)

train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

print('R-Squared score on training set : {}'.format(train_r2))
print('R-Squared score on test set : {}'.format(test_r2))

R-Squared score on training set : 0.9428557917103405
R-Squared score on test set : 0.9449857598086919


## AdaBoost Regressor

In [11]:
adaboost = AdaBoostRegressor()
pipe_adaboost = Pipeline(steps=[("preprocessor", preprocessor),
                                ("regressor", adaboost)])

param_grid_adaboost = dict(regressor__n_estimators=[500,1000,2000],
                           regressor__learning_rate=[.001,0.01,.1],
                           regressor__random_state=[1])

grid_ada = GridSearchCV(pipe_adaboost,
                    param_grid=param_grid_adaboost,
                    scoring='r2',
                    cv=4,
                    verbose=1,
                    n_jobs=-1,
                    refit=True)

grid_ada.fit(x_train, y_train)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


In [12]:
train_pred = grid_ada.predict(x_train)
test_pred = grid_ada.predict(x_test)

train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

print('R-Squared score on training set : {}'.format(train_r2))
print('R-Squared score on test set : {}'.format(test_r2))

R-Squared score on training set : 0.9782767244406141
R-Squared score on test set : 0.9180161703739343


# K Neighbors Regressor

In [13]:
kn_regressor = KNeighborsRegressor()
pipe_kn = Pipeline(steps=[("preprocessor", preprocessor),
                          ("regressor", kn_regressor)])

param_grid_bayesian = dict(regressor__n_neighbors=[2, 5, 10, 20],
                           regressor__weights=['uniform', 'distance'],
                           regressor__algorithm=['kd_tree', 'ball_tree'],
                           regressor__leaf_size=[15, 20, 30, 40, 45],
                           regressor__p=[1, 2])

grid_kn = GridSearchCV(pipe_kn,
                    param_grid=param_grid_bayesian,
                    scoring='r2',
                    cv=4,
                    verbose=1,
                    n_jobs=-1,
                    refit=True)

grid_kn.fit(x_train, y_train)

Fitting 4 folds for each of 160 candidates, totalling 640 fits


In [14]:
train_pred = grid_kn.predict(x_train)
test_pred = grid_kn.predict(x_test)

train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

print('R-Squared score on training set : {}'.format(train_r2))
print('R-Squared score on test set : {}'.format(test_r2))

R-Squared score on training set : 1.0
R-Squared score on test set : 0.6711394390842744


In [27]:
import joblib
joblib.dump(grid_elastic.best_estimator_, 'model.pkl')

['model.pkl']

In [28]:
loaded_model = joblib.load('model.pkl')

In [52]:
instance = {'Ins_Age' : [20],
                'Ins_Gender' : ['Male'],
                'Ht' : [507],
                'Wt' : [45]}

In [64]:
c = pd.DataFrame.from_dict(instance)

In [65]:
loaded_model.predict(c)

array([14.79990155])