In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import TransformedTargetRegressor

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### EDA & Feature Selection

In [2]:
df_base = pd.read_csv("Dummy-Data.csv")
df_base.head()

Unnamed: 0,AppID,Ins_Age,Ins_Gender,Ht,Wt,IssueDate
0,56372,31,Male,510,185,
1,34565,35,Male,510,205,
2,57732,45,Female,510,125,
3,87324,38,Male,503,175,
4,12323,39,Female,600,252,


#### Renaming the feature names for better consistency

In [3]:
df_base.rename(columns = {'AppID':'app_id', 
                'Ins_Age':'age', 
                'Ins_Gender':'gender', 
                'Ht': 'height',
                'Wt':'weight',
               'IssueDate':'issue_date'}, inplace=True)

In [4]:
df_base.describe()

Unnamed: 0,app_id,age,height,weight,issue_date
count,100.0,100.0,100.0,100.0,0.0
mean,79455.57,37.32,527.72,165.15,
std,31123.726942,12.103827,39.482787,27.66224,
min,12123.0,19.0,500.0,110.0,
25%,64296.0,27.75,506.0,145.0,
50%,99444.5,36.0,509.0,170.0,
75%,99917.0,47.0,511.0,180.0,
max,100345.0,59.0,605.0,275.0,


In [5]:
df_base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   app_id      100 non-null    int64  
 1   age         100 non-null    int64  
 2   gender      100 non-null    object 
 3   height      100 non-null    int64  
 4   weight      100 non-null    int64  
 5   issue_date  0 non-null      float64
dtypes: float64(1), int64(4), object(1)
memory usage: 4.8+ KB


#### Based on the info above, none of the no missing avlues, except issue_date. issue_date is empty for all instances, which doesnt help to identify the age of the instance(s) or the data. so, dropping this feature

In [6]:
df_base.drop(["issue_date"], axis=1, inplace=True)

In [7]:
df_base.tail()

Unnamed: 0,app_id,age,gender,height,weight
95,99511,35,Male,510,275
96,23781,27,Male,604,145
97,99517,35,Female,507,190
98,99520,38,Female,510,144
99,99516,52,Female,510,125


In [8]:
df_base["app_id"].duplicated()

0     False
1     False
2     False
3     False
4     False
      ...  
95     True
96    False
97     True
98     True
99     True
Name: app_id, Length: 100, dtype: bool

#### Here is the list of dupplicate rows based on the key app_id, whihc may lead to data leaks in the pipeline. Duplicate rows to be removed.

In [9]:
df_base[df_base.duplicated(['app_id'], keep='first')]

Unnamed: 0,app_id,age,gender,height,weight
33,93587,19,Male,601,180
54,99917,26,Male,503,180
55,99918,47,Male,602,160
56,99946,24,Male,604,180
58,99947,19,Male,510,150
61,99875,49,Male,508,160
62,99917,30,Male,508,160
63,99918,21,Male,503,160
64,99946,40,Male,503,180
65,99947,47,Male,602,120


#### Drop the duplicate rows based on app_id feature

In [10]:
df_base = df_base.drop_duplicates(subset=['app_id'])

#### No duplicates now

In [11]:
df_base[df_base.duplicated(['app_id'], keep='first')]

Unnamed: 0,app_id,age,gender,height,weight


In [12]:
df_base["bmi"] = 0
df_base.head()

Unnamed: 0,app_id,age,gender,height,weight,bmi
0,56372,31,Male,510,185,0
1,34565,35,Male,510,205,0
2,57732,45,Female,510,125,0
3,87324,38,Male,503,175,0
4,12323,39,Female,600,252,0


#### Checking the value of height in the dataset -  minimum & the maximum 

In [13]:
df_base['height'].min()

500

In [14]:
df_base['height'].max()

604

#### Checking the value of weight in the dataset - minimum & the maximum

In [15]:
print(df_base['weight'].min())
print(df_base['weight'].max())

110
252


In [16]:
df_base.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76 entries, 0 to 96
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   app_id  76 non-null     int64 
 1   age     76 non-null     int64 
 2   gender  76 non-null     object
 3   height  76 non-null     int64 
 4   weight  76 non-null     int64 
 5   bmi     76 non-null     int64 
dtypes: int64(5), object(1)
memory usage: 4.2+ KB


In [17]:
seed = 45

X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(df_base.drop(columns=['bmi']), 
                                                    df_base['bmi'], 
                                                    test_size=.2, 
                                                    random_state=seed)

In [18]:
df_base.isnull().values.any()

False

In [19]:
X_train_p.head()

Unnamed: 0,app_id,age,gender,height,weight
20,100336,36,Male,508,180
28,13545,19,Female,601,170
53,99947,57,Male,503,180
21,93580,45,Male,508,165
41,99498,19,Male,507,180


Since there is no missing values or null data, Transformers is used convert Pounds to Kg, Inch to Meter Square & calculate BMI.

Later on, if we get any mixed metrics such as kg, pound, ounces, inches, feet, meter from different datasources, these trasformers can be extended or updated to support it.

In [20]:
class HeightTransformer(BaseEstimator, TransformerMixin):
    # initializer 
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns
       
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        # return the dataframe with the specified features
        X_c = X.copy()
        X_c[self.columns] =  (X[self.columns] / 100) * 0.3048
        return X_c
    
class WeightTransformer(BaseEstimator, TransformerMixin):
    # initializer 
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns
        
    def fit(self, X, y = None): 
        return self
    
    def transform(self, X, y = None):
        # return the dataframe 
        X_c = X.copy()
        X_c[self.columns] =  X[self.columns] * 0.453592
        return X_c
    
class BmiTransformer(BaseEstimator, TransformerMixin):
    # initializer 
    column_bmi = "bmi"
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns
        
    def fit(self, X, y = None): 
        return self
    
    def transform(self, X, y = None):
        # return the dataframe 
        X_c = X.copy()
        # bmi = weight in kg/ height in meter square
        X_c[self.column_bmi] = X_c[self.columns[1]] / X_c[self.columns[0]] 
        print(X_c)
        return X_c

In [21]:
numeric_features = ['height', 'weight', 'age', 'app_id']
metrics_transformer = Pipeline(
                        steps=[("transforms height", HeightTransformer(numeric_features[0])),
                               ("transforms weight", WeightTransformer(numeric_features[1]))]
                        )

categorical_features = ['gender']
cat_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("metrics", metrics_transformer, numeric_features),
    ]
)

cat_preprocessor = ColumnTransformer(
    transformers=[
        ("gender", cat_transformer, categorical_features),
    ]
)

# pipeine for numerical features
metrics_pipeline = Pipeline(
    steps=[("preprocessor", preprocessor)]
)

#pipline for categorical features
cat_pipeline = Pipeline(
    steps=[("cat preprocessor", cat_preprocessor)]
)


In [22]:
u = FeatureUnion([('categorical',cat_pipeline),
                 ('numerics', metrics_pipeline)])

In [23]:
train_data = u.fit_transform(X_train_p)
test_data = u.transform(X_test_p)

In [24]:

cat_columns = cat_preprocessor.get_feature_names()
print(cat_columns)
print(numeric_features)
columns = np.concatenate((cat_columns, numeric_features))
print(columns)

['gender__x0_Female', 'gender__x0_Male']
['height', 'weight', 'age', 'app_id']
['gender__x0_Female' 'gender__x0_Male' 'height' 'weight' 'age' 'app_id']




In [25]:
df_train = pd.DataFrame(train_data, columns=columns)
df_test = pd.DataFrame(test_data, columns=columns)
df_train['bmi'] = 0
df_test['bmi'] = 0

In [26]:
df_train.head()

Unnamed: 0,gender__x0_Female,gender__x0_Male,height,weight,age,app_id,bmi
0,0.0,1.0,1.548384,81.64656,36.0,100336.0,0
1,1.0,0.0,1.831848,77.11064,19.0,13545.0,0
2,0.0,1.0,1.533144,81.64656,57.0,99947.0,0
3,0.0,1.0,1.548384,74.84268,45.0,93580.0,0
4,0.0,1.0,1.545336,81.64656,19.0,99498.0,0


In [27]:
df_test.head()

Unnamed: 0,gender__x0_Female,gender__x0_Male,height,weight,age,app_id,bmi
0,1.0,0.0,1.557528,81.64656,19.0,99514.0,0
1,0.0,1.0,1.533144,81.64656,31.0,100335.0,0
2,1.0,0.0,1.55448,56.699,45.0,57732.0,0
3,1.0,0.0,1.542288,54.43104,46.0,99422.0,0
4,1.0,0.0,1.840992,68.0388,24.0,13134.0,0


In [28]:

target_pipeline = Pipeline(
                    steps=[("calculates bmi", BmiTransformer(numeric_features))]
                )
df_train = target_pipeline.fit_transform(df_train)
print("here")
df_test = target_pipeline.transform(df_test)
df_test.head()

    gender__x0_Female  gender__x0_Male    height      weight   age    app_id  \
0                 0.0              1.0  1.548384   81.646560  36.0  100336.0   
1                 1.0              0.0  1.831848   77.110640  19.0   13545.0   
2                 0.0              1.0  1.533144   81.646560  57.0   99947.0   
3                 0.0              1.0  1.548384   74.842680  45.0   93580.0   
4                 0.0              1.0  1.545336   81.646560  19.0   99498.0   
5                 0.0              1.0  1.524000   65.770840  19.0   93587.0   
6                 1.0              0.0  1.554480   77.110640  58.0   99517.0   
7                 0.0              1.0  1.542288   68.038800  59.0  100342.0   
8                 0.0              1.0  1.545336   77.110640  48.0   87432.0   
9                 1.0              0.0  1.533144   81.646560  34.0  100334.0   
10                0.0              1.0  1.831848   63.502880  40.0  100344.0   
11                1.0              0.0  

Unnamed: 0,gender__x0_Female,gender__x0_Male,height,weight,age,app_id,bmi
0,1.0,0.0,1.557528,81.64656,19.0,99514.0,52.420605
1,0.0,1.0,1.533144,81.64656,31.0,100335.0,53.254332
2,1.0,0.0,1.55448,56.699,45.0,57732.0,36.474577
3,1.0,0.0,1.542288,54.43104,46.0,99422.0,35.292397
4,1.0,0.0,1.840992,68.0388,24.0,13134.0,36.957684


In [29]:
df_train.head()

Unnamed: 0,gender__x0_Female,gender__x0_Male,height,weight,age,app_id,bmi
0,0.0,1.0,1.548384,81.64656,36.0,100336.0,52.730175
1,1.0,0.0,1.831848,77.11064,19.0,13545.0,42.094453
2,0.0,1.0,1.533144,81.64656,57.0,99947.0,53.254332
3,0.0,1.0,1.548384,74.84268,45.0,93580.0,48.335994
4,0.0,1.0,1.545336,81.64656,19.0,99498.0,52.83418


In [30]:
df_base.head()

Unnamed: 0,app_id,age,gender,height,weight,bmi
0,56372,31,Male,510,185,0
1,34565,35,Male,510,205,0
2,57732,45,Female,510,125,0
3,87324,38,Male,503,175,0
4,12323,39,Female,600,252,0


In [31]:
df_train.to_csv("train.csv")
df_test.to_csv("test.csv")

In [32]:
X_train = df_train.drop(columns=["bmi", "app_id"])
X_test = df_test.drop(columns=["bmi", "app_id"])
y_train = df_train["bmi"]
y_test = df_test["bmi"]

### Model Selection,Prediction & Metrics

In [33]:
alpha = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(alpha=alpha)

pipe = Pipeline([
    ('standard scaler', StandardScaler()),
   # ('GS LR', GridSearchCV(LinearRegression(), param_grid={})),
    ('GS Lasso', GridSearchCV(estimator=Lasso(), param_grid=param_grid, scoring='r2', verbose=2, n_jobs=-1))
])
pipe.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


Pipeline(steps=[('standard scaler', StandardScaler()),
                ('GS Lasso',
                 GridSearchCV(estimator=Lasso(), n_jobs=-1,
                              param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10,
                                                    100]},
                              scoring='r2', verbose=2))])

In [34]:
# Predict training data
y_train_pred = pipe.predict(X_train)
print(f"Predictions on training data: {y_train_pred}")

# Predict test data
y_test_pred = pipe.predict(X_test)
print(f"Predictions on test data: {y_test_pred}")

Predictions on training data: [52.66053204 42.22063526 52.89278207 48.48802844 52.87588354 43.86673127
 49.18187804 44.43551284 49.91097229 52.6663354  34.26141578 35.58424774
 52.8840917  53.90747742 44.54681045 60.54032212 44.93091669 52.9000258
 52.81238823 52.63614867 45.14908048 32.94622671 37.58226595 50.3266309
 64.89098785 52.22645981 30.22435206 52.50915806 48.06240548 41.9178873
 52.10270666 36.82346718 45.04550881 61.0795085  52.65280609 64.55891323
 32.45924777 43.10552771 46.76693446 50.06863185 48.16862305 40.39604433
 42.20453477 49.4370648  39.84865506 40.4863278  38.03243867 52.50891695
 45.30866262 28.49577324 52.09806569 29.09893715 51.10214831 42.55257091
 51.78662735 57.98413529 44.57071161 51.67722268 52.25929245 35.49218595]
Predictions on test data: [52.14654819 53.09992542 36.98304932 35.92779539 36.47333163 48.21377946
 44.73939162 46.679538   36.47268302 52.47083565 59.34334272 44.35463678
 52.74841071 57.9527493  52.22645981 51.33899618]


In [35]:
y_test

0     52.420605
1     53.254332
2     36.474577
3     35.292397
4     36.957684
5     48.335994
6     44.570598
7     46.871267
8     35.222787
9     52.978593
10    59.818306
11    43.683837
12    52.834180
13    58.473978
14    52.523390
15    51.672317
Name: bmi, dtype: float64

#### MSE on Test data

In [36]:
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_test_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_test_pred))

Mean squared error: 0.25
Coefficient of determination: 1.00


#### MSE on Train data

In [37]:
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_train, y_train_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_train, y_train_pred))

Mean squared error: 0.30
Coefficient of determination: 1.00


### Calculating cost

#### Storing predicted bmi, actualpredicted bmi,  app_id to the csv files named predicted_bmi_train.csv & predicted_bmi_test.csv for the analysis & calculating cost

In [38]:

pd_pred_train = pd.DataFrame(data=X_train, columns=X_train.columns) 
pd_pred_test = pd.DataFrame(data=X_test, columns=X_train.columns) 

# bmi calcuated based on existing train & test data 
pd_pred_train["bmi"] = y_train 
pd_pred_test["bmi"] = y_test

#bmi predicted by the model on train & test data
pd_pred_train["pred_bmi"] = y_train_pred
pd_pred_test["pred_bmi"] = y_test_pred

#remapping the app_id to track the applicant
pd_pred_train["app_id"] = df_train["app_id"]
pd_pred_test["app_id"] = df_test["app_id"]

pd_pred_train.to_csv("predicted_bmi_train.csv")
pd_pred_test.to_csv("predicted_bmi_test.csv")



In [39]:
pd_pred_train.head()

Unnamed: 0,gender__x0_Female,gender__x0_Male,height,weight,age,bmi,pred_bmi,app_id
0,0.0,1.0,1.548384,81.64656,36.0,52.730175,52.660532,100336.0
1,1.0,0.0,1.831848,77.11064,19.0,42.094453,42.220635,13545.0
2,0.0,1.0,1.533144,81.64656,57.0,53.254332,52.892782,99947.0
3,0.0,1.0,1.548384,74.84268,45.0,48.335994,48.488028,93580.0
4,0.0,1.0,1.545336,81.64656,19.0,52.83418,52.875884,99498.0


In [40]:
# re-mapping one-hot encoded sex feature to the keywords(m,f) 
def sex(flag):
    if flag == 1.0:
        return 'm'
    else:
        return 'f'
pd_pred_train["sex"] = pd_pred_train["gender__x0_Male"].apply(lambda x: sex(x))
pd_pred_test["sex"] = pd_pred_test["gender__x0_Male"].apply(lambda x: sex(x))

In [41]:
pd_pred_train.head()

Unnamed: 0,gender__x0_Female,gender__x0_Male,height,weight,age,bmi,pred_bmi,app_id,sex
0,0.0,1.0,1.548384,81.64656,36.0,52.730175,52.660532,100336.0,m
1,1.0,0.0,1.831848,77.11064,19.0,42.094453,42.220635,13545.0,f
2,0.0,1.0,1.533144,81.64656,57.0,53.254332,52.892782,99947.0,m
3,0.0,1.0,1.548384,74.84268,45.0,48.335994,48.488028,93580.0,m
4,0.0,1.0,1.545336,81.64656,19.0,52.83418,52.875884,99498.0,m


In [42]:
   
def cal_cost(age, bmi, sex):
    '''
        age - age of the member
        bmi - body mass index
        sex - m or f or any
    
        If Age is between 18 and 39 and BMI less than 17.49 or greater than 38.5, 
        provide quote as 750 USD 

        If Age is between 40 and 59 and BMI less than 18.49 or greater than 38.5, 
        provide quote as 1000 USD

         If Age is greater than 60 and BMI less than 18.49 or BMI greater than 45.5, 
        provide quote as 2000 USD
        
         Anything other than the above condition, give a quote of 500 USD and give 
        reason as “BMI is in right range”.
        
        If Gender is Female, give a 10% discount on above quoted price
        
        returns the cost 
    '''
    cost = 0
    discount_f = 10
    
    if age in range(18, 39) and (bmi < 17.49 or bmi > 38.5):
        cost = 750
    elif age in range(40, 59) and (bmi < 18.49 or bmi > 38.5):
        cost = 1000
    elif 60 <= age and (bmi < 18.49 or bmi > 45.5):
        cost = 2000
    else:
        cost = 500
        
    if sex == "f":
        discount = (cost * discount_f / 100)
        cost = cost - discount
        
    return cost
        

#### Calcuating the cost based on the calcuated & the predicted bmi. This will help to asses the price difference between the actual cost & the predicted cost

##### cost column - actual cost
##### pred_cost - predicted cost
##### bmi - actual bmi
##### pred_bmi - predicted bmi

In [43]:
pd_pred_train["cost"] = pd_pred_train.apply(lambda x: cal_cost(x.age, x.bmi, x.sex), axis=1)
pd_pred_train["pred_cost"] =  pd_pred_train.apply(lambda x: cal_cost(x.age, x.pred_bmi, x.sex), axis=1)

pd_pred_test["cost"] = pd_pred_test.apply(lambda x: cal_cost(x.age, x.bmi, x.sex), axis=1)
pd_pred_test["pred_cost"] =  pd_pred_test.apply(lambda x: cal_cost(x.age, x.pred_bmi, x.sex), axis=1)

In [44]:
pd_pred_train.head()

Unnamed: 0,gender__x0_Female,gender__x0_Male,height,weight,age,bmi,pred_bmi,app_id,sex,cost,pred_cost
0,0.0,1.0,1.548384,81.64656,36.0,52.730175,52.660532,100336.0,m,750.0,750.0
1,1.0,0.0,1.831848,77.11064,19.0,42.094453,42.220635,13545.0,f,675.0,675.0
2,0.0,1.0,1.533144,81.64656,57.0,53.254332,52.892782,99947.0,m,1000.0,1000.0
3,0.0,1.0,1.548384,74.84268,45.0,48.335994,48.488028,93580.0,m,1000.0,1000.0
4,0.0,1.0,1.545336,81.64656,19.0,52.83418,52.875884,99498.0,m,750.0,750.0


In [45]:
# minimum bmi value
pd_pred_train['pred_bmi'].min()

28.49577324244687

In [46]:
# max bmi value
pd_pred_train['pred_bmi'].max()

64.89098785419704

#### predicted - Max age  is 49 & the cost is 1000 

In [47]:
# max cost 
pd_pred_test['pred_cost'].max()

1000.0

In [48]:
# max age in test data
pd_pred_test['age'].max()

49.0

#### Train data - Max age is 59 & the cost is 1000

In [49]:
# max age in testraint data
pd_pred_train['age'].max()

59.0

In [50]:
# max cost in testraint data
pd_pred_train['pred_cost'].max()

1000.0

In [51]:
pd_pred_train.to_csv("predicted_cost_train.csv")
pd_pred_test.to_csv("predicted_cost_test.csv")