In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Other umports
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from scipy.stats import lognorm, loguniform, randint


from sklearn.preprocessing import StandardScaler


# Classifiers and regressors
from sklearn.dummy import DummyClassifier, DummyRegressor

# train test split and cross validation
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from lightgbm.sklearn import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import RandomizedSearchCV

# metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import roc_curve


In [2]:
df = pd.read_csv('../data/train.csv')

In [3]:
df.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1


In [4]:
train_df, test_df = train_test_split(df, test_size=0.20, random_state=123)

In [5]:
train_df.columns

Index(['id', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'smoking'],
      dtype='object')

## References for feature engineering:

https://www.kaggle.com/competitions/playground-series-s3e24/discussion/450314

Some more feature ideas with formulas  
1.BMI: Body Mass Index  
Formula: weight(kg) / ((height(cm) / 100)^2  

2.Blood Pressure  
Formula: systolic / relaxation  

3.Total Cholesterol  
Formula: HDL + LDL + (triglyceride / 0.2)  

4.Creatinine Clearance  
Formula: ((140 - age) * weight(kg)) / (72 * serum creatinine)  

5.Average Eyesight  
Formula: (eyesight(right) + eyesight(left)) / 2  

1. BMI
It is the ratio of weight (kg) / height (metre-squared)  
BMI < 18.5 - Underwieght  
BMI between 18.5- 24.99 - Normal  
BMI between 25- 29.99 - Overweight  
BMI between 30-34.99 - Obese- grade 1  
BMI between 35- 39.99 - Obese- grade 2  
BMI >= 40.00 - Obese- grade 3  


2. Waist circumference and obesity  
For men-  
below 94cm (37in) - low risk  
94–102cm (37-40in) - high risk  
more than 102cm (40in) - very high  
For women-  
below 80cm (31.5in) - low risk  
80–88cm (31.5-34.6in) - high risk  
more than 88cm (34.6in) - very high  


3. Age and health risk  
Age < 45 - lower risk  
Age >= 45 years - high risk  


4. Blood pressure classification  
Systolic BP < 120 mmHg or Diastolic BP < 80 mmHg - normal  
Systolic BP between 120 mmHg- 129 mmHg or Diastolic BP between 80-89 mmHg- elevated  
Systolic BP between 130 mmHg - 139 mmHg or Diastolic BP between 90-99 mmHg- stage 1 high BP  
Systolic BP between 140 mmHg - 179 mmHg or Diastolic BP between 100-119 mmHg- stage 2 high BP  
Systolic BP >= 180 mmHg or Diastolic BP >= 120 mmHg- stage 3 high BP (emergency)  


5. HDL- LDL cholesterol classification  
Total cholesterol-  
Less than 200mg/dL Desirable  
200-239 mg/dL Borderline high  
240mg/dL and above High  
HDL-  
Less then 40- High risk  
Between 40-60 - Normal  
Greater of equals 60- Low risk  
LDL-  
Less than 100mg/dL- Optimal (best for your health)  
100-129mg/dL- Near optimal  
130-159 mg/dL- Borderline high  
160-189 mg/dL- High  
190 mg/dL and above- Very High  


5. Triglyceride classification  
Less than 150 - normal  
150- 199- moderate risk  
200-499- high risk  
500+ - very high risk  


6. Haemoglobin classification for anemic cases  
Children aged under 5 years and pregnant women <110 g/L - high risk for anemic case  
Others- <120 g/L - high risk for anemic case  

7. Creatinine normal range  
For adult men - 0.74 to 1.35 mg/dL (65.4 to 119.3 micromoles/L)  
For adult women - 0.59 to 1.04 mg/dL (52.2 to 91.9 micromoles/L)  
  
8. γ-GTP normal levels-  
normal range for adults is 5 to 40 U/L  
values above 40 are considered risky for liver disease  

9. AST- ALT levels-  
The normal range of an SGOT test is generally between 8 and 45 units per liter of serum  
Higher results indicate risk for liver disease  

In [6]:
train_df.columns

Index(['id', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'smoking'],
      dtype='object')

## Making a gender feature

Reference: https://www.kaggle.com/competitions/playground-series-s3e24/discussion/452379

* Below code is taken from the reference mentioned

In [7]:
smoking = pd.read_csv('../data/other_data_smoking.csv',index_col='ID')

In [8]:
# imports

from sklearn.feature_selection import mutual_info_classif
from joblib import Parallel, delayed
from sklearn.preprocessing import OrdinalEncoder

In [9]:
def mi(x,y,n_iter=5):
    X = np.array(x).reshape((-1,1))
    y = np.array(y)
    if X.dtype == 'object':
        X = OrdinalEncoder().fit_transform(X)
    scores = Parallel(n_jobs=4)(delayed(mutual_info_classif)(X,y,random_state=42+i) for i in range(n_iter))
    return np.array([np.mean(scores), np.std(scores)])

def theil_u(train,target,comment=''):
    print(F'*** U(y|x) in % {comment} ***')
    e = mi(train[target],train[target])[0]
    for c in train.columns:
        if c != target:
            mu, sigma = mi(train[c],train[target])/e*100
            print(F"{c}: {mu:.2f} ± {sigma:.2f}")

theil_u(smoking,target='smoking',comment='(Original data)')

*** U(y|x) in % (Original data) ***
gender: 24.21 ± 0.29
age: 2.40 ± 0.30
height(cm): 16.02 ± 0.17
weight(kg): 8.47 ± 0.29
waist(cm): 4.89 ± 0.16
eyesight(left): 1.03 ± 0.27
eyesight(right): 1.20 ± 0.36
hearing(left): 1.10 ± 0.41
hearing(right): 1.13 ± 0.39
systolic: 1.01 ± 0.28
relaxation: 1.31 ± 0.16
fasting blood sugar: 0.83 ± 0.25
Cholesterol: 0.06 ± 0.10
triglyceride: 5.54 ± 0.39
HDL: 2.97 ± 0.36
LDL: 0.43 ± 0.28
hemoglobin: 15.69 ± 0.28
Urine protein: 0.97 ± 0.33
serum creatinine: 6.79 ± 0.20
AST: 1.17 ± 0.16
ALT: 3.27 ± 0.46
Gtp: 11.42 ± 0.14
oral: 0.22 ± 0.12
dental caries: 0.85 ± 0.30
tartar: 1.28 ± 0.28


In [10]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate

kfold = RepeatedStratifiedKFold(n_splits=10,n_repeats=5)
X_0 = smoking.drop(['gender','oral','tartar','smoking'],axis=1)
y_0 = smoking['gender'].map({'F':0,'M':1})

results = cross_validate(XGBClassifier(n_jobs=4,random_state=0),
                         X_0,y_0,
                         scoring='roc_auc',
                         cv=kfold,n_jobs=1,
                         return_estimator=True)

results['test_score'].mean(), results['test_score'].std()

(0.997400317009497, 0.0003741220375394266)

* The code above shows the mutual information between each feature column and the selected target column
* The gender was found to have high information with respect to smoking
* It is also seen that the gender can be predicted based on the other feature column values fairly well in the old dataset
* Thus we create an estimator model based off the old dataset

* We use these estimators to create the gender in new dataset

In [11]:
train_df.columns

Index(['id', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'smoking'],
      dtype='object')

In [12]:
X_0.columns

Index(['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries'],
      dtype='object')

In [13]:
train_df['male_probability'] = 0
cols = list(X_0.columns)

for clf in results['estimator']:
    train_df['male_probability'] += clf.predict_proba(train_df[cols])[:,1]
train_df['male_probability'] /= len(results['estimator'])
theil_u(train_df,target='smoking',comment='(train)')

*** U(y|x) in % (train) ***
id: 0.00 ± 0.01
age: 3.91 ± 0.17
height(cm): 20.07 ± 0.20
weight(kg): 11.68 ± 0.24
waist(cm): 6.37 ± 0.12
eyesight(left): 1.74 ± 0.15
eyesight(right): 2.03 ± 0.08
hearing(left): 2.36 ± 0.16
hearing(right): 2.43 ± 0.13
systolic: 1.62 ± 0.21
relaxation: 2.09 ± 0.12
fasting blood sugar: 1.32 ± 0.10
Cholesterol: 1.02 ± 0.16
triglyceride: 10.97 ± 0.14
HDL: 5.97 ± 0.24
LDL: 1.32 ± 0.06
hemoglobin: 19.28 ± 0.11
Urine protein: 2.13 ± 0.19
serum creatinine: 9.11 ± 0.18
AST: 0.68 ± 0.13
ALT: 4.42 ± 0.15
Gtp: 14.48 ± 0.14
dental caries: 0.91 ± 0.22
male_probability: 25.91 ± 0.07


* We can use the 'male_probability' as a continuous value feature for gender

In [14]:
# feat functions

def bmi_feat(df):
    ''' creates the bmi and the associated risk colum
    BMI < 18.5 - Underwieght
    BMI between 18.5- 24.99 - Normal
    BMI between 25- 29.99 - Overweight
    BMI between 30-34.99 - Obese- grade 1
    BMI between 35- 39.99 - Obese- grade 2
    BMI >= 40.00 - Obese- grade 3 
    '''
    
    
    bmi_ranges = [0, 18.5, 25, 30, 35, 40, float('inf')]
    bmi_levels = ['Underweight', 'Normal', 'Overweight', 'Obese-lvl1','Obese-lvl2', 'Obese-lvl3']
    
    df['bmi'] = df['weight(kg)'] / ((df['height(cm)']*0.01)**2)
    df['bmi_category'] = pd.cut(df['bmi'], bins = bmi_ranges, labels=bmi_levels, right=False)
    
    return df

def obesity_risk(df):
    ''' creates col for obesity risk based on waistline

        Waist circumference and obesity
        For men-
        below 94cm (37in) - low risk
        94–102cm (37-40in) - high risk
        more than 102cm (40in) - very high
        For women-
        below 80cm (31.5in) - low risk
        80–88cm (31.5-34.6in) - high risk
        more than 88cm (34.6in) - very high
    '''
        
    female = df['male_probability'] < 0.5
    male = df['male_probability'] >= 0.5
    
    male_waist_ranges = [0,94,102,float('inf')]
    female_waist_ranges = [0,80,88,float('inf')]
    
    waist_levels = ['low risk', 'high risk', 'very high risk']
    
    df['obesity_risk'] = np.where(female, pd.cut(df['waist(cm)'], bins=female_waist_ranges, labels=waist_levels), pd.cut(df['waist(cm)'], bins=male_waist_ranges, labels=waist_levels))
    
    return df

def age_risk(df):
    ''' risk based on age
        
        Age and health risk
        Age < 45 - lower risk
        Age >= 45 years - high risk 
    '''
    # for now computing with the solo gender info as men
    age_ranges = [0,45,float('inf')]
    age_levels = ['low risk', 'high risk']
    
    df['age_risk'] = pd.cut(df['age'], bins = age_ranges, labels=age_levels, right=False)
    
    return df


def bp_risk(df):
    '''
    calculates the bp risk
    
    Systolic BP < 120 mmHg or Diastolic BP < 80 mmHg - normal
    Systolic BP between 120 mmHg- 129 mmHg or Diastolic BP between 80-89 mmHg- elevated
    Systolic BP between 130 mmHg - 139 mmHg or Diastolic BP between 90-99 mmHg- stage 1 high BP
    Systolic BP between 140 mmHg - 179 mmHg or Diastolic BP between 100-119 mmHg- stage 2 high BP
    Systolic BP >= 180 mmHg or Diastolic BP >= 120 mmHg- stage 3 high BP (emergency)
        '''
    
    bp_ranges = [0,120,130,140,180,float('inf')]
    bp_levels = ['normal', 'elevated', 'high_lvl1', 'high_lvl2', 'high_lvl3']
    
    df['bp_risk'] = pd.cut(df['systolic'], bins = bp_ranges, labels=bp_levels, right=False)
    
    return df
    
    
def hdl_risk(df):
    '''
    Less then 40- High risk
    Between 40-60 - Normal
    Greater of equals 60- Low risk
    '''
    hdl_ranges = [0,40,60,float('inf')]
    hdl_levels = ['high risk', 'normal', 'low risk']
    
    df['hdl_risk'] = pd.cut(df['HDL'], bins = hdl_ranges, labels=hdl_levels, right=False)
    
    return df

def ldl_risk(df):
    '''
    Less than 100mg/dL- Optimal (best for your health)
    100-129mg/dL- Near optimal
    130-159 mg/dL- Borderline high
    160-189 mg/dL- High
    190 mg/dL and above- Very High
    '''
    ldl_ranges = [0,100,130,160,190,float('inf')]
    ldl_levels = ['optimal', 'normal', 'high_lvl1', 'high_lvl2', 'high_lvl3']
    
    df['ldl_risk'] = pd.cut(df['LDL'], bins = ldl_ranges, labels=ldl_levels, right=False)
    
    return df   

def tg_risk(df):
    '''
    Less than 150 - normal
    150- 199- moderate risk
    200-499- high risk
    500+ - very high risk 
    '''
    tg_ranges = [0,150,200,500,float('inf')]
    tg_levels = ['normal', 'moderate_risk', 'high_risk', 'very_high_risk']
    
    df['tg_risk'] = pd.cut(df['triglyceride'], bins = tg_ranges, labels=tg_levels, right=False)
    
    return df      

def anemic_risk(df):
    '''
    hemoglobin
    Less than 12
    '''
    
    hg_ranges = [0,12,float('inf')]
    hg_levels = ['high risk', 'low risk']
    
    df['anemic_risk'] = pd.cut(df['hemoglobin'], bins = hg_ranges, labels=hg_levels, right=False)
    
    return df   


def creatinine_range(df):
    '''
    For adult men - 0.74 to 1.35 mg/dL (65.4 to 119.3 micromoles/L)
    For adult women - 0.59 to 1.04 mg/dL (52.2 to 91.9 micromoles/L) 
    '''
    
    female = df['male_probability'] < 0.5
    male = df['male_probability'] >= 0.5
    
    female_cr_ranges = [0,0.60, 1.04, float('inf')]
    male_cr_ranges = [0,0.75, 1.35, float('inf')]
    cr_levels = ['low','normal', 'high']
    
    df['creatinine_cat'] = np.where(female, pd.cut(df['serum creatinine'], bins=female_cr_ranges, labels=cr_levels), pd.cut(df['serum creatinine'], bins=male_cr_ranges, labels=cr_levels))
    
    return df    


def gtp_range(df):
    '''
    normal range for adults is 5 to 40 U/L
    values above 40 are considered risky for liver disease 
    '''

    gtp_ranges = [0,5, 40, float('inf')]
    gtp_levels = ['low','normal', 'high']
    
    df['gtp_cat'] = pd.cut(df['Gtp'], bins = gtp_ranges, labels=gtp_levels, right=False)
    
    return df  


def ast_range(df):
    '''
    The normal range of an SGOT test is generally between 8 and 45 units per liter of serum
    Higher results indicate risk for liver disease    
    '''
    ast_ranges = [0,8, 45, float('inf')]
    ast_levels = ['low','normal', 'high']
    
    df['ast_cat'] = pd.cut(df['AST'], bins = ast_ranges, labels=ast_levels, right=False)
    
    return df  

def alt_range(df):
    '''
    The normal range of an alt test is generally between 29 to 33 
    international units per liter (IU/L) for males and 19 to 25 IU/L for females
    '''
    alt_ranges = [0,29, 33, float('inf')]
    alt_levels = ['low','normal', 'high']
    
    df['alt_cat'] = pd.cut(df['ALT'], bins = alt_ranges, labels=alt_levels, right=False)
    
    return df  

### Applying the functions above to create the manual features

In [15]:
bmi_feat(train_df)
obesity_risk(train_df)
age_risk(train_df)
bp_risk(train_df)
hdl_risk(train_df)
ldl_risk(train_df)
tg_risk(train_df)
anemic_risk(train_df)
creatinine_range(train_df)
gtp_range(train_df)
ast_range(train_df)
alt_range(train_df)

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,age_risk,bp_risk,hdl_risk,ldl_risk,tg_risk,anemic_risk,creatinine_cat,gtp_cat,ast_cat,alt_cat
97378,97378,35,170,65,95.0,1.0,1.5,1,1,126,...,low risk,elevated,low risk,normal,normal,low risk,normal,high,normal,low
33152,33152,40,165,55,74.0,1.0,0.9,1,1,103,...,low risk,normal,normal,normal,normal,low risk,normal,normal,normal,low
152286,152286,55,155,65,79.0,0.7,0.6,1,1,133,...,high risk,high_lvl1,normal,normal,moderate_risk,low risk,low,normal,normal,low
153823,153823,60,150,55,77.0,0.8,1.0,1,1,150,...,high risk,high_lvl2,normal,normal,normal,low risk,low,normal,normal,low
108694,108694,60,165,90,101.0,0.8,0.8,1,1,120,...,high risk,elevated,high risk,normal,moderate_risk,low risk,normal,normal,normal,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146449,146449,40,165,80,85.0,0.6,0.9,1,1,130,...,low risk,high_lvl1,normal,high_lvl1,normal,low risk,normal,normal,normal,high
119906,119906,50,165,65,83.0,1.2,0.1,1,1,112,...,high risk,normal,normal,normal,moderate_risk,low risk,normal,normal,normal,low
17730,17730,40,165,75,93.8,1.5,1.5,1,1,118,...,low risk,normal,normal,normal,moderate_risk,low risk,normal,high,normal,low
28030,28030,45,175,65,84.0,1.2,1.2,1,1,112,...,high risk,normal,normal,normal,normal,low risk,normal,normal,normal,low


## Creating these features for the test set

In [16]:
test_df['male_probability'] = 0
cols = list(X_0.columns)

for clf in results['estimator']:
    test_df['male_probability'] += clf.predict_proba(test_df[cols])[:,1]
test_df['male_probability'] /= len(results['estimator'])

In [17]:
bmi_feat(test_df)
obesity_risk(test_df)
age_risk(test_df)
bp_risk(test_df)
hdl_risk(test_df)
ldl_risk(test_df)
tg_risk(test_df)
anemic_risk(test_df)
creatinine_range(test_df)
gtp_range(test_df)
ast_range(test_df)
alt_range(test_df)

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,age_risk,bp_risk,hdl_risk,ldl_risk,tg_risk,anemic_risk,creatinine_cat,gtp_cat,ast_cat,alt_cat
123726,123726,50,170,85,95.0,0.8,0.8,1,1,136,...,high risk,high_lvl1,low risk,optimal,normal,low risk,normal,high,high,high
70373,70373,40,160,55,65.0,0.6,0.8,1,1,100,...,low risk,normal,normal,high_lvl1,normal,high risk,normal,normal,normal,low
118032,118032,50,155,55,73.0,1.0,0.8,1,1,108,...,high risk,normal,low risk,optimal,normal,low risk,normal,normal,normal,low
21285,21285,40,175,80,91.0,1.0,1.2,1,1,120,...,low risk,elevated,normal,high_lvl2,normal,low risk,normal,high,normal,high
67141,67141,40,170,70,84.0,1.2,1.2,1,1,116,...,low risk,normal,high risk,normal,high_risk,low risk,normal,high,normal,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39999,39999,60,155,65,81.0,0.4,0.7,1,1,142,...,high risk,high_lvl2,low risk,normal,normal,low risk,normal,normal,normal,low
26648,26648,35,170,70,89.9,0.6,0.6,1,1,126,...,low risk,elevated,low risk,optimal,normal,low risk,normal,normal,normal,high
17709,17709,55,155,60,77.0,1.0,1.0,1,1,135,...,high risk,high_lvl1,normal,high_lvl1,high_risk,low risk,normal,normal,normal,high
45244,45244,50,150,50,64.0,0.9,1.2,1,1,119,...,high risk,normal,low risk,high_lvl1,normal,low risk,normal,normal,normal,low


## Separating into X and y components on the train and test data

In [18]:
X_train = train_df.drop(columns=["smoking", "id"])
y_train = train_df["smoking"]

X_test = test_df.drop(columns=["smoking", "id"])
y_test = test_df["smoking"]

In [19]:
# Writing the feature engineered dataset to a file

X_train.to_csv('../processed/X_train.csv')
y_train.to_csv('../processed/y_train.csv')
X_test.to_csv('../processed/X_test.csv')
y_test.to_csv('../processed/y_test.csv')


## Creating the same features for the submission dataset 

In [20]:
# read the submission dataset
X_submission = pd.read_csv('../data/test.csv')
X_set = X_submission.drop('id', axis = 1)
X_set.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,40,165,70,84.0,1.2,1.2,1,1,130,89,...,186,49,115,14.2,1,0.9,19,25,32,0
1,80,160,60,93.0,1.0,1.0,2,2,144,72,...,158,35,104,13.0,1,1.1,20,12,24,0
2,60,170,70,86.5,0.6,0.7,1,1,117,75,...,173,39,88,15.4,1,1.4,38,60,36,0
3,40,160,50,67.0,0.3,0.4,1,1,116,62,...,47,75,128,14.5,1,0.6,25,18,10,1
4,40,170,75,89.4,1.0,0.9,1,1,132,94,...,100,39,123,16.5,1,1.0,30,39,27,1


In [21]:
X_set['male_probability'] = 0
cols = list(X_0.columns)

for clf in results['estimator']:
    X_set['male_probability'] += clf.predict_proba(X_set[cols])[:,1]
X_set['male_probability'] /= len(results['estimator'])

In [22]:
# Generating the features

bmi_feat(X_set)
obesity_risk(X_set)
age_risk(X_set)
bp_risk(X_set)
hdl_risk(X_set)
ldl_risk(X_set)
tg_risk(X_set)
anemic_risk(X_set)
creatinine_range(X_set)
gtp_range(X_set)
ast_range(X_set)
alt_range(X_set)

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,age_risk,bp_risk,hdl_risk,ldl_risk,tg_risk,anemic_risk,creatinine_cat,gtp_cat,ast_cat,alt_cat
0,40,165,70,84.0,1.2,1.2,1,1,130,89,...,low risk,high_lvl1,normal,normal,moderate_risk,low risk,normal,normal,normal,low
1,80,160,60,93.0,1.0,1.0,2,2,144,72,...,high risk,high_lvl2,high risk,normal,moderate_risk,low risk,normal,normal,normal,low
2,60,170,70,86.5,0.6,0.7,1,1,117,75,...,high risk,normal,high risk,optimal,moderate_risk,low risk,high,normal,normal,high
3,40,160,50,67.0,0.3,0.4,1,1,116,62,...,low risk,normal,low risk,normal,normal,low risk,low,normal,normal,low
4,40,170,75,89.4,1.0,0.9,1,1,132,94,...,low risk,high_lvl1,high risk,normal,normal,low risk,normal,normal,normal,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106166,40,165,60,78.0,0.8,0.9,1,1,112,78,...,low risk,normal,normal,high_lvl1,normal,low risk,normal,normal,normal,normal
106167,40,170,85,95.0,1.2,1.2,1,1,130,90,...,low risk,high_lvl1,normal,normal,moderate_risk,low risk,normal,normal,normal,normal
106168,35,170,85,89.0,1.2,1.2,1,1,131,86,...,low risk,high_lvl1,normal,high_lvl1,normal,low risk,normal,normal,normal,normal
106169,40,160,60,67.0,0.7,0.8,1,1,120,80,...,low risk,elevated,normal,normal,normal,low risk,low,normal,normal,high


In [23]:
# writing the features to folder

X_set.to_csv('../processed/X_set.csv')
