In [4]:
import pandas as pd
import numpy as np

#Model 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier


#Model metrics for evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,recall_score,roc_auc_score

#for class imbalance problem
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler,ADASYN
from imblearn.under_sampling import AllKNN,TomekLinks

#Normalizing and Standardising Technique
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#Feauture engineering technique
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

#For finding best parameter values
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [5]:
df = pd.read_csv('cleaned_data.csv')

In [6]:
df.head()

Unnamed: 0,age,gender,height,weight,bmi,waist_size,blood_pressure,heart_rate,cholesterol,glucose,...,device_usage,healthcare_access,insurance,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner,is_healthy,age_bins
0,56,Male,173.416872,56.88664,18.915925,72.16513,118.264254,60.749825,214.580523,103.008176,...,High,Poor,No,High,5,Moderate,No,Yes,0,41-60
1,69,Female,163.20738,97.799859,36.716278,85.598889,117.917986,66.463696,115.794002,116.905134,...,Moderate,Moderate,No,High,5,High,Yes,No,0,61-80
2,46,Male,177.281966,80.687562,25.67305,90.29503,123.073698,76.043212,138.134787,89.180302,...,High,Good,Yes,High,4,Moderate,No,No,0,41-60
3,32,Female,172.101255,63.142868,21.31848,100.504211,148.173453,68.781981,203.017447,128.375798,...,Low,Moderate,No,High,1,Missing,No,Yes,0,21-40
4,60,Female,163.608816,40.0,14.943302,69.02115,150.613181,92.335358,200.412439,94.813332,...,Low,Moderate,Yes,High,1,High,Yes,Yes,0,41-60


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84540 entries, 0 to 84539
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    84540 non-null  int64  
 1   gender                 84540 non-null  object 
 2   height                 84540 non-null  float64
 3   weight                 84540 non-null  float64
 4   bmi                    84540 non-null  float64
 5   waist_size             84540 non-null  float64
 6   blood_pressure         84540 non-null  float64
 7   heart_rate             84540 non-null  float64
 8   cholesterol            84540 non-null  float64
 9   glucose                84540 non-null  float64
 10  insulin                84540 non-null  float64
 11  sleep_hours            84540 non-null  float64
 12  sleep_quality          84540 non-null  object 
 13  work_hours             84540 non-null  float64
 14  physical_activity      84540 non-null  float64
 15  da

In [8]:
df['is_healthy'].value_counts()

is_healthy
0    59355
1    25185
Name: count, dtype: int64

In [9]:
df['is_healthy'].value_counts()/len(df)*100

is_healthy
0    70.209368
1    29.790632
Name: count, dtype: float64

In [10]:
X = df.drop(columns=['age_bins','is_healthy'])
y = df['is_healthy']

In [11]:
X.head()

Unnamed: 0,age,gender,height,weight,bmi,waist_size,blood_pressure,heart_rate,cholesterol,glucose,...,diet_type,exercise_type,device_usage,healthcare_access,insurance,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner
0,56,Male,173.416872,56.88664,18.915925,72.16513,118.264254,60.749825,214.580523,103.008176,...,Vegan,Strength,High,Poor,No,High,5,Moderate,No,Yes
1,69,Female,163.20738,97.799859,36.716278,85.598889,117.917986,66.463696,115.794002,116.905134,...,Vegan,Cardio,Moderate,Moderate,No,High,5,High,Yes,No
2,46,Male,177.281966,80.687562,25.67305,90.29503,123.073698,76.043212,138.134787,89.180302,...,Vegan,Cardio,High,Good,Yes,High,4,Moderate,No,No
3,32,Female,172.101255,63.142868,21.31848,100.504211,148.173453,68.781981,203.017447,128.375798,...,Vegetarian,Mixed,Low,Moderate,No,High,1,Missing,No,Yes
4,60,Female,163.608816,40.0,14.943302,69.02115,150.613181,92.335358,200.412439,94.813332,...,Vegan,Missing,Low,Moderate,Yes,High,1,High,Yes,Yes


In [12]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: is_healthy, dtype: int64

In [13]:
cat_cols = X.select_dtypes(include='object').columns.tolist()

X = pd.get_dummies(X,columns=cat_cols,drop_first=True).astype(int)

In [14]:
X.head()

Unnamed: 0,age,height,weight,bmi,waist_size,blood_pressure,heart_rate,cholesterol,glucose,insulin,...,device_usage_Moderate,healthcare_access_Moderate,healthcare_access_Poor,insurance_Yes,sunlight_exposure_Low,sunlight_exposure_Moderate,caffeine_intake_Missing,caffeine_intake_Moderate,family_history_Yes,pet_owner_Yes
0,56,173,56,18,72,118,60,214,103,26,...,0,0,1,0,0,0,0,1,0,1
1,69,163,97,36,85,117,66,115,116,10,...,1,1,0,0,0,0,0,0,1,0
2,46,177,80,25,90,123,76,138,89,18,...,0,0,0,1,0,0,0,1,0,0
3,32,172,63,21,100,148,68,203,128,18,...,0,1,0,0,0,0,1,0,0,1
4,60,163,40,14,69,150,92,200,94,16,...,0,1,0,1,0,0,0,0,1,1


In [15]:
#Split into train and test with x,y variable
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=41)

In [16]:
len(X_train)

63405

In [17]:
len(X_test)

21135

In [18]:
#We have to make sure scale this value to perform better

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [19]:
X_train

array([[-1.37094553, -0.05574163, -2.01953863, ..., -0.70703151,
         0.99884933,  1.00498046],
       [ 1.14521205,  0.643961  , -0.24818393, ..., -0.70703151,
         0.99884933,  1.00498046],
       [-1.7064332 ,  0.84387604, -1.33824836, ..., -0.70703151,
         0.99884933, -0.99504422],
       ...,
       [-0.3644825 ,  0.643961  ,  1.25065466, ...,  1.41436413,
         0.99884933,  1.00498046],
       [-0.47631172,  0.04421589,  2.00007396, ..., -0.70703151,
        -1.00115199, -0.99504422],
       [ 1.70435818,  2.04336626, -0.24818393, ..., -0.70703151,
        -1.00115199, -0.99504422]])

In [20]:
X_test

array([[ 1.3688705 ,  1.04379108,  0.77375147, ..., -0.70703151,
         0.99884933,  1.00498046],
       [ 1.48069973,  0.24413093, -0.24818393, ..., -0.70703151,
        -1.00115199,  1.00498046],
       [ 0.80972437, -0.55552922,  0.56936439, ..., -0.70703151,
        -1.00115199, -0.99504422],
       ...,
       [ 0.25057824, -0.75544426,  0.16059023, ..., -0.70703151,
         0.99884933,  1.00498046],
       [-0.92362862, -0.35561418, -0.72508712, ..., -0.70703151,
         0.99884933, -0.99504422],
       [ 0.25057824,  0.04421589, -0.45257101, ...,  1.41436413,
        -1.00115199,  1.00498046]])

In [21]:
#feature engineering
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [22]:
lda = LinearDiscriminantAnalysis()
X_train_lda = lda.fit_transform(X_train,y_train)
X_test_lda = lda.transform(X_test)

In [23]:
smote = SMOTE(random_state=42)
X_train_resampled_sme, y_train_resampled_sme = smote.fit_resample(X_train_pca,y_train)

In [24]:
smoteen = SMOTEENN(random_state=42)
X_train_resampled_sme_een, y_train_resampled_sme_een = smoteen.fit_resample(X_train_pca,y_train)

In [25]:
adasyn = ADASYN()
X_train_resampled_adasyn, y_train_resampled_adasyn = adasyn.fit_resample(X_train_pca,y_train)

In [26]:
tomek = TomekLinks(sampling_strategy='majority')
X_train_resampled_tk, y_train_resampled_tk = tomek.fit_resample(X_train_pca, y_train)

In [27]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    auc_roc = roc_auc_score(test_labels,predictions)
    cls_report = classification_report(test_labels, predictions)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    print('AUC ROC:',auc_roc)
    print("Classification Report:\n", cls_report)
    return accuracy, cls_report

In [28]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train_resampled_sme, y_train_resampled_sme)

In [29]:
model_knn_score = evaluate(model_knn,X_test_pca,y_test)

Accuracy: 51.58%
AUC ROC: 0.5103345585732761
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.52      0.60     14940
           1       0.30      0.50      0.38      6195

    accuracy                           0.52     21135
   macro avg       0.51      0.51      0.49     21135
weighted avg       0.59      0.52      0.54     21135



In [74]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_resampled_tk, y_train_resampled_tk)

In [75]:
model_dt_score = evaluate(model_dt,X_test_pca,y_test)

Accuracy: 56.08%
AUC ROC: 0.5018635478151509
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.64      0.67     14940
           1       0.30      0.36      0.32      6195

    accuracy                           0.56     21135
   macro avg       0.50      0.50      0.50     21135
weighted avg       0.59      0.56      0.57     21135



In [76]:
model_lr = LogisticRegression()
model_lr.fit(X_train_resampled_tk, y_train_resampled_tk)

In [77]:
model_lr_score = evaluate(model_lr,X_test_pca,y_test)

Accuracy: 70.69%
AUC ROC: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83     14940
           1       0.00      0.00      0.00      6195

    accuracy                           0.71     21135
   macro avg       0.35      0.50      0.41     21135
weighted avg       0.50      0.71      0.59     21135



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
model_nvb = BernoulliNB()
model_nvb.fit(X_train_resampled_tk, y_train_resampled_tk)

In [53]:
model_nvb_score = evaluate(model_nvb,X_test,y_test)

Accuracy: 70.69%
AUC ROC: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83     14940
           1       0.00      0.00      0.00      6195

    accuracy                           0.71     21135
   macro avg       0.35      0.50      0.41     21135
weighted avg       0.50      0.71      0.59     21135



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [66]:
# Manual HPO
# n_estimator_list = [200,220,250,270,300,320,350,370,400]

# for estim_list in n_estimator_list:
#   classifier = RandomForestClassifier(n_estimators=estim_list)
#   classifier.fit(X_train_resampled_tk, y_train_resampled_tk)
#   y_pred = classifier.predict(X_test_pca)
#   print("estimated value:",estim_list)
#   accuracy = accuracy_score(y_test, y_pred)
#   # cls_report = classification_report(X_test_pca, y_pred)
#   print("Accuracy: {:.2f}%".format(accuracy * 100))


# min_samples_leaf_list = [70,80,90,100,110]
# for min_leaf in min_samples_leaf_list:
#   classifier = RandomForestClassifier(n_estimators=200,min_samples_leaf=min_leaf)
#   classifier.fit(X_train_resampled_tk, y_train_resampled_tk)
#   y_pred = classifier.predict(X_test_pca)
#   print("estimated value:",min_leaf)
#   accuracy = accuracy_score(y_test, y_pred)
#   print("Accuracy: {:.2f}%".format(accuracy * 100))



# max_depth_list = [2,3,4,5,6,7,8,9,10,11,12,20,30,40,50,60,70,100,150,300,190,250]
# for max_depth in max_depth_list:
#   classifier = RandomForestClassifier(n_estimators=200,max_depth=max_depth)
#   classifier.fit(X_train_resampled_tk, y_train_resampled_tk)
#   y_pred = classifier.predict(X_test_pca)
#   print("estimated value:",max_depth)
#   accuracy = accuracy_score(y_test, y_pred)
#   print("Accuracy: {:.2f}%".format(accuracy * 100))
#   cls_report = classification_report(y_test, y_pred)  
#   print('cls report',cls_report




estimated value: 2
Accuracy: 66.45%
cls report               precision    recall  f1-score   support

           0       0.71      0.90      0.79     14940
           1       0.29      0.10      0.15      6195

    accuracy                           0.66     21135
   macro avg       0.50      0.50      0.47     21135
weighted avg       0.58      0.66      0.60     21135

estimated value: 3
Accuracy: 66.34%
cls report               precision    recall  f1-score   support

           0       0.71      0.90      0.79     14940
           1       0.29      0.10      0.15      6195

    accuracy                           0.66     21135
   macro avg       0.50      0.50      0.47     21135
weighted avg       0.58      0.66      0.60     21135

estimated value: 4
Accuracy: 66.75%
cls report               precision    recall  f1-score   support

           0       0.71      0.91      0.79     14940
           1       0.29      0.09      0.14      6195

    accuracy                           0.

In [61]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [200],
    'max_depth': [50, 100],
    'min_samples_split': [2, 10],
    'max_features': ['sqrt'],
    'criterion': ['gini','entropy']
}

classifier = RandomForestClassifier()

grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, n_jobs=1)  # use 1 or 2 if memory limited
grid_search.fit(X_train_resampled_tk, y_train_resampled_tk)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)



KeyboardInterrupt



In [65]:
# Randomized Search
min_samples_split = [2,5,6,7,8,10,11,12,13,14,15,20,25]
min_samples_leaf = [1,2,5,6,7,8,10,12,14]
bootstrap = [True, False]
criterion = ['gini','entropy']

random_grid = {
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion
}
rf = RandomForestClassifier(n_estimators=200,max_depth=250)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train_resampled_tk, y_train_resampled_tk)
model_rfc_score = evaluate(model_rfc,X_test_pca,y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [55]:
model_rfc = RandomForestClassifier(n_estimators=200,max_depth=250)
model_rfc.fit(X_train_resampled_tk, y_train_resampled_tk)

In [56]:
model_rfc_score = evaluate(model_rfc,X_test_pca,y_test)

Accuracy: 66.46%
AUC ROC: 0.5010245177643585
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.90      0.79     14940
           1       0.30      0.11      0.16      6195

    accuracy                           0.66     21135
   macro avg       0.50      0.50      0.47     21135
weighted avg       0.59      0.66      0.60     21135



In [80]:
model_ada = AdaBoostClassifier(n_estimators=200)
model_ada.fit(X_train_resampled_tk, y_train_resampled_tk)



In [81]:
model_ada_score = evaluate(model_ada,X_test_pca,y_test)

Accuracy: 70.66%
AUC ROC: 0.5000964309214259
Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83     14940
           1       0.33      0.00      0.00      6195

    accuracy                           0.71     21135
   macro avg       0.52      0.50      0.42     21135
weighted avg       0.60      0.71      0.59     21135



In [83]:
model_grd = GradientBoostingClassifier(n_estimators=200)
model_grd.fit(X_train_resampled_tk, y_train_resampled_tk)

In [85]:
model_grd_score = evaluate(model_grd,X_test_pca,y_test)

Accuracy: 70.62%
AUC ROC: 0.5003011237848894
Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83     14940
           1       0.35      0.00      0.01      6195

    accuracy                           0.71     21135
   macro avg       0.53      0.50      0.42     21135
weighted avg       0.60      0.71      0.59     21135



In [107]:
model_xgb = XGBClassifier()
model_xgb.fit(X_train_resampled_tk, y_train_resampled_tk)

In [108]:
model_xgb_score = evaluate(model_xgb,X_test_pca,y_test)

Accuracy: 68.84%
AUC ROC: 0.4990420114679865
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.96      0.81     14940
           1       0.28      0.04      0.07      6195

    accuracy                           0.69     21135
   macro avg       0.50      0.50      0.44     21135
weighted avg       0.58      0.69      0.60     21135

