In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, classification_report, f1_score, precision_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the dataset CSV (relative path, so it is resolved against the
# notebook's working directory) into a DataFrame.
DATASET_CSV = "novagen_dataset.csv"
df = pd.read_csv(DATASET_CSV)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,...,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type__Vegan,Diet_Type__Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,2.0,26.0,111.0,198.0,99.0,72.0,4.0,1.0,5.0,5.0,...,1,2,1,0,1,False,True,True,False,False
1,8.0,24.0,121.0,199.0,103.0,75.0,2.0,1.0,2.0,9.0,...,1,2,1,2,2,False,False,True,False,False
2,81.0,27.0,147.0,203.0,100.0,74.0,10.0,-0.0,5.0,1.0,...,2,0,0,1,0,True,False,False,False,False
3,25.0,21.0,150.0,199.0,102.0,70.0,7.0,3.0,3.0,3.0,...,1,2,1,2,0,True,False,False,True,False
4,24.0,26.0,146.0,202.0,99.0,76.0,10.0,2.0,5.0,1.0,...,2,0,2,0,2,False,True,False,True,False


In [4]:
# Print the frame summary: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9549 entries, 0 to 9548
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    9549 non-null   float64
 1   BMI                    9549 non-null   float64
 2   Blood_Pressure         9549 non-null   float64
 3   Cholesterol            9549 non-null   float64
 4   Glucose_Level          9549 non-null   float64
 5   Heart_Rate             9549 non-null   float64
 6   Sleep_Hours            9549 non-null   float64
 7   Exercise_Hours         9549 non-null   float64
 8   Water_Intake           9549 non-null   float64
 9   Stress_Level           9549 non-null   float64
 10  Target                 9549 non-null   int64  
 11  Smoking                9549 non-null   int64  
 12  Alcohol                9549 non-null   int64  
 13  Diet                   9549 non-null   int64  
 14  MentalHealth           9549 non-null   int64  
 15  Phys

In [5]:
# Show the column labels; "Target" is the label column used for modelling below.
df.columns

Index(['Age', 'BMI', 'Blood_Pressure', 'Cholesterol', 'Glucose_Level',
       'Heart_Rate', 'Sleep_Hours', 'Exercise_Hours', 'Water_Intake',
       'Stress_Level', 'Target', 'Smoking', 'Alcohol', 'Diet', 'MentalHealth',
       'PhysicalActivity', 'MedicalHistory', 'Allergies', 'Diet_Type__Vegan',
       'Diet_Type__Vegetarian', 'Blood_Group_AB', 'Blood_Group_B',
       'Blood_Group_O'],
      dtype='object')

In [6]:
#Split features and target
X = df.drop("Target", axis =1)
y = df["Target"]
results = []

#Train-test-split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state = 42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Model 1 - Logistic Regression
# LogisticRegression and the metric functions come from the notebook's top
# import cell. max_iter=5000 sets the solver's iteration cap explicitly
# (presumably to avoid convergence warnings - confirm if tuning).
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train_scaled, y_train)

# Score on the held-out split and record the row for the final comparison table.
y_pred_lr = lr.predict(X_test_scaled)

results.append({
    "Model": "Logistic Regression",
    "Accuracy": accuracy_score(y_test, y_pred_lr),
    "Precision": precision_score(y_test, y_pred_lr),
    "Recall": recall_score(y_test, y_pred_lr),
    "F1 Score": f1_score(y_test, y_pred_lr),
})

# Model 2 - SVC (Support Vector Classifier)

In [8]:
# Support vector classifier with library-default hyperparameters, trained on
# the scaled training split (fit() returns the estimator itself).
svc = SVC().fit(X_train_scaled, y_train)

# Predict on the held-out split.
y_pred_svc = svc.predict(X_test_scaled)

# Assemble the metrics row, then add it to the shared results list.
svc_row = {
    "Model": "SVC",
    "Accuracy": accuracy_score(y_test, y_pred_svc),
    "Precision": precision_score(y_test, y_pred_svc),
    "Recall": recall_score(y_test, y_pred_svc),
    "F1 Score": f1_score(y_test, y_pred_svc),
}
results.append(svc_row)


# Model 3 - Decision Tree Classifier

In [9]:
# Shallow decision tree (depth capped at 3).
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split, so ties between equally good splits could otherwise
# resolve differently from run to run and change the reported metrics.
dtc = DecisionTreeClassifier(max_depth=3, random_state=42)
dtc.fit(X_train_scaled, y_train)

# Score on the held-out split and record the row for the final comparison table.
y_pred_dtc = dtc.predict(X_test_scaled)

results.append({
    "Model": "Decision Tree",
    "Accuracy": accuracy_score(y_test, y_pred_dtc),
    "Precision": precision_score(y_test, y_pred_dtc),
    "Recall": recall_score(y_test, y_pred_dtc),
    "F1 Score": f1_score(y_test, y_pred_dtc),
})


# Model 4 - Random Forest Classifier

In [10]:
# Random forest of 1000 depth-3 trees.
# random_state is pinned so the bootstrap samples and per-split feature
# subsets - and therefore the metrics recorded below - are the same on every
# Restart & Run All, matching the seeded split and gradient-boosting model.
# oob_score=True additionally computes the out-of-bag estimate (rfc.oob_score_);
# it is not read in this cell.
rfc = RandomForestClassifier(
    max_depth=3,
    n_estimators=1000,
    oob_score=True,
    random_state=42,
)
rfc.fit(X_train_scaled, y_train)

# Score on the held-out split and record the row for the final comparison table.
y_pred_rfc = rfc.predict(X_test_scaled)

results.append({
    "Model": "Random Forest",
    "Accuracy": accuracy_score(y_test, y_pred_rfc),
    "Precision": precision_score(y_test, y_pred_rfc),
    "Recall": recall_score(y_test, y_pred_rfc),
    "F1 Score": f1_score(y_test, y_pred_rfc),
})


# Model 5 - Gradient Boosting Classifier

In [11]:
# Gradient-boosted ensemble: 1000 stages of depth-3 trees at learning rate 0.1,
# seeded so repeated runs give the same model.
gbc = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbc.fit(X_train_scaled, y_train)

# Predict on the held-out split.
y_pred_gbc = gbc.predict(X_test_scaled)

# Map each metric label to its scorer, then evaluate them in that order.
gbc_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}
results.append({
    "Model": "Gradient Classifier",
    **{label: score(y_test, y_pred_gbc) for label, score in gbc_scorers.items()},
})

# Model 6 - Voting Classifier

In [12]:
# Voting ensemble over the logistic-regression, SVC and decision-tree models
# defined in earlier cells. No `voting` argument is passed, so the
# library-default voting mode applies.
voting_members = [("lr", lr), ("svc", svc), ("dtc", dtc)]
voting_clf = VotingClassifier(estimators=voting_members)
voting_clf.fit(X_train_scaled, y_train)

In [13]:
# Evaluate the voting ensemble on the held-out split.
y_pred_vote = voting_clf.predict(X_test_scaled)

# Assemble the metrics row, then add it to the shared results list.
vote_row = {
    "Model": "Voting Classifier",
    "Accuracy": accuracy_score(y_test, y_pred_vote),
    "Precision": precision_score(y_test, y_pred_vote),
    "Recall": recall_score(y_test, y_pred_vote),
    "F1 Score": f1_score(y_test, y_pred_vote),
}
results.append(vote_row)


# Model 7 - Stacking Classifier

In [14]:
# Stacked ensemble over the same three base models, using 5-fold
# cross-validation (cv=5). No final_estimator is passed, so the library
# default is used.
stacking_members = [("lr", lr), ("svc", svc), ("dtc", dtc)]
stacking_clf = StackingClassifier(estimators=stacking_members, cv=5)

stacking_clf.fit(X_train_scaled, y_train)

In [15]:
# Evaluate the stacked ensemble on the held-out split.
y_pred_stack = stacking_clf.predict(X_test_scaled)

# Evaluate each scorer in table-column order and record the row.
stack_scorers = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
results.append({
    "Model": "Stacking Classifier",
    **{label: score(y_test, y_pred_stack) for label, score in stack_scorers},
})


In [24]:
# Build the comparison table, ranked by recall (highest first).
# `results` is appended to by every model cell, so re-running one of those
# cells leaves more than one row for that model; keep only the most recent row
# per model so each appears once in the ranking.
results_df = (
    pd.DataFrame(results)
    .drop_duplicates(subset="Model", keep="last")
    .sort_values(by="Recall", ascending=False)
)

print(results_df)
print("\n")

# Derive the summary sentence from the top-ranked row instead of hard-coding a
# model name and percentages, so the text cannot drift from the table above.
best = results_df.iloc[0]
conclusions = (
    f"Based on recall performance, {best['Model']} was identified as the best model, "
    f"achieving a recall of {best['Recall']:.2%} and an accuracy of {best['Accuracy']:.2%} "
    "in classifying individuals as healthy or unhealthy."
)
print(conclusions)
print("\n")

# Last expression: styled table with a green gradient for the rich display.
results_df.style.background_gradient(cmap="Greens")


                 Model  Accuracy  Precision    Recall  F1 Score
1                  SVC  0.933508   0.932419  0.942574  0.937469
4  Gradient Classifier  0.940838   0.947159  0.940594  0.943865
6  Stacking Classifier  0.935602   0.941294  0.936634  0.938958
5    Voting Classifier  0.886911   0.866236  0.929703  0.896848
2        Decision Tree  0.813613   0.771144  0.920792  0.839350
3        Random Forest  0.835602   0.811828  0.897030  0.852305
0  Logistic Regression  0.822513   0.828599  0.837624  0.833087


Based on recall performance, the Support Vector Classifier (SVC) was identified as the best model, achieving a recall of 94.26% and an accuracy of 93.35% in classifying individuals as healthy or unhealthy.




Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
1,SVC,0.933508,0.932419,0.942574,0.937469
4,Gradient Classifier,0.940838,0.947159,0.940594,0.943865
6,Stacking Classifier,0.935602,0.941294,0.936634,0.938958
5,Voting Classifier,0.886911,0.866236,0.929703,0.896848
2,Decision Tree,0.813613,0.771144,0.920792,0.83935
3,Random Forest,0.835602,0.811828,0.89703,0.852305
0,Logistic Regression,0.822513,0.828599,0.837624,0.833087
