In [17]:
import pandas as pd
import mlflow
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [18]:
data = pd.read_csv('synthetic_asthma_dataset.csv')

In [19]:
df = data.drop(columns=['Patient_ID'])
df.drop(columns=['Asthma_Control_Level'], inplace=True)


In [20]:
df.head()

Unnamed: 0,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
0,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0
1,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.6,2,297.6,22.9,0
2,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0
3,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.6,1,438.0,40.1,1
4,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0


In [21]:
df.isnull().sum()

Age                           0
Gender                        0
BMI                           0
Smoking_Status                0
Family_History                0
Allergies                  2936
Air_Pollution_Level           0
Physical_Activity_Level       0
Occupation_Type               0
Comorbidities              4967
Medication_Adherence          0
Number_of_ER_Visits           0
Peak_Expiratory_Flow          0
FeNO_Level                    0
Has_Asthma                    0
dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      10000 non-null  int64  
 1   Gender                   10000 non-null  object 
 2   BMI                      10000 non-null  float64
 3   Smoking_Status           10000 non-null  object 
 4   Family_History           10000 non-null  int64  
 5   Allergies                7064 non-null   object 
 6   Air_Pollution_Level      10000 non-null  object 
 7   Physical_Activity_Level  10000 non-null  object 
 8   Occupation_Type          10000 non-null  object 
 9   Comorbidities            5033 non-null   object 
 10  Medication_Adherence     10000 non-null  float64
 11  Number_of_ER_Visits      10000 non-null  int64  
 12  Peak_Expiratory_Flow     10000 non-null  float64
 13  FeNO_Level               10000 non-null  float64
 14  Has_Asthma             

In [23]:
df['Allergies'] = df['Allergies'].fillna(df['Allergies'].mode()[0])
df['Comorbidities'] = df['Comorbidities'].fillna(df['Comorbidities'].mode()[0])


In [24]:
col = list(df.columns)
for i in col:
    if df[i].dtype == object:
        print(f"{i} ------>  {df[i].unique()}")


Gender ------>  ['Female' 'Male' 'Other']
Smoking_Status ------>  ['Former' 'Never' 'Current']
Allergies ------>  ['Dust' 'Multiple' 'Pollen' 'Pets']
Air_Pollution_Level ------>  ['Moderate' 'Low' 'High']
Physical_Activity_Level ------>  ['Sedentary' 'Moderate' 'Active']
Occupation_Type ------>  ['Outdoor' 'Indoor']
Comorbidities ------>  ['Diabetes' 'Both' 'Hypertension']


In [25]:
def onehotencoding(df):
    cols = ['Gender', 'Smoking_Status', 'Allergies', 'Comorbidities']
    encoder = OneHotEncoder(sparse_output=False)
    for col in cols:
        if col in df.columns:
            encoded = encoder.fit_transform(df[[col]])
            encoded_cols = encoder.get_feature_names_out([col])
            encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)
            df = pd.concat([df.drop(columns=[col]), encoded_df], axis=1)
    return df

def ordinalencoding(df):
    cols = ['Air_Pollution_Level', 'Physical_Activity_Level', 'Occupation_Type', 'Comorbidities']
    encoder = OrdinalEncoder()
    for col in cols:
        if col in df.columns:
            df[col] = encoder.fit_transform(df[[col]])
    return df


In [26]:
df = onehotencoding(df)
df = ordinalencoding(df)
df.head()


Unnamed: 0,Age,BMI,Family_History,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,...,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,Allergies_Dust,Allergies_Multiple,Allergies_Pets,Allergies_Pollen,Comorbidities_Both,Comorbidities_Diabetes,Comorbidities_Hypertension
0,52,27.6,1,2.0,2.0,1.0,0.38,0,421.0,46.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,15,24.6,0,1.0,1.0,0.0,0.6,2,297.6,22.9,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,72,17.6,0,2.0,1.0,0.0,0.38,0,303.3,15.3,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,61,16.8,0,0.0,2.0,1.0,0.6,1,438.0,40.1,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,21,30.2,0,2.0,0.0,0.0,0.82,3,535.0,27.7,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         10000 non-null  int64  
 1   BMI                         10000 non-null  float64
 2   Family_History              10000 non-null  int64  
 3   Air_Pollution_Level         10000 non-null  float64
 4   Physical_Activity_Level     10000 non-null  float64
 5   Occupation_Type             10000 non-null  float64
 6   Medication_Adherence        10000 non-null  float64
 7   Number_of_ER_Visits         10000 non-null  int64  
 8   Peak_Expiratory_Flow        10000 non-null  float64
 9   FeNO_Level                  10000 non-null  float64
 10  Has_Asthma                  10000 non-null  int64  
 11  Gender_Female               10000 non-null  float64
 12  Gender_Male                 10000 non-null  float64
 13  Gender_Other                1000

In [28]:
df.describe()

Unnamed: 0,Age,BMI,Family_History,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,...,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,Allergies_Dust,Allergies_Multiple,Allergies_Pets,Allergies_Pollen,Comorbidities_Both,Comorbidities_Diabetes,Comorbidities_Hypertension
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,44.9307,25.05332,0.3034,1.2814,1.2033,0.2965,0.497998,1.0159,400.88409,25.10142,...,0.1443,0.2487,0.607,0.5415,0.1001,0.1585,0.1999,0.0986,0.6996,0.2018
std,25.653559,4.874466,0.459749,0.788972,0.753542,0.456737,0.224809,1.020564,97.531113,9.840184,...,0.351411,0.432281,0.488441,0.4983,0.300148,0.365227,0.399945,0.298139,0.458455,0.401364
min,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,21.6,0.0,1.0,1.0,0.0,0.32,0.0,334.8,18.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,45.0,25.0,0.0,1.0,1.0,0.0,0.5,1.0,402.5,25.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,67.0,28.4,1.0,2.0,2.0,1.0,0.67,2.0,468.7,31.7,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
max,89.0,45.0,1.0,2.0,2.0,1.0,0.99,6.0,600.0,63.9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
def remove_outliers_iqr(df, columns):
    for col in columns:
        if col in df.columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df = df[(df[col] >= 10) & (df[col] <= upper_bound)]
            
    return df


In [30]:
numeric_cols = ['Age', 'BMI', 'Peak_Expiratory_Flow', 'FeNO_Level']
df = remove_outliers_iqr(df, numeric_cols)


In [31]:
df.describe()

Unnamed: 0,Age,BMI,Family_History,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,...,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,Allergies_Dust,Allergies_Multiple,Allergies_Pets,Allergies_Pollen,Comorbidities_Both,Comorbidities_Diabetes,Comorbidities_Hypertension
count,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,...,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0
mean,49.388584,24.995665,0.30129,1.2786,1.202173,0.296155,0.498461,1.014808,401.122606,26.237187,...,0.145211,0.247791,0.606998,0.543468,0.0984,0.159661,0.198471,0.0984,0.700143,0.201457
std,23.225396,4.829302,0.458846,0.790109,0.755467,0.456587,0.225718,1.019467,97.840833,8.652674,...,0.352335,0.431755,0.488446,0.498137,0.297872,0.366313,0.398873,0.297872,0.458222,0.401112
min,10.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,29.0,21.6,0.0,1.0,1.0,0.0,0.32,0.0,334.8,19.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,49.0,25.0,0.0,1.0,1.0,0.0,0.5,1.0,402.5,25.6,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,69.0,28.4,1.0,2.0,2.0,1.0,0.67,2.0,469.4,32.1,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
max,89.0,38.5,1.0,2.0,2.0,1.0,0.99,6.0,600.0,51.8,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
x = df.drop(columns=['Has_Asthma'])
y = df['Has_Asthma']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [47]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random forest 100 estimators")

<Experiment: artifact_location='file:///C:/Users/sfed/Desktop/my-proj/china_cancer_patient_project/mlruns/1', creation_time=1760032184734, experiment_id='1', last_update_time=1760032184734, lifecycle_stage='active', name='random forest 100 estimators', tags={}>

In [48]:
import dagshub
import mlflow
import logging
import os
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier


logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger(__name__)

with mlflow.start_run():
    start_time = time.time()

    try:
        log.info("Logging preprocessing parameters...")
        mlflow.log_param("OneHotEncoder", "OrdinalEncoder")
        mlflow.log_param("removing outlier Q1", 0.25)
        mlflow.log_param("removing outlier Q2", 0.75)

        log.info("Initializing model...")
        model = RandomForestClassifier(n_estimators=100, random_state=42)

        log.info("Fitting model...")
        model.fit(x_train, y_train)
        log.info("Model training completed.")

        log.info("Making predictions...")
        y_pred = model.predict(x_test)

        log.info("Calculating metrics...")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        log.info("Logging metrics...")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        log.info("Saving model...")
        mlflow.sklearn.log_model(model, "model")

        end_time = time.time()
        log.info(f"Run completed in {end_time - start_time:.2f} seconds.")
        log.info(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

    except Exception as e:
        log.error(f"An error occurred: {e}", exc_info=True)


2025-10-09 23:00:32,510 - INFO - Logging preprocessing parameters...
2025-10-09 23:00:32,564 - INFO - Initializing model...
2025-10-09 23:00:32,566 - INFO - Fitting model...
2025-10-09 23:00:33,224 - INFO - Model training completed.
2025-10-09 23:00:33,224 - INFO - Making predictions...
2025-10-09 23:00:33,252 - INFO - Calculating metrics...
2025-10-09 23:00:33,264 - INFO - Logging metrics...
2025-10-09 23:00:33,334 - INFO - Saving model...
2025-10-09 23:00:37,914 - INFO - Run completed in 5.40 seconds.
2025-10-09 23:00:37,914 - INFO - Accuracy: 0.9307462686567164, Precision: 0.9005376344086021, Recall: 0.8091787439613527, F1: 0.8524173027989822


🏃 View run calm-wren-310 at: http://127.0.0.1:5000/#/experiments/1/runs/528e86c827794c189d327b1343c89dc9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
