In [12]:
import pandas as pd
import mlflow
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('synthetic_asthma_dataset.csv')

In [4]:
df = data.drop(columns=['Patient_ID'])
df.drop(columns=['Asthma_Control_Level'], inplace=True)


In [5]:
df.head()

Unnamed: 0,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
0,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0
1,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.6,2,297.6,22.9,0
2,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0
3,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.6,1,438.0,40.1,1
4,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0


In [6]:
df.isnull().sum()

Age                           0
Gender                        0
BMI                           0
Smoking_Status                0
Family_History                0
Allergies                  2936
Air_Pollution_Level           0
Physical_Activity_Level       0
Occupation_Type               0
Comorbidities              4967
Medication_Adherence          0
Number_of_ER_Visits           0
Peak_Expiratory_Flow          0
FeNO_Level                    0
Has_Asthma                    0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      10000 non-null  int64  
 1   Gender                   10000 non-null  object 
 2   BMI                      10000 non-null  float64
 3   Smoking_Status           10000 non-null  object 
 4   Family_History           10000 non-null  int64  
 5   Allergies                7064 non-null   object 
 6   Air_Pollution_Level      10000 non-null  object 
 7   Physical_Activity_Level  10000 non-null  object 
 8   Occupation_Type          10000 non-null  object 
 9   Comorbidities            5033 non-null   object 
 10  Medication_Adherence     10000 non-null  float64
 11  Number_of_ER_Visits      10000 non-null  int64  
 12  Peak_Expiratory_Flow     10000 non-null  float64
 13  FeNO_Level               10000 non-null  float64
 14  Has_Asthma             

In [8]:
df['Allergies'] = df['Allergies'].fillna(df['Allergies'].mode()[0])
df['Comorbidities'] = df['Comorbidities'].fillna(df['Comorbidities'].mode()[0])


In [9]:
col = list(df.columns)
for i in col:
    if df[i].dtype == object:
        print(f"{i} ------>  {df[i].unique()}")


Gender ------>  ['Female' 'Male' 'Other']
Smoking_Status ------>  ['Former' 'Never' 'Current']
Allergies ------>  ['Dust' 'Multiple' 'Pollen' 'Pets']
Air_Pollution_Level ------>  ['Moderate' 'Low' 'High']
Physical_Activity_Level ------>  ['Sedentary' 'Moderate' 'Active']
Occupation_Type ------>  ['Outdoor' 'Indoor']
Comorbidities ------>  ['Diabetes' 'Both' 'Hypertension']


In [20]:
def onehotencoding(df):
    cols = ['Gender', 'Smoking_Status', 'Allergies']
    encoder = OneHotEncoder(sparse_output=False)
    for col in cols:
        if col in df.columns:
            encoded = encoder.fit_transform(df[[col]])
            encoded_cols = encoder.get_feature_names_out([col])
            encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)
            df = pd.concat([df.drop(columns=[col]), encoded_df], axis=1)
    return df

def ordinalencoding(df):
    cols = ['Air_Pollution_Level', 'Physical_Activity_Level', 'Occupation_Type', 'Comorbidities']
    encoder = OrdinalEncoder()
    for col in cols:
        if col in df.columns:
            df[col] = encoder.fit_transform(df[[col]])
    return df


In [21]:
df = onehotencoding(df)
df = ordinalencoding(df)
df.head()


Unnamed: 0,Age,BMI,Family_History,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,...,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,Allergies_Dust,Allergies_Multiple,Allergies_Pets,Allergies_Pollen,Comorbidities_Both,Comorbidities_Diabetes,Comorbidities_Hypertension
0,52,27.6,1,2.0,2.0,1.0,0.38,0,421.0,46.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,15,24.6,0,1.0,1.0,0.0,0.6,2,297.6,22.9,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,72,17.6,0,2.0,1.0,0.0,0.38,0,303.3,15.3,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,61,16.8,0,0.0,2.0,1.0,0.6,1,438.0,40.1,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,21,30.2,0,2.0,0.0,0.0,0.82,3,535.0,27.7,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
cols = ['Air_Pollution_Level', 'Physical_Activity_Level', 'Occupation_Type']
for i in cols:
    uni = df[i].unique
    print(f'{i} ----> {uni}')

Air_Pollution_Level ----> <bound method Series.unique of 0       2.0
1       1.0
2       2.0
3       0.0
4       2.0
       ... 
9995    1.0
9996    1.0
9997    1.0
9998    2.0
9999    0.0
Name: Air_Pollution_Level, Length: 10000, dtype: float64>
Physical_Activity_Level ----> <bound method Series.unique of 0       2.0
1       1.0
2       1.0
3       2.0
4       0.0
       ... 
9995    2.0
9996    1.0
9997    1.0
9998    1.0
9999    1.0
Name: Physical_Activity_Level, Length: 10000, dtype: float64>
Occupation_Type ----> <bound method Series.unique of 0       1.0
1       0.0
2       0.0
3       1.0
4       0.0
       ... 
9995    0.0
9996    0.0
9997    0.0
9998    0.0
9999    1.0
Name: Occupation_Type, Length: 10000, dtype: float64>


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      10000 non-null  int64  
 1   Gender                   10000 non-null  int64  
 2   BMI                      10000 non-null  float64
 3   Smoking_Status           10000 non-null  int64  
 4   Family_History           10000 non-null  int64  
 5   Allergies                10000 non-null  int64  
 6   Air_Pollution_Level      10000 non-null  float64
 7   Physical_Activity_Level  10000 non-null  float64
 8   Occupation_Type          10000 non-null  float64
 9   Comorbidities            10000 non-null  float64
 10  Medication_Adherence     10000 non-null  float64
 11  Number_of_ER_Visits      10000 non-null  int64  
 12  Peak_Expiratory_Flow     10000 non-null  float64
 13  FeNO_Level               10000 non-null  float64
 14  Has_Asthma             

In [12]:
df.describe()

Unnamed: 0,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,44.9307,0.5586,25.05332,1.4627,0.3034,1.0168,1.2814,1.2033,0.2965,1.1032,0.497998,1.0159,400.88409,25.10142,0.2433
std,25.653559,0.571488,4.874466,0.732982,0.459749,1.224528,0.788972,0.753542,0.456737,0.538311,0.224809,1.020564,97.531113,9.840184,0.429096
min,1.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,5.0,0.0
25%,23.0,0.0,21.6,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.32,0.0,334.8,18.2,0.0
50%,45.0,1.0,25.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.5,1.0,402.5,25.0,0.0
75%,67.0,1.0,28.4,2.0,1.0,2.0,2.0,2.0,1.0,1.0,0.67,2.0,468.7,31.7,0.0
max,89.0,2.0,45.0,2.0,1.0,3.0,2.0,2.0,1.0,2.0,0.99,6.0,600.0,63.9,1.0


In [12]:
def remove_outliers_iqr(df, columns):
    for col in columns:
        if col in df.columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df = df[(df[col] >= 10) & (df[col] <= upper_bound)]
            
    return df


In [14]:
numeric_cols = ['Age', 'BMI', 'Peak_Expiratory_Flow', 'FeNO_Level']
df = remove_outliers_iqr(df, numeric_cols)


In [15]:
df.describe()

Unnamed: 0,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
count,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0,8374.0
mean,49.388584,0.551469,24.995665,1.461786,0.30129,1.013136,1.2786,1.202173,0.296155,1.103057,0.498461,1.014808,401.122606,26.237187,0.241939
std,23.225396,0.567788,4.829302,0.734184,0.458846,1.223528,0.790109,0.755467,0.456587,0.537839,0.225718,1.019467,97.840833,8.652674,0.428283
min,10.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,10.0,0.0
25%,29.0,0.0,21.6,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.32,0.0,334.8,19.6,0.0
50%,49.0,1.0,25.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.5,1.0,402.5,25.6,0.0
75%,69.0,1.0,28.4,2.0,1.0,2.0,2.0,2.0,1.0,1.0,0.67,2.0,469.4,32.1,0.0
max,89.0,2.0,38.5,2.0,1.0,3.0,2.0,2.0,1.0,2.0,0.99,6.0,600.0,51.8,1.0


In [16]:
x = df.drop(columns=['Has_Asthma'])
y = df['Has_Asthma']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'accuracy {accuracy}')
print(f'pre {precision}')
print(f're {recall}')
print(f'f1 {f1}')

accuracy 0.9319402985074627
pre 0.8787878787878788
re 0.8405797101449275
f1 0.8592592592592593


In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random forest 100 estimators")

In [30]:
import dagshub
import mlflow
import logging
import os
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier


logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
log = logging.getLogger(__name__)

with mlflow.start_run():
    start_time = time.time()

    try:
        log.info("Logging preprocessing parameters...")
        mlflow.log_param("OneHotEncoder", "OrdinalEncoder")
        mlflow.log_param("removing outlier Q1", 0.25)
        mlflow.log_param("removing outlier Q2", 0.75)

        log.info("Initializing model...")
        model = RandomForestClassifier(n_estimators=100, random_state=42)

        log.info("Fitting model...")
        model.fit(x_train, y_train)
        log.info("Model training completed.")

        log.info("Making predictions...")
        y_pred = model.predict(x_test)

        log.info("Calculating metrics...")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        log.info("Logging metrics...")
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        log.info("Saving model...")
        mlflow.sklearn.log_model(model, "model")

        end_time = time.time()
        log.info(f"Run completed in {end_time - start_time:.2f} seconds.")
        log.info(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

    except Exception as e:
        log.error(f"An error occurred: {e}", exc_info=True)




KeyboardInterrupt: 

In [10]:
df2 = pd.read_csv(r'C:\Users\sfed\Desktop\my-proj\china_cancer_patient_project\data\raw\preprocessed_data.csv')
df2.head()

Unnamed: 0,Age,BMI,Family_History,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,...,Smoking_Status_Current,Smoking_Status_Former,Smoking_Status_Never,Allergies_Dust,Allergies_Multiple,Allergies_Pets,Allergies_Pollen,Comorbidities_Both,Comorbidities_Diabetes,Comorbidities_Hypertension
0,52,27.6,1,2.0,2.0,1.0,0.38,0,421.0,46.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,15,24.6,0,1.0,1.0,0.0,0.6,2,297.6,22.9,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,72,17.6,0,2.0,1.0,0.0,0.38,0,303.3,15.3,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,61,16.8,0,0.0,2.0,1.0,0.6,1,438.0,40.1,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,21,30.2,0,2.0,0.0,0.0,0.82,3,535.0,27.7,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
import pandas as pd
df = pd.read_csv('synthetic_asthma_dataset.csv')

In [None]:
df.drop(columns=['Patient_ID'], inplace=True)

Unnamed: 0,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma,Asthma_Control_Level
0,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0,
1,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.60,2,297.6,22.9,0,
2,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0,
3,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.60,1,438.0,40.1,1,Poorly Controlled
4,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,70,Male,25.0,Never,0,,Low,Sedentary,Indoor,,0.67,0,580.6,18.7,0,
9996,78,Female,24.8,Never,0,Pollen,Low,Moderate,Indoor,Diabetes,0.72,1,417.6,40.8,0,
9997,58,Male,30.1,Former,1,Pollen,Low,Moderate,Indoor,,0.28,0,459.1,20.3,1,Not Controlled
9998,88,Female,31.2,Former,0,Pollen,Moderate,Moderate,Indoor,,0.44,0,415.9,25.0,0,


In [9]:
all_cols = df.columns
target  = ['Allergies', 'Occupation_Type', 'Comorbidities', ]

for i in all_cols:
    for k in target:
        if k == i:
            unique =  df[k].unique()
            print(f'{k} ----> {unique}')

Allergies ----> [nan 'Dust' 'Multiple' 'Pollen' 'Pets']
Occupation_Type ----> ['Outdoor' 'Indoor']
Comorbidities ----> ['Diabetes' 'Both' nan 'Hypertension']


In [7]:
import pandas as pd

# Load your dataset
df = pd.read_csv(r"C:\Users\sfed\Desktop\my-proj\china_cancer_patient_project\preprocessed_data_2.csv")

# ====== 1️⃣ Generate mapping (Label Encoding style) ======
# Only for object (categorical) columns
mapping = {
    col: {val: i for i, val in enumerate(df[col].dropna().unique())}
    for col in df.columns if df[col].dtype != int
}

print("=== Label Encoding Mapping ===")
for col, map_dict in mapping.items():
    print(f'    "{col}": {map_dict},')
print("\n")

# ====== 2️⃣ Apply Label Encoding ======
df_label_encoded = df.replace(mapping)

# ====== 3️⃣ Perform One-Hot Encoding ======
df_one_hot = pd.get_dummies(df, drop_first=False)

# ====== 4️⃣ Save or display ======
df_label_encoded.to_csv("label_encoded_data.csv", index=False)
df_one_hot.to_csv("one_hot_encoded_data.csv", index=False)

print("✅ Label encoding and one-hot encoding completed!")
print("Files saved as 'label_encoded_data.csv' and 'one_hot_encoded_data.csv'")


=== Label Encoding Mapping ===
    "BMI": {np.float64(27.6): 0, np.float64(24.6): 1, np.float64(17.6): 2, np.float64(16.8): 3, np.float64(30.2): 4, np.float64(27.8): 5, np.float64(32.3): 6, np.float64(29.7): 7, np.float64(23.1): 8, np.float64(15.0): 9, np.float64(28.0): 10, np.float64(24.1): 11, np.float64(27.1): 12, np.float64(20.9): 13, np.float64(23.5): 14, np.float64(19.8): 15, np.float64(23.8): 16, np.float64(20.3): 17, np.float64(22.4): 18, np.float64(22.8): 19, np.float64(35.6): 20, np.float64(24.4): 21, np.float64(27.2): 22, np.float64(29.6): 23, np.float64(24.3): 24, np.float64(20.1): 25, np.float64(23.4): 26, np.float64(24.2): 27, np.float64(20.6): 28, np.float64(21.4): 29, np.float64(22.0): 30, np.float64(26.6): 31, np.float64(25.6): 32, np.float64(23.9): 33, np.float64(26.8): 34, np.float64(22.2): 35, np.float64(31.0): 36, np.float64(38.1): 37, np.float64(21.7): 38, np.float64(22.6): 39, np.float64(18.5): 40, np.float64(29.3): 41, np.float64(34.6): 42, np.float64(23.6): 43,