In [None]:
import pandas as pd
import pandas as pd
from lazypredict.Supervised import LazyClassifier, CLASSIFIERS
from sklearn.model_selection import train_test_split
import mlflow
from dotenv import load_dotenv 
import os 

# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Address of the MLflow tracking server, taken from the environment.
mlflow_tracking_uri = os.getenv("MLFLOW_TRACKING_URI")

if not mlflow_tracking_uri:
    # Nothing configured: warn and leave MLflow's tracking URI untouched.
    print("MLFLOW_TRACKING_URI not found in .env or environment variables. Please set it.")
else:
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    print(f"MLflow Tracking URI set to: {mlflow.get_tracking_uri()} (from .env)")

# Group every run started below under a single named experiment.
mlflow.set_experiment("Reaction Outcome")

# Read the training table; the path is relative to the current working directory.
df = pd.read_parquet("../data/fd_train_df.parquet")
# Report the load only after read_parquet has returned, so the message is
# never printed for a read that failed.
print("Loaded Dataset")

# Features are every column except the target; the target is cast to int labels.
X = df.drop(columns=["reaction_outcome"])
y = df["reaction_outcome"].astype(int)

# Hold out 20% for evaluation. Stratify on `y` itself so the class
# proportions are preserved for exactly the labels the models are fitted on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define models to exclude
# Alternative, shorter exclusion list (currently disabled):
# exclude_models = [
#     "ExtraTreesClassifier",
#     "BernoulliNB",
#     "CalibratedClassifierCV",
#     "GaussianNB",
#     "DummyClassifier",
#     "KNeighborsClassifier",
#     "FixedThresholdClassifier",  # This is not in sklearn, might be a custom one
#     "CategoricalNB"
# ]

# Names of the estimators that LazyClassifier should NOT train.
exclude_models = [
    "RandomForestClassifier",
    "DecisionTreeClassifier",
    "BaggingClassifier",
    "ExtraTreesClassifier",
    "ExtraTreeClassifier",
    "AdaBoostClassifier",
    "BernoulliNB",
    "QuadraticDiscriminantAnalysis",
    "LinearDiscriminantAnalysis",
    "CalibratedClassifierCV",
    "SGDClassifier",
    "LogisticRegression",
    "LinearSVC",
    "GaussianNB",
    "RidgeClassifierCV",
    "RidgeClassifier",
    "NearestCentroid",
    "Perceptron",
    "PassiveAggressiveClassifier",
    "DummyClassifier",
    "SVC",
    "NuSVC",
    "LabelSpreading",
    "LabelPropagation",
    "KNeighborsClassifier",
    "FixedThresholdClassifier",
    "CategoricalNB",
]

# CLASSIFIERS holds (name, estimator_class) pairs, but LazyClassifier's
# `classifiers` argument expects the estimator classes themselves: it reads
# `__name__` from each entry. Passing the pairs through unchanged makes it
# print "'tuple' object has no attribute '__name__'" / "Invalid Classifier(s)",
# so keep only the class of every pair whose name is not excluded.
custom_classifiers = [
    estimator for name, estimator in CLASSIFIERS if name not in exclude_models
]


# Initialize LazyClassifier, restricted to the estimators selected above.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=False,
    random_state=42,
    classifiers=custom_classifiers,
)

try:
    # fit() trains every selected estimator and returns a results table
    # plus a predictions object.
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
except Exception as e:
    # The fit failed, so this run produced no `models`. Report the error and
    # skip the summary instead of hitting a NameError (or printing stale
    # results left over from an earlier execution).
    print("Error occurred:", e)
else:
    # Only claim success, and show the table, when fit() actually completed.
    print("\n--- LazyPredict Results ---")
    print(models)
    print(f"\nResults have been logged to your MLflow Tracking Server at {mlflow.get_tracking_uri()}")

MLflow Tracking URI set to: http://localhost:5555/ (from .env)
Loaded Dataset


2025/07/14 12:44:35 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/07/14 12:44:35 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/07/14 12:44:35 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/07/14 12:44:35 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  0%|          | 0/5 [00:00<?, ?it/s]

🏃 View run LazyClassifier-SelfTrainingClassifier at: http://localhost:5555/#/experiments/362376687284450068/runs/01bb90d3c68c47de92c299e0bf6fc063
🧪 View experiment at: http://localhost:5555/#/experiments/362376687284450068
🏃 View run LazyClassifier-StackingClassifier at: http://localhost:5555/#/experiments/362376687284450068/runs/67f737d5a25a46188872afdd55a280ab
🧪 View experiment at: http://localhost:5555/#/experiments/362376687284450068
🏃 View run LazyClassifier-TunedThresholdClassifierCV at: http://localhost:5555/#/experiments/362376687284450068/runs/3d6343efb072495d8abcaaf05ade8b89
🧪 View experiment at: http://localhost:5555/#/experiments/362376687284450068


Successfully registered model 'lazy_classifier_XGBClassifier'.
2025/07/14 12:49:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lazy_classifier_XGBClassifier, version 1
Created version '1' of model 'lazy_classifier_XGBClassifier'.


🏃 View run LazyClassifier-XGBClassifier at: http://localhost:5555/#/experiments/362376687284450068/runs/761fa6841751431ea06c95c53c33e76f
🧪 View experiment at: http://localhost:5555/#/experiments/362376687284450068
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 367
[LightGBM] [Info] Number of data points in the train set: 9637319, number of used features: 6
[LightGBM] [Info] Start training from score -2.753146
[LightGBM] [Info] Start training from score -3.550002
[LightGBM] [Info] Start training from score -1.662801
[LightGBM] [Info] Start training from score -6.510903
[LightGBM] [Info] Start training from score -0.791746
[LightGBM] [Info] Start training from score -1.334078


Successfully registered model 'lazy_classifier_LGBMClassifier'.
2025/07/14 12:54:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lazy_classifier_LGBMClassifier, version 1
Created version '1' of model 'lazy_classifier_LGBMClassifier'.


🏃 View run LazyClassifier-LGBMClassifier at: http://localhost:5555/#/experiments/362376687284450068/runs/fd958b2c9c84468aaf55e47a3bed29b1
🧪 View experiment at: http://localhost:5555/#/experiments/362376687284450068

--- LazyPredict Results ---
                Accuracy  Balanced Accuracy ROC AUC  F1 Score  Time Taken
Model                                                                    
XGBClassifier       0.72               0.44    None      0.69      267.44
LGBMClassifier      0.71               0.42    None      0.68      309.08

Results have been logged to your MLflow Tracking Server at http://localhost:5555/
