# Classifier 

In [2]:
from io import StringIO

from metaflow import FlowSpec, NBRunner, step, IncludeFile
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier


class ClassifierFlow(FlowSpec):
    """Train and compare two classifiers on a train/test CSV pair.

    The flow ingests the two CSV files, standardises the features and
    encodes the ``target`` column, fits a logistic regression and a random
    forest in parallel branches, and prints a classification report for
    each model in the join step.
    """

    # NOTE: despite the "_path" suffix, inside a step these attributes hold
    # the text *content* of the included files (they are fed to StringIO in
    # ``ingest``), not filesystem paths.
    train_path = IncludeFile(
        name="train_path",
        default="./data/train.csv",
        is_text=True,
    )
    test_path = IncludeFile(
        name="test_path",
        default="./data/test.csv",
        is_text=True,
    )

    @step
    def start(self):
        """Announce the run and hand over to ``ingest``."""
        print("Fitting classifiers...")
        self.next(self.ingest)

    @step
    def ingest(self):
        """Parse the included CSV text into the ``train``/``test`` frames."""
        self.train = pd.read_csv(StringIO(self.train_path))
        print(f"Train dataset loaded with shape {self.train.shape}")
        self.test = pd.read_csv(StringIO(self.test_path))
        print(f"Test dataset loaded with shape {self.test.shape}")
        self.next(self.preprocess)

    @step
    def preprocess(self):
        """Scale the features and encode the labels, then fan out.

        Every column except ``target`` is treated as a feature. The scaler
        and the label encoder are fitted on the training split only and then
        applied unchanged to the test split, so no test-set statistics leak
        into the transformation.
        """
        self.scaler = StandardScaler()
        self.encoder = LabelEncoder()
        X_train = self.train.drop(columns=["target"])
        y_train = self.train["target"]
        X_test = self.test.drop(columns=["target"])
        y_test = self.test["target"]
        self.X_train = self.scaler.fit_transform(X_train)
        self.y_train = self.encoder.fit_transform(y_train)
        self.X_test = self.scaler.transform(X_test)
        self.y_test = self.encoder.transform(y_test)
        # Static branch: both model-fitting steps receive the same artifacts.
        self.next(self.fit_lr, self.fit_rf)

    @step
    def fit_lr(self):
        """Fit a logistic regression with default hyper-parameters."""
        self.model_name = "Logistic Regression"
        self.model = LogisticRegression()
        self.model.fit(self.X_train, self.y_train)
        self.next(self.evaluate)

    @step
    def fit_rf(self):
        """Fit a small random forest (20 trees, depth capped at 5)."""
        self.model_name = "Random Forest"
        # NOTE(review): no random_state is passed, so the fitted forest may
        # differ between runs - consider seeding if reproducibility matters.
        self.model = RandomForestClassifier(n_estimators=20, max_depth=5)
        self.model.fit(self.X_train, self.y_train)
        self.next(self.evaluate)

    @step
    def evaluate(self, model_steps):
        """Join the model branches and print train/test reports for each.

        Args:
            model_steps: The joined branch inputs; each one exposes the
                ``model``, ``model_name`` and the preprocessed
                ``X_train``/``y_train``/``X_test``/``y_test`` artifacts.
        """
        print("Evaluating")
        # ``branch`` rather than ``step`` so the imported ``step`` decorator
        # is not shadowed inside this method.
        for branch in model_steps:
            print(f"-> {branch.model_name}")

        print()
        for branch in model_steps:
            print(f"{branch.model_name} Classification Report")
            print("-> Train")
            y_train_pred = branch.model.predict(branch.X_train)
            print(classification_report(branch.y_train, y_train_pred))
            print("-> Test")
            y_test_pred = branch.model.predict(branch.X_test)
            print(classification_report(branch.y_test, y_test_pred))

        self.next(self.end)

    @step
    def end(self):
        """Mark the end of the flow."""
        print("Finished fitting classifiers")


# Execute the flow from within the notebook; the resulting object is kept in
# ``run`` for later inspection.
run = NBRunner(
    ClassifierFlow,
    base_dir="./artifacts",
).nbrun()

Metaflow 2.13.9 executing ClassifierFlow for user:jeera
Validating your flow...
    The graph looks good!
Running pylint...
    Pylint is happy!
Including file ./data/test.csv of size 617B 
Including file ./data/train.csv of size 2KB 
2025-02-23 02:34:58.305 Workflow starting (run-id 1740252898303802):
2025-02-23 02:34:58.315 [1740252898303802/start/1 (pid 87994)] Task is starting.
2025-02-23 02:34:58.972 [1740252898303802/start/1 (pid 87994)] Fitting classifiers...
2025-02-23 02:34:59.055 [1740252898303802/start/1 (pid 87994)] Task finished successfully.
2025-02-23 02:34:59.062 [1740252898303802/ingest/2 (pid 87997)] Task is starting.
2025-02-23 02:34:59.728 [1740252898303802/ingest/2 (pid 87997)] Train dataset loaded with shape (120, 5)
2025-02-23 02:34:59.729 [1740252898303802/ingest/2 (pid 87997)] Test dataset loaded with shape (30, 5)
2025-02-23 02:34:59.811 [1740252898303802/ingest/2 (pid 87997)] Task finished successfully.
2025-02-23 02:34:59.817 [1740252898303802/preprocess/3 (