In [0]:
import random
import pandas as pd
import mlflow
import mlflow.sklearn
import mlflow.spark

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression as SparkLogReg
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
# Synthetic student dataset: 500 rows, seeded so every run yields the same table.
random.seed(42)


def _draw_student():
    """Draw one synthetic student record.

    Returns a list ``[study_hours, attendance, previous_score, assignments,
    passed]`` where ``passed`` is 1 only when every feature clears its
    threshold and 0 otherwise.
    """
    # The order of these four draws fixes the random stream; do not reorder.
    hours = round(random.uniform(0, 6), 1)
    attendance = random.randint(50, 100)
    prior_score = random.randint(30, 100)
    assignments = random.randint(40, 100)

    # A student passes only if all four thresholds are exceeded.
    cleared = all((
        hours > 2,
        attendance > 70,
        prior_score > 50,
        assignments > 60,
    ))

    return [hours, attendance, prior_score, assignments, int(cleared)]


data = [_draw_student() for _ in range(500)]

# Column order mirrors the order of values in each generated record.
df = pd.DataFrame(
    data,
    columns=[
        "study_hours",
        "attendance_pct",
        "previous_score",
        "assignments_completed",
        "pass",
    ],
)

# Separate the "pass" label from the feature columns.
y = df["pass"]
X = df.drop(columns=["pass"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.sklearn

# Candidate classifiers, keyed by the name reused for the MLflow run name and
# the model artifact path.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    # random_state is pinned so the tree is reproducible: scikit-learn permutes
    # the features at every split even with the default "best" splitter, so an
    # unseeded tree (and its logged accuracy) can differ between runs.
    "DecisionTree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Test-set accuracy per model name, filled in by the loop below.
results = {}

# Ensure no active MLflow run
mlflow.end_run()

# NOTE(review): every candidate is registered under the same registered model
# name, so each iteration creates a new version of "student_pass_model"
# regardless of how well that candidate scored.
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Train
        model.fit(X_train, y_train)

        # Predict on the held-out split and score it
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)

        # Infer the model signature from the training inputs and the model's
        # predictions on them (REQUIRED for Unity Catalog)
        signature = infer_signature(X_train, model.predict(X_train))

        # Log params & metrics
        mlflow.log_param("model_type", name)
        mlflow.log_metric("accuracy", acc)

        # Log + register model WITH signature
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=name,
            registered_model_name="student_pass_model",
            signature=signature
        )

        results[name] = acc

print("📊 Model Accuracies:", results)


Registered model 'student_pass_model' already exists. Creating a new version of this model...
Created version '1' of model 'workspace.default.student_pass_model'.
Registered model 'student_pass_model' already exists. Creating a new version of this model...
Created version '2' of model 'workspace.default.student_pass_model'.
Registered model 'student_pass_model' already exists. Creating a new version of this model...


📊 Model Accuracies: {'LogisticRegression': 0.83, 'DecisionTree': 1.0, 'RandomForest': 1.0}


Created version '3' of model 'workspace.default.student_pass_model'.


In [0]:
# Re-infer the signature with explicit dtypes: float64 inputs, int64 outputs.
# NOTE(review): `model` is whatever the training loop last bound it to, so this
# cell relies on that loop having already run in the current kernel.
signature_inputs = X_train.astype("float64")
signature_outputs = model.predict(X_train).astype("int64")

signature = infer_signature(signature_inputs, signature_outputs)


In [0]:
# Pick the highest-accuracy entry; on a tie, max() keeps the first one in the
# dict's insertion order.
best_model_name, best_accuracy = max(
    results.items(),
    key=lambda entry: entry[1],
)

print(f"🏆 Best Sklearn Model: {best_model_name} ({best_accuracy:.4f})")


🏆 Best Sklearn Model: DecisionTree (1.0000)
