In [1]:
# Install MLflow
!pip install mlflow -q

# Import libraries
import os
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score
from google.colab import drive


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.4/28.4 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.7/648.7 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Directory (inside the mounted Drive) that holds the dataset CSV files
data_path = "/content/drive/MyDrive/dvc_project/data"

# Attach Google Drive to the Colab filesystem so data_path becomes readable
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
print("Loading data...")

from sklearn.feature_extraction.text import TfidfVectorizer

# Read the training and validation splits (in that order) from the dataset directory
train, valid = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("train.csv", "validation.csv")
)

# Pull out the raw text column and the label column of each split
X_train_raw = train["text"]
y_train = train["label"]
X_valid_raw = valid["text"]
y_valid = valid["label"]

# Fit the TF-IDF vocabulary on the training text only, then reuse the
# fitted vectorizer for the validation text so both share one feature space
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_valid = vectorizer.transform(X_valid_raw)

print("Text data successfully vectorized!")

print("Data loaded successfully!")


Loading data...
Text data successfully vectorized!
Data loaded successfully!


In [8]:
# Select "Spam_Classification" as the MLflow experiment for this notebook.
# The call is left as the cell's last expression, so the returned
# Experiment object is displayed as the cell output.
mlflow.set_experiment("Spam_Classification")


<Experiment: artifact_location='file:///content/mlruns/595921464032175323', creation_time=1741160809907, experiment_id='595921464032175323', last_update_time=1741160809907, lifecycle_stage='active', name='Spam_Classification', tags={}>

In [9]:
def train_and_log_model(model, model_name):
    """
    Train a classifier, score it on the validation split, and log it to MLflow.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_valid`` / ``y_valid`` with average precision (AUCPR).
    Everything is recorded in its own MLflow run, named after ``model_name``
    so the runs can be told apart in the tracking UI.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier. It must provide ``fit`` and
        either ``predict_proba`` or ``decision_function``.
    model_name : str
        Used as the MLflow run name, as the artifact path of the logged
        model, and in the printed summary line.

    Returns
    -------
    float
        The AUCPR obtained on the validation split.

    Raises
    ------
    AttributeError
        If the model exposes neither ``predict_proba`` nor
        ``decision_function``.
    """
    with mlflow.start_run(run_name=model_name):
        # Record which model this run belongs to
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("model_class", type(model).__name__)

        # Train model
        model.fit(X_train, y_train)

        # Obtain a continuous score for the validation samples.
        # Column 1 of predict_proba is taken as the positive-class score
        # (assumes a binary label -- TODO confirm against the dataset).
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_valid)[:, 1]
        else:
            # Margin-based classifiers without probability estimates:
            # average precision only needs a ranking score.
            y_score = model.decision_function(X_valid)

        # Compute and log the AUCPR score
        aucpr = average_precision_score(y_valid, y_score)
        mlflow.log_metric("AUCPR", aucpr)

        # Log the fitted model as a run artifact
        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} AUCPR: {aucpr}")

    return aucpr


In [10]:
# Fixed seed so that re-running the notebook reproduces the same scores
RANDOM_STATE = 42

# Candidate models, keyed by the name used for the MLflow run and artifact.
# RandomForest (bootstrapping / feature sub-sampling) and SVC with
# probability=True (internal cross-validated calibration) are stochastic,
# so both receive the seed. LogisticRegression is left at its default solver.
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
}

# Train and log each model in its own MLflow run
for name, model in models.items():
    train_and_log_model(model, name)




RandomForest AUCPR: 0.9837029179613737




LogisticRegression AUCPR: 0.972349844151224




SVM AUCPR: 0.9807715057456959
