In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# 1. Experiment Settings
# Point MLflow at the file-based store in the parent directory's mlruns folder.
# The URI is relative, so it depends on the directory the kernel is started in.
mlflow.set_tracking_uri("file:../mlruns")

experiment_name = "Spoiler_Detection_Baseline"

# Announce first-time creation with an explicit None check instead of catching
# AttributeError, which would also swallow unrelated attribute errors.
if mlflow.get_experiment_by_name(experiment_name) is None:
    print(f"Creating new experiment: {experiment_name}")

# set_experiment creates the experiment when it does not exist yet and returns
# the active Experiment, so a separate create_experiment call is unnecessary.
experiment = mlflow.set_experiment(experiment_name)
experiment_id = experiment.experiment_id

# Last expression of the cell: display the active experiment.
experiment

Creating new experiment: Spoiler_Detection_Baseline


  return FileStore(store_uri, store_uri)


<Experiment: artifact_location='file:///Users/alperen/Desktop/spoiler-detection-mlops/notebooks/../mlruns/763245819300813949', creation_time=1766613942086, experiment_id='763245819300813949', last_update_time=1766613942086, lifecycle_stage='active', name='Spoiler_Detection_Baseline', tags={}>

In [2]:
# Single source of truth for the data path and hyperparameters: the values used
# for training below are the very same values that get logged to MLflow.
DATA_PATH = "../data/cleaned_data.csv"
TEST_SIZE = 0.2
RANDOM_STATE = 35
MAX_FEATURES = 5000  # vocabulary size cap passed to TfidfVectorizer
MAX_ITER = 1000      # iteration cap passed to LogisticRegression

with mlflow.start_run():
    print("Data loading...")
    try:
        # Data fetched via DVC
        df = pd.read_csv(DATA_PATH)
    except FileNotFoundError:
        print("Error: cleaned_data.csv couldn't found! Have to do data cleaning first.")
        # Re-raise: without the file `df` is never bound, and carrying on would
        # only fail on the next statement with a misleading NameError.
        raise

    # Drop rows missing either the text or the label before splitting.
    df = df.dropna(subset=["cleaned_text", "label"])
    X = df["cleaned_text"]
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # TF-IDF Transformation (Convert text to numbers)
    print("TF-IDF vectorization is being performed...")

    # Fit the vocabulary on the training split only, then reuse it for the
    # test split so no test-set statistics leak into the features.
    tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
    X_train_vec = tfidf.fit_transform(X_train)
    X_test_vec = tfidf.transform(X_test)

    # Log every setting that influences the result, not just a subset, so the
    # run can be reproduced from the tracking store alone.
    mlflow.log_params(
        {
            "model_type": "LogisticRegression",
            "max_features": MAX_FEATURES,
            "max_iter": MAX_ITER,
            "test_size": TEST_SIZE,
            "random_state": RANDOM_STATE,
        }
    )

    # Train the model
    print("Model is training(Logistic Regression)...")
    model = LogisticRegression(max_iter=MAX_ITER)
    model.fit(X_train_vec, y_train)

    # Prediction and evaluation
    predictions = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, predictions)
    # f1_score is called with its defaults here.
    # NOTE(review): presumably `label` is binary - confirm against the dataset.
    f1 = f1_score(y_test, predictions)

    print("\n---RESULTS---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Save the model to MLflow
    # NOTE(review): only the classifier is logged; the fitted `tfidf`
    # vectorizer is not, so consumers of this artifact must supply input that
    # was vectorized the same way. Consider logging both together.
    mlflow.sklearn.log_model(model, "model")
    print("\nModel ve metrics saved to MLflow successfully!")

Data loading...
TF-IDF vectorization is being performed...




Model is training(Logistic Regression)...

---RESULTS---
Accuracy: 0.6950
F1 Score: 0.7024

Model ve metrics saved to MLflow successfully!
