In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , f1_score

In [2]:
# Select (or create, if it does not exist yet) the MLflow experiment that the
# runs in this notebook are recorded under.
EXPERIMENT_NAME = "Spoiler_Detection_Baseline"

# Trailing `;` suppresses the returned Experiment repr: it embeds the absolute
# local artifact path, which should not end up in a shared/committed notebook.
mlflow.set_experiment(EXPERIMENT_NAME);

2025/12/10 00:59:10 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/10 00:59:10 INFO mlflow.store.db.utils: Updating database tables
2025/12/10 00:59:10 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/10 00:59:10 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/10 00:59:10 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/10 00:59:10 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/10 00:59:10 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/10 00:59:10 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/10 00:59:10 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/10 00:59:10 INFO alembic.runtime.migration: Running 

<Experiment: artifact_location='/Users/alperen/Desktop/spoiler-detection-mlops/mlruns/1', creation_time=1765317550175, experiment_id='1', last_update_time=1765317550175, lifecycle_stage='active', name='Spoiler_Detection_Baseline', tags={}>

In [9]:
# Train and evaluate the TF-IDF + Logistic Regression baseline inside a single
# MLflow run, so that its parameters, metrics and fitted classifier are
# recorded together.

# Settings are named once here so the values handed to pandas/scikit-learn and
# the values logged to MLflow cannot drift apart.
DATA_PATH = "data/cleaned_data.csv"
TEXT_COLUMN = "cleaned_review_text"
LABEL_COLUMN = "label"
TEST_SIZE = 0.2
RANDOM_STATE = 35
MAX_FEATURES = 5000
MAX_ITER = 1000

with mlflow.start_run():
    print("Data loading...")
    try:
        # The data set we expect to have been pulled with DVC.
        df = pd.read_csv(DATA_PATH)
    except FileNotFoundError:
        print("Error: cleaned_data.csv couldn't found! Have to do data cleaning first.")
        # Re-raise instead of falling through: without the file there is no
        # fresh `df`, so continuing would either hit a NameError or silently
        # train on a stale `df` left over in the kernel.
        raise

    # Drop rows that are missing either the review text or the label.
    df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
    X = df[TEXT_COLUMN]
    y = df[LABEL_COLUMN]

    # Hold out a test split; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # TF-IDF transformation (turn the text into numeric features).
    print("TF-IDF vectorization is being performed...")

    # max_features caps the vocabulary size at MAX_FEATURES terms.
    # Fit on the training split only, then reuse the fitted vocabulary for the
    # test split.
    tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
    X_train_vec = tfidf.fit_transform(X_train)
    X_test_vec = tfidf.transform(X_test)

    # Log the parameters.
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_features", MAX_FEATURES)
    # Also record the remaining settings that influence the reported metrics.
    mlflow.log_param("test_size", TEST_SIZE)
    mlflow.log_param("random_state", RANDOM_STATE)
    mlflow.log_param("max_iter", MAX_ITER)

    # Train the model.
    print("Model is training(Logistic Regression)...")
    model = LogisticRegression(max_iter=MAX_ITER)
    model.fit(X_train_vec, y_train)

    # Predict on the held-out split and evaluate.
    predictions = model.predict(X_test_vec)

    accuracy = accuracy_score(y_test, predictions)
    # NOTE(review): f1_score is called with its default averaging, which
    # presumably relies on `label` being binary - confirm against the data.
    f1 = f1_score(y_test, predictions)

    print("\n---RESULTS---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Save the metrics to MLflow.
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Save the model to MLflow.
    # NOTE(review): only the classifier is logged; the fitted `tfidf`
    # vectorizer is not persisted here, so this artifact alone cannot score
    # raw text. Consider logging the vectorizer as well (e.g. as a Pipeline).
    mlflow.sklearn.log_model(model, "model")
    print("\nModel ve metrics saved to MLflow successfully!")
    

Data loading...
TF-IDF vectorization is being performed...
Model is training(Logistic Regression)...





---RESULTS---
Accuracy: 0.6900
F1 Score: 0.6946

Model ve metrics saved to MLflow successfully!
