# Model Training

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import sys
import os

# Import src modules
sys.path.append(os.path.abspath(os.path.join('..')))
from src import config
from src.train import get_model # Import the factory we just wrote!

# Read the processed dataset, then separate the target column from the features
df = pd.read_csv("../data/processed/clean_data.csv")
y = df[config.TARGET_COLUMN]
X = df.drop(columns=[config.TARGET_COLUMN])

# Hold out 20% of the rows for evaluation; the split is seeded from the
# project config so repeated runs yield the same partition
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=config.RANDOM_STATE,
)

In [None]:
# Names of the candidate estimators, as understood by the get_model factory
models_to_test = [
    "logistic_regression",
    "random_forest",
    "xgboost",
]
# Maps model name -> score on the held-out test split
results = {}

for model_name in models_to_test:
    print(f"Training {model_name}...")
    try:
        model = get_model(model_name, task_type="classification")
        model.fit(X_train, y_train)
        score = model.score(X_test, y_test)
        results[model_name] = score
        print(f"  -> Accuracy: {score:.4f}")
    except Exception as e:
        # Best-effort comparison: report the failure and move on to the
        # next candidate instead of aborting the whole cell
        print(f"  -> Skipped {model_name}: {e}")

# Bar chart of the collected scores, one bar per successfully trained model
plt.bar(list(results.keys()), list(results.values()))
plt.title("Model Comparison")
plt.ylim(0, 1)  # fixed 0..1 axis so bar heights are comparable across runs
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search for the random forest.
# The grid is 3 values x 3 values = 9 parameter combinations, each
# evaluated with 3-fold cross-validation on the training split only.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}

# Request the classifier explicitly, matching the model-comparison loop:
# the search below is scored with 'accuracy', which is only meaningful for a
# classification estimator, so do not rely on the factory's default task type.
rf = get_model("random_forest", task_type="classification")
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best combination
print(f"Best Params: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")