In [None]:
# MLflow Titanic Project: Data Loading and Setup
import pandas as pd
from src.utils import preprocess_titanic

# Read the raw Titanic CSV; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv(filepath_or_buffer='../data/Titanic Dataset.csv')
# Preview the first rows as the cell output.
data.head(n=5)

In [None]:
# Clean the raw frame with the project helper. The helper receives a copy,
# so the `data` frame itself is never handed to it directly.
data_clean = preprocess_titanic(data.copy(deep=True))
# Preview the cleaned rows as the cell output.
data_clean.head(n=5)

In [None]:
# Build the target and feature matrix, then hold out a validation split
from sklearn.model_selection import train_test_split

y = data_clean['Survived']
# errors='ignore' lets the drop succeed even when some of the listed
# columns are not present in data_clean.
X = data_clean.drop(
    columns=['Survived', 'PassengerId', 'Name', 'Ticket'],
    errors='ignore',
)
# 20% of the rows go to validation; the fixed random_state keeps the split
# reproducible from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.head()

In [None]:
# Train a RandomForest model and evaluate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Construct and fit in one expression; fit() returns the estimator itself,
# so `model` is the trained classifier.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Score the fitted model on the held-out validation split.
preds = model.predict(X_val)
acc = accuracy_score(y_true=y_val, y_pred=preds)
print(f'Validation Accuracy: {acc:.4f}')

In [None]:
# Log model and metrics with MLflow, and save model
import mlflow
import mlflow.sklearn
import joblib
from pathlib import Path

# Destination of the local pickle written alongside the MLflow-logged model.
model_path = Path('../models/titanic_model.pkl')
# joblib.dump does not create missing directories; create the folder up front
# (before the run starts) so a fresh checkout does not fail mid-run.
model_path.parent.mkdir(parents=True, exist_ok=True)

with mlflow.start_run():
    # Read the hyperparameters off the estimator instead of repeating the
    # literals, so the logged values always match the model being saved.
    mlflow.log_param('n_estimators', model.n_estimators)
    mlflow.log_param('max_depth', model.max_depth)
    mlflow.log_metric('val_accuracy', acc)
    mlflow.sklearn.log_model(model, "model")
    joblib.dump(model, model_path)
    print('Model saved and logged to MLflow.')

In [None]:
# Reload the model that was saved to disk and run it as a sanity check.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
loaded_model = joblib.load(filename='../models/titanic_model.pkl')
# Predictions here are made on the validation features; no separate
# new-data set is loaded in this cell.
val_preds = loaded_model.predict(X_val)
# Show the first ten predictions.
print(val_preds[0:10])