In [1]:
import joblib
import numpy as np  # np.inf / np.nan are used when cleaning FarePerPerson
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

# Plot styling (optional: only relevant if figures are drawn in this notebook)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
# NOTE(review): the matplotlib 'ggplot' style is applied after the seaborn style, so it
# presumably overrides any overlapping 'whitegrid' settings — confirm which look is intended.
plt.style.use('ggplot')

In [2]:
# Load the raw training data and keep the target column aside.
train_df_raw = pd.read_csv('../data/raw/train.csv')
y_train_original = train_df_raw['Survived']  # target variable

# Artifacts saved by the earlier preprocessing step: the fitted scaler and the
# list of feature columns.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
scaler = joblib.load('../models/scaler.pkl')
features_columns = joblib.load('../models/features_columns.pkl')

# --- Replicate the preprocessing to rebuild X_train_processed and y_train ---

# Imputation statistics are computed from the raw training frame itself.
median_age_train = train_df_raw['Age'].median()
mode_embarked_train = train_df_raw['Embarked'].mode()[0]
median_fare_train = train_df_raw['Fare'].median()

# Drop the target and fill the three columns in one non-mutating pass; columns
# that are not listed in the mapping are left untouched.
X_train_temp = train_df_raw.drop(columns='Survived').fillna(
    {
        'Age': median_age_train,
        'Embarked': mode_embarked_train,
        'Fare': median_fare_train,
    }
)


# Feature engineering


def _extract_title(name):
    """Return the honorific embedded in a passenger name.

    Takes the segment that follows the first comma, cuts it at its first
    period and strips surrounding whitespace. Values that are null, not a
    string, or lack either ',' or '.' yield 'Rare'.
    """
    usable = pd.notnull(name) and isinstance(name, str) and ',' in name and '.' in name
    if not usable:
        return 'Rare'
    after_comma = name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Raw title -> consolidated title group.
# NOTE(review): a raw title with a leading article (e.g. "the Countess") does not
# match the "Countess" key and therefore ends up as 'Rare' — confirm this matches
# the upstream preprocessing before changing it.
title_mapping = {
    "Mme": "Mrs", "Mlle": "Miss", "Ms": "Miss", "Lady": "Mrs", "Countess": "Mrs",
    "Dona": "Mrs", "Sir": "Mr", "Don": "Mr", "Major": "Officer", "Col": "Officer",
    "Capt": "Officer", "Dr": "Officer", "Rev": "Officer", "Jonkheer": "Rare",
    "Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master"
}
allowed_titles = set(title_mapping.values())

# Map known raw titles to their group, then collapse everything that is still
# not one of the group labels into 'Rare'.
mapped_titles = X_train_temp['Name'].apply(_extract_title).replace(title_mapping)
X_train_temp['Title'] = mapped_titles.where(mapped_titles.isin(allowed_titles), 'Rare')

# Family size counts the passenger plus siblings/spouses and parents/children.
family_size = X_train_temp['SibSp'] + X_train_temp['Parch'] + 1
X_train_temp['FamilySize'] = family_size
X_train_temp['IsAlone'] = family_size.eq(1).astype(int)

# Deck is the first character of the cabin code; 'U' marks a missing cabin.
X_train_temp['Deck'] = X_train_temp['Cabin'].map(
    lambda cabin: cabin[0] if pd.notnull(cabin) else 'U'
)

# Fare split across the family; infinities become NaN and every NaN is then
# filled with the median of the cleaned column.
fare_per_person = (X_train_temp['Fare'] / X_train_temp['FamilySize']).replace(
    [np.inf, -np.inf], np.nan
)
X_train_temp['FarePerPerson'] = fare_per_person.fillna(fare_per_person.median())


# Drop the raw columns that are not used as model inputs. Reassigning (instead
# of inplace=True) avoids mutating the frame behind the name, and
# errors='ignore' tolerates columns that are already absent.
X_train_temp = X_train_temp.drop(
    columns=['Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'PassengerId'],
    errors='ignore',
)

# One-hot encode the categorical columns (first level of each is dropped).
categorical_cols = ['Sex', 'Embarked', 'Pclass', 'Title', 'Deck']
X_train_processed = pd.get_dummies(X_train_temp, columns=categorical_cols, drop_first=True)

# Scale the numeric columns with the already-fitted scaler: transform only, no refit.
numerical_cols_to_scale = ['Age', 'Fare', 'FamilySize', 'FarePerPerson']
cols_to_scale_in_df = [col for col in numerical_cols_to_scale if col in X_train_processed.columns]
X_train_processed[cols_to_scale_in_df] = scaler.transform(X_train_processed[cols_to_scale_in_df])

# Column alignment (critical): the matrix must have exactly the columns in
# `features_columns`, in that order. reindex adds any missing column filled
# with 0, drops any column not in the list, and fixes the order in one step.
X_train_processed = X_train_processed.reindex(columns=features_columns, fill_value=0)

y_train = y_train_original  # target variable

# Fail fast if preprocessing left the matrix unusable for the estimators.
assert len(X_train_processed) == len(y_train), "feature matrix and target have different lengths"
assert not X_train_processed.isna().any().any(), "feature matrix still contains missing values"

print(f"Data pelatihan siap. Bentuk: {X_train_processed.shape}")

Data pelatihan siap. Bentuk: (891, 23)


In [3]:
# Candidate classifiers, keyed by the display name used in the evaluation report.
# Every estimator gets the same fixed random_state so runs are repeatable.
models = {
    # Simple and fast linear model; serves as the baseline.
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=42),
    # Ensemble of 100 decision trees.
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # Boosted ensemble built in 100 sequential stages.
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# Individual handles to the very same estimator objects held in `models`.
log_reg = models['Logistic Regression']
rf_clf = models['Random Forest']
gb_clf = models['Gradient Boosting']

print("Model-model telah didefinisikan.")

Model-model telah didefinisikan.


In [4]:
# Cross-validated comparison of the candidate models.
results = {}  # model name -> {metric label -> mean score across folds}

# Stratified folds keep the class proportions of y_train similar in every
# split; shuffling with a fixed seed makes the 5 splits reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Report label -> scikit-learn scorer name. The insertion order is the order in
# which the metrics appear in `results`.
scoring = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'ROC AUC': 'roc_auc',
}

print("Memulai pelatihan dan evaluasi model dengan Cross-Validation...\n")

for name, model in models.items():
    print(f"--- Melatih dan Mengevaluasi: {name} ---")

    # A single multi-metric cross_validate call fits the model once per fold and
    # scores every metric on that fit, instead of refitting for each metric.
    # With a dict `scoring`, the per-fold scores are returned under 'test_<label>'.
    cv_scores = cross_validate(model, X_train_processed, y_train, cv=skf, scoring=scoring)

    # Store the mean of each metric over the folds.
    results[name] = {label: cv_scores[f'test_{label}'].mean() for label in scoring}

    print(f"{name} - Rata-rata F1 Score: {results[name]['F1 Score']:.4f}")
    print(f"{name} - Rata-rata Akurasi: {results[name]['Accuracy']:.4f}\n")

Memulai pelatihan dan evaluasi model dengan Cross-Validation...

--- Melatih dan Mengevaluasi: Logistic Regression ---
Logistic Regression - Rata-rata F1 Score: 0.7641
Logistic Regression - Rata-rata Akurasi: 0.8249

--- Melatih dan Mengevaluasi: Random Forest ---
Random Forest - Rata-rata F1 Score: 0.7479
Random Forest - Rata-rata Akurasi: 0.8103

--- Melatih dan Mengevaluasi: Gradient Boosting ---
Gradient Boosting - Rata-rata F1 Score: 0.7770
Gradient Boosting - Rata-rata Akurasi: 0.8350



In [5]:
# Per-model report: every averaged metric printed with four decimals.
print("==============================\nRingkasan Hasil Evaluasi Model:\n==============================")
separator = "-" * 30
for name, metrics in results.items():
    print(f"Model: {name}")
    metric_lines = [f"  {metric}: {value:.4f}" for metric, value in metrics.items()]
    for line in metric_lines:
        print(line)
    print(separator)

# Select the model with the highest mean F1 score; on a tie the model that
# appears first in `results` wins.
f1_by_model = {model_name: model_metrics['F1 Score'] for model_name, model_metrics in results.items()}
best_model_name = max(f1_by_model, key=f1_by_model.get)
best_model = models[best_model_name]

print(f"\nModel Terbaik Dipilih untuk Deployment: {best_model_name} (F1 Score: {results[best_model_name]['F1 Score']:.4f})")

Ringkasan Hasil Evaluasi Model:
Model: Logistic Regression
  Accuracy: 0.8249
  Precision: 0.7895
  Recall: 0.7425
  F1 Score: 0.7641
  ROC AUC: 0.8706
------------------------------
Model: Random Forest
  Accuracy: 0.8103
  Precision: 0.7634
  Recall: 0.7338
  F1 Score: 0.7479
  ROC AUC: 0.8749
------------------------------
Model: Gradient Boosting
  Accuracy: 0.8350
  Precision: 0.8102
  Recall: 0.7485
  F1 Score: 0.7770
  ROC AUC: 0.8833
------------------------------

Model Terbaik Dipilih untuk Deployment: Gradient Boosting (F1 Score: 0.7770)


In [6]:
# Final fit: train the selected estimator on every processed training row.
# `best_model` is the same object stored in `models`, so that entry is fitted too.
print(f"\nMelatih ulang '{best_model_name}' pada seluruh data training untuk finalisasi...")
best_model.fit(X_train_processed, y_train)

# Persist the fitted estimator into the models folder.
model_output_path = '../models/best_titanic_model.pkl'
joblib.dump(best_model, model_output_path)
print(f"Model '{best_model_name}' berhasil disimpan ke: models/best_titanic_model.pkl")


Melatih ulang 'Gradient Boosting' pada seluruh data training untuk finalisasi...
Model 'Gradient Boosting' berhasil disimpan ke: models/best_titanic_model.pkl
