In [None]:

# Tahap EVALUATION

## Import Library
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import pandas as pd

# Display names for the two target classes; evaluate_model passes them to
# classification_report. Replace them with the real class names of the target.
target_names = [
    'Class 0',
    'Class 1',
]

# Evaluasi semua model
def evaluate_model(model, X_test, y_test):
    """Print a standard set of classification metrics for a fitted model.

    Args:
        model: Fitted classifier exposing ``predict``. For the ROC-AUC score it
            should also expose ``predict_proba`` or ``decision_function``.
        X_test: Held-out features passed to the model's prediction methods.
        y_test: True labels corresponding to ``X_test``.

    Returns:
        None. The confusion matrix, classification report and the individual
        scores are written to stdout.

    Note:
        Precision, recall and F1 are computed with scikit-learn's default
        settings, and the report labels come from the module-level
        ``target_names``.
    """
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # ROC-AUC needs a continuous score, not hard labels. Not every classifier
    # offers predict_proba, so fall back to decision_function and, failing
    # that, skip the metric instead of raising AttributeError.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]  # column for the second class
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None
    roc = roc_auc_score(y_test, y_score) if y_score is not None else None

    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=target_names))
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    if roc is None:
        print("ROC-AUC Score: N/A (model exposes neither predict_proba nor decision_function)")
    else:
        print(f"ROC-AUC Score: {roc:.4f}")
    print("\n--------------------------\n")

# Run the same evaluation for every fitted model, printing its heading first.
for heading, fitted_model in (
    ("Evaluasi Decision Tree:", decision_tree_model),
    ("Evaluasi Random Forest:", random_forest_model),
    ("Evaluasi Logistic Regression:", logistic_regression_model),
):
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)


"""
Definisi Metrik:
- Accuracy: Persentase prediksi yang benar.
- Precision: Seberapa banyak prediksi positif yang benar.
- Recall: Seberapa banyak data positif yang berhasil ditemukan.
- F1-Score: Rata-rata harmonik antara Precision dan Recall.
- ROC-AUC: Kemampuan model membedakan antar kelas.
"""

# Tahap VALIDASI SILANG (Cross Validation)
from sklearn.model_selection import cross_val_score

# Estimators to cross-validate, keyed by the label shown in the printed summary.
models = {
    'Decision Tree': decision_tree_model,
    'Random Forest': random_forest_model,
    'Logistic Regression': logistic_regression_model,
}

# 5-fold cross-validated accuracy for each estimator, reported as mean and
# standard deviation over the folds.
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    mean_accuracy = scores.mean()
    accuracy_std = scores.std()
    print(f"{name} Cross-Validation Accuracy: {mean_accuracy:.4f} (+/- {accuracy_std:.4f})")

# Tahap FEATURE SELECTION
from sklearn.feature_selection import SelectKBest, f_classif

# Score every feature against the target with f_classif; k='all' keeps all
# features so the complete ranking can be inspected.
selector = SelectKBest(score_func=f_classif, k='all').fit(X, y)

# Pair each column name with its score, then list the highest scores first.
feature_scores = pd.DataFrame(
    {'Feature': X.columns, 'Score': selector.scores_}
)
feature_scores = feature_scores.sort_values(by='Score', ascending=False)

print(feature_scores)

"""
Penjelasan:
- Semakin tinggi skor, semakin penting fitur terhadap target.
- Bisa memilih fitur dengan skor tertinggi untuk modelling lanjutan.
"""

# Tahap HYPERPARAMETER TUNING
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Candidate hyperparameter values; GridSearchCV evaluates every combination.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# A fixed random_state makes the forest, and therefore the selected parameters
# and reported scores, reproducible between runs. n_jobs=-1 only parallelises
# the candidate fits; it does not change the result.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Evaluate the tuned model on the held-out test set.
best_model = grid_search.best_estimator_
evaluate_model(best_model, X_test, y_test)

"""
Interpretasi:
- Kita membandingkan model sebelum dan sesudah tuning.
- Jika kinerja model meningkat, maka tuning berhasil.
"""

# Tahap DETERMINE NEXT STEPS
"""
Simpulan:
- Algoritma terbaik dipilih berdasarkan metrik dan validasi silang.
- Feature selection menunjukkan fitur penting yang bisa digunakan untuk penyederhanaan model.
- Hyperparameter tuning dapat meningkatkan performa algoritma; bandingkan metrik sebelum dan sesudah tuning untuk memastikannya.

Next Steps:
- Jika performa sudah memuaskan, lanjut ke tahapan deployment.
- Jika performa belum memuaskan, kembali ke tahap preprocessing atau feature engineering untuk perbaikan.
"""


# Deployment - Streamlit & FastAPI with Ngrok on Colab
Panduan menjalankan dashboard dan API model di Google Colab (gratis)

In [1]:
# Install into the running kernel's environment (%pip rather than !pip), pinned
# to the versions shown in this notebook's recorded install output.
%pip install streamlit==1.44.1 fastapi==0.115.12 uvicorn==0.34.2 pyngrok==7.2.5 joblib nest-asyncio

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.5-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownload

## Setup Streamlit App

In [31]:
%%bash
# Write the Streamlit dashboard to streamlit_app.py. The quoted 'EOF' delimiter
# stops the shell from expanding anything inside the Python source.
cat > streamlit_app.py << 'EOF'
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

st.title('Dashboard Analisis Data (Colab)')

uploaded_file = st.file_uploader("Upload dataset CSV", type="csv")
if uploaded_file:
    data = pd.read_csv(uploaded_file)
    # Strip stray whitespace from the headers so the column lookups below match.
    data.columns = data.columns.str.strip()

    st.write("### Data Sample")
    st.write(data.head())

    st.write("### Nama Kolom Dataset")
    st.write(list(data.columns))

    st.subheader('Distribution of Family History Variable')
    if 'family_history' in data.columns:
        st.bar_chart(data['family_history'].value_counts())
    else:
        st.error("Kolom 'family_history' tidak ditemukan di dataset!")

    st.subheader('Feature Correlation')
    numeric_cols = data.select_dtypes(include=['number'])
    if not numeric_cols.empty:
        corr = numeric_cols.corr()
        st.write(corr)
        fig, ax = plt.subplots()
        cax = ax.matshow(corr)
        fig.colorbar(cax)
        # Label both axes with the column names; without this the heat map
        # only shows integer positions.
        tick_positions = list(range(len(corr.columns)))
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(corr.columns, rotation=90)
        ax.set_yticks(tick_positions)
        ax.set_yticklabels(corr.columns)
        st.pyplot(fig)
        # The script re-runs on every interaction; close the figure so that
        # pyplot does not accumulate one open figure per rerun.
        plt.close(fig)
    else:
        st.error("Tidak ada kolom numerik untuk korelasi.")
EOF


## Setup FastAPI App

In [3]:
%%bash
# Write the prediction API to fastapi_app.py; the quoted 'EOF' delimiter keeps
# the shell from expanding anything inside the Python source.
cat > fastapi_app.py << 'EOF'
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import numpy as np

app = FastAPI()
# Loaded once at import time. joblib.load unpickles the file, so only load
# model artifacts that you created yourself.
model = joblib.load('best_model.pkl')


class InputData(BaseModel):
    """Request body for /predict: three numeric features."""
    feature1: float
    feature2: float
    feature3: float


@app.post('/predict')
def predict(data: InputData):
    """Return the model's predicted class for one observation."""
    features = np.array([[data.feature1, data.feature2, data.feature3]])
    predicted_label = model.predict(features)[0]
    return {'prediction': int(predicted_label)}
EOF


## Create Dummy Model for FastAPI

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import joblib


In [6]:
# The FastAPI app's InputData schema sends exactly three features, so the dummy
# model must be trained on three columns; a five-feature model would reject
# every /predict request.
# NOTE(review): this rebinds X and y, which earlier cells use for the real
# dataset — confirm nothing below still expects the original data.
X, y = make_classification(n_samples=100, n_features=3, n_informative=2, n_redundant=0, random_state=42)


In [7]:
# Seed the forest so the saved dummy model is identical on every run,
# matching the random_state already used to generate the data.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

In [8]:
# Persist the fitted model under the file name that fastapi_app.py loads.
joblib.dump(value=model, filename='best_model.pkl')

['best_model.pkl']

In [9]:
# Confirmation message (in Indonesian) that the model file has been written.
print("Model telah disimpan sebagai 'best_model.pkl'")

Model telah disimpan sebagai 'best_model.pkl'


In [17]:
%%bash
# Rebuild best_model.pkl with exactly three features so it matches the
# InputData schema in fastapi_app.py. The quoted 'EOF' delimiter (as in the
# other heredoc cells) prevents the shell from expanding the Python source.
python3 << 'EOF'
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import joblib

X, y = make_classification(
    n_samples=100,
    n_features=3,
    n_informative=2,
    n_redundant=0,
    random_state=42
)
# Seeded so the saved model is reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)
joblib.dump(model, 'best_model.pkl')
EOF

## Run Streamlit with Ngrok

In [19]:
# Explicit %pip magic instead of relying on IPython's automagic for a bare "pip".
%pip install pyngrok



In [20]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never hardcode the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# The token that used to be written here is exposed and should be revoked
# in the ngrok dashboard.
auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
ngrok.set_auth_token(auth_token)

# Open a public tunnel to port 8501.
# NOTE(review): no visible cell starts `streamlit run streamlit_app.py` on
# this port — confirm the app is launched before the URL is shared.
url = ngrok.connect(8501)
print("Streamlit URL:", url)

Streamlit URL: NgrokTunnel: "https://c37e-35-243-214-60.ngrok-free.app" -> "http://localhost:8501"


## Run FastAPI with Ngrok

In [22]:
# Configure the ngrok authtoken without ever writing it into the notebook.
import os
from getpass import getpass

from pyngrok import ngrok  # imported here so the cell does not rely on an earlier cell

# Prefer the NGROK_AUTHTOKEN environment variable; otherwise prompt for it.
# The prompt text is echoed into the cell output, so it must never contain the
# token itself — getpass only hides what is typed after the prompt.
auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("Enter your ngrok authtoken: ")
ngrok.set_auth_token(auth_token)


<redacted-ngrok-authtoken> ··········
