In [4]:
# ANALISIS PREDIKTIF DIABETES
# Regresi Linear & KNN Klasifikasi

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, r2_score
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("ANALISIS PREDIKTIF DIABETES DATASET")
print("="*60)

# Load dan preprocessing data
df = pd.read_csv('diabetes_dataset.csv')

# Ubah data kategorikal jadi angka
kategori = ['gender', 'ethnicity', 'education_level', 'income_level', 
            'employment_status', 'smoking_status', 'diabetes_stage']

for col in kategori:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Pilih fitur
fitur = ['age', 'gender', 'bmi', 'systolic_bp', 'diastolic_bp',
         'cholesterol_total', 'glucose_fasting', 'hba1c',
         'family_history_diabetes', 'smoking_status']

X = df[fitur]

# ========================================
# REGRESI LINEAR
# ========================================
print("\n" + "="*60)
print("BAGIAN 1: REGRESI LINEAR")
print("="*60)

print("\nTujuan: Prediksi Diabetes Risk Score (0-100)")

# Target untuk regresi
y_regresi = df['diabetes_risk_score']

# Bagi data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y_regresi, test_size=0.2, random_state=42
)

print(f"\nData Training: {len(X_train)}")
print(f"Data Testing: {len(X_test)}")

# Training model
print("\nTraining model...")
model_regresi = LinearRegression()
model_regresi.fit(X_train, y_train)

# Prediksi
prediksi = model_regresi.predict(X_test)

# Evaluasi
r2 = r2_score(y_test, prediksi)

print("\n" + "-"*60)
print("HASIL REGRESI LINEAR")
print("-"*60)
print(f"R² Score: {r2:.4f} ({r2*100:.2f}%)")
print(f"\nArtinya: Model dapat menjelaskan {r2*100:.1f}% variasi data")

if r2 >= 0.95:
    print("Status: SANGAT BAGUS! ✓")
elif r2 >= 0.85:
    print("Status: BAGUS ✓")
else:
    print("Status: CUKUP BAIK")

# ========================================
# KLASIFIKASI KNN
# ========================================
print("\n" + "="*60)
print("BAGIAN 2: KLASIFIKASI KNN")
print("="*60)

print("\nTujuan: Klasifikasi Diabetes (Ya/Tidak)")

# Target untuk klasifikasi
y_klasifikasi = df['diagnosed_diabetes']

# Bagi data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y_klasifikasi, test_size=0.2, random_state=42
)

print(f"\nData Training: {len(X_train)}")
print(f"Data Testing: {len(X_test)}")

# Coba berbagai nilai K
print("\nMencari K terbaik...")
k_terbaik = 5
akurasi_terbaik = 0

for k in [3, 5, 7, 9]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    akurasi = knn.score(X_test, y_test)
    print(f"  K={k} → Akurasi: {akurasi:.3f}")
    
    if akurasi > akurasi_terbaik:
        akurasi_terbaik = akurasi
        k_terbaik = k

print(f"\n✓ K terbaik: {k_terbaik}")

# Training model final dengan K terbaik
print(f"Training model dengan K={k_terbaik}...")
model_knn = KNeighborsClassifier(n_neighbors=k_terbaik)
model_knn.fit(X_train, y_train)

# Prediksi
prediksi_knn = model_knn.predict(X_test)

# Evaluasi
akurasi = accuracy_score(y_test, prediksi_knn)
cm = confusion_matrix(y_test, prediksi_knn)

print("\n" + "-"*60)
print("HASIL KLASIFIKASI KNN")
print("-"*60)
print(f"Akurasi: {akurasi:.3f} ({akurasi*100:.1f}%)")

benar = int(akurasi * len(y_test))
print(f"\nPrediksi Benar: {benar}/{len(y_test)}")

print(f"\nMatriks Konfusi:")
print(f"              Prediksi")
print(f"           No      Yes")
print(f"Aktual No   {cm[0][0]:5d}   {cm[0][1]:5d}")
print(f"      Yes   {cm[1][0]:5d}   {cm[1][1]:5d}")

print(f"\nPenjelasan:")
print(f"  • Benar prediksi No Diabetes: {cm[0][0]} data")
print(f"  • Salah prediksi No jadi Yes: {cm[0][1]} data")
print(f"  • Salah prediksi Yes jadi No: {cm[1][0]} data")
print(f"  • Benar prediksi Yes Diabetes: {cm[1][1]} data")

total_salah = cm[0][1] + cm[1][0]
print(f"\nTotal Kesalahan: {total_salah} dari {len(y_test)} data")

# Classification Report
print("\nLaporan Detail:")
print("-"*60)
report = classification_report(y_test, prediksi_knn, 
                              target_names=['No Diabetes', 'Diabetes'],
                              output_dict=True)

print(f"              precision    recall  f1-score   support")
print(f"")
print(f"  No Diabetes      {report['No Diabetes']['precision']:.2f}      {report['No Diabetes']['recall']:.2f}      {report['No Diabetes']['f1-score']:.2f}     {int(report['No Diabetes']['support'])}")
print(f"     Diabetes      {report['Diabetes']['precision']:.2f}      {report['Diabetes']['recall']:.2f}      {report['Diabetes']['f1-score']:.2f}     {int(report['Diabetes']['support'])}")
print(f"")
print(f"     accuracy                           {report['accuracy']:.2f}     {len(y_test)}")

# ========================================
# KESIMPULAN
# ========================================
print("\n" + "="*60)
print("KESIMPULAN")
print("="*60)

print("\n1. REGRESI LINEAR")
print(f"   • Akurasi: {r2*100:.1f}%")
print(f"   • Target: Diabetes Risk Score")
print(f"   • Kesimpulan: Model", end=" ")
if r2 >= 0.95:
    print("SANGAT BAIK dalam prediksi")
else:
    print("BAIK dalam prediksi")

print("\n2. KLASIFIKASI KNN")
print(f"   • Akurasi: {akurasi*100:.1f}%")
print(f"   • K optimal: {k_terbaik}")
print(f"   • Target: Diagnosed Diabetes (Ya/Tidak)")
print(f"   • Kesimpulan: Model", end=" ")
if akurasi >= 0.90:
    print("SANGAT BAIK")
elif akurasi >= 0.80:
    print("BAIK")
else:
    print("CUKUP BAIK")

if cm[0][1] > 100 or cm[1][0] > 100:
    print(f"   • Catatan: Masih ada {cm[0][1] + cm[1][0]} kesalahan prediksi")
    print(f"             Ada peluang untuk perbaikan")

print("\n" + "="*60)
print("SELESAI")
print("="*60)

ANALISIS PREDIKTIF DIABETES DATASET

BAGIAN 1: REGRESI LINEAR

Tujuan: Prediksi Diabetes Risk Score (0-100)

Data Training: 80000
Data Testing: 20000

Training model...

------------------------------------------------------------
HASIL REGRESI LINEAR
------------------------------------------------------------
R² Score: 0.8631 (86.31%)

Artinya: Model dapat menjelaskan 86.3% variasi data
Status: BAGUS ✓

BAGIAN 2: KLASIFIKASI KNN

Tujuan: Klasifikasi Diabetes (Ya/Tidak)

Data Training: 80000
Data Testing: 20000

Mencari K terbaik...
  K=3 → Akurasi: 0.682
  K=5 → Akurasi: 0.694
  K=7 → Akurasi: 0.702
  K=9 → Akurasi: 0.707

✓ K terbaik: 9
Training model dengan K=9...

------------------------------------------------------------
HASIL KLASIFIKASI KNN
------------------------------------------------------------
Akurasi: 0.707 (70.7%)

Prediksi Benar: 14141/20000

Matriks Konfusi:
              Prediksi
           No      Yes
Aktual No    5001    3076
      Yes    2782    9141

Penjelasa