# Proyek Klasifikasi: Prediksi Jenis Bunga Iris
## UTS Machine Learning - Universitas Pamulang

**Deskripsi:** Proyek ini menggunakan dataset Iris untuk memprediksi jenis bunga berdasarkan karakteristik fisiknya.

**Dataset:** Iris Dataset (150 sampel, 4 fitur, 3 kelas)


## 1. Import Library


In [None]:
# Library dasar
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Library sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Model Klasifikasi
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Evaluasi Model
from sklearn.metrics import (
    confusion_matrix, 
    classification_report, 
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_curve,
    roc_auc_score,
    ConfusionMatrixDisplay
)
from sklearn.preprocessing import label_binarize

# Global plot styling: apply the 'seaborn-v0_8-darkgrid' matplotlib style
# sheet and make 'husl' the default seaborn colour palette for the figures
# created in the rest of the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("✓ Library berhasil diimport")


## 2. Load dan Eksplorasi Data


In [None]:
# Load the Iris data into a DataFrame: the feature columns, the integer
# class code ('target') and a readable class name ('species').
iris = load_iris()
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target
df['species'] = df['target'].map(species_by_code)

# Two of the columns ('target', 'species') are labels, not features.
n_samples, n_columns = df.shape
class_labels = df['species'].unique()

print("Dataset Iris berhasil dimuat!")
print(f"\nJumlah sampel: {n_samples}")
print(f"Jumlah fitur: {n_columns-2}")
print(f"\nKelas target: {class_labels}")


In [None]:
# Preview the first five rows. The bare `df.head()` is kept as the last
# expression of the cell so the notebook renders it as cell output.
print("5 Data Pertama:")
df.head()


In [None]:
# Structural overview of the DataFrame; df.info() prints its own report.
print("Informasi Dataset:")
df.info()


In [None]:
# Descriptive statistics of the numeric columns (this includes the integer
# 'target' column). The bare expression is rendered as the cell output.
print("Statistik Deskriptif:")
df.describe()


In [None]:
# Count missing entries per column once, then report both the per-column
# breakdown and the grand total.
missing_per_column = df.isnull().sum()

print("Missing Values:")
print(missing_per_column)
print(f"\nTotal missing values: {missing_per_column.sum()}")


In [None]:
# Class balance: absolute counts per species and their share in percent.
species_counts = df['species'].value_counts()
species_share = df['species'].value_counts(normalize=True) * 100

print("Distribusi Kelas Target:")
print(species_counts)
print("\nPersentase:")
print(species_share)


## 3. Exploratory Data Analysis (EDA)


In [None]:
# Class distribution shown two ways side by side: bar chart (left) and
# pie chart (right). The figure is also written to 'distribusi_kelas.png'.
class_counts = df['species'].value_counts()
class_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, pie_ax = axes

# Left panel: number of samples per species.
class_counts.plot(kind='bar', ax=bar_ax, color=class_colors)
bar_ax.set_title('Distribusi Kelas Target', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Spesies')
bar_ax.set_ylabel('Jumlah')
bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=45)

# Right panel: the same counts as proportions.
class_counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', colors=class_colors)
pie_ax.set_title('Proporsi Kelas Target', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.savefig('distribusi_kelas.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualisasi distribusi kelas selesai")


In [None]:
# Overlaid per-species histograms, one subplot per feature (2x2 grid).
# The figure is also written to 'distribusi_fitur.png'.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
features = iris.feature_names

for ax, feature in zip(axes.flat, features):
    # sort=False keeps the species in order of first appearance.
    for species, group in df.groupby('species', sort=False):
        ax.hist(group[feature], alpha=0.6, label=species, bins=20)

    ax.set_title(f'Distribusi {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frekuensi')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('distribusi_fitur.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualisasi distribusi fitur selesai")


In [None]:
# Box plots of every feature grouped by species, to spot outliers.
# The figure is also written to 'boxplot_fitur.png'.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
features = iris.feature_names

for ax, feature in zip(axes.flat, features):
    df.boxplot(column=feature, by='species', ax=ax)
    ax.set_title(f'Box Plot: {feature}')
    ax.set_xlabel('Spesies')
    ax.set_ylabel(feature)

# Blank out the figure-level title before laying out the panels.
plt.suptitle('')
plt.tight_layout()
plt.savefig('boxplot_fitur.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Box plot selesai")


In [None]:
# Pairwise correlation between the feature columns, drawn as an annotated
# heatmap and also written to 'correlation_matrix.png'.
correlation_matrix = df[iris.feature_names].corr()
heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                       square=True, linewidths=1, fmt='.2f')

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Fitur Iris', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Correlation matrix selesai")


In [None]:
# Pair plot of the feature columns, coloured by species, also written to
# 'pairplot.png'.
#
# sns.pairplot creates and owns its figure, so no plt.figure() call is made
# here: a preceding plt.figure() only produces an extra, empty figure.
# `vars` restricts the grid to the real features; without it the numeric
# 'target' label column is drawn as if it were an additional feature.
pairplot = sns.pairplot(df, vars=iris.feature_names, hue='species',
                        markers=['o', 's', 'D'], palette='husl',
                        diag_kind='kde', height=2.5)
# `.figure` is the supported accessor for the underlying matplotlib figure
# (`.fig` is deprecated in seaborn).
pairplot.figure.suptitle('Pair Plot - Iris Dataset', y=1.02, fontsize=16, fontweight='bold')
# Save through the grid's own figure instead of relying on pyplot's notion
# of the "current" figure.
pairplot.figure.savefig('pairplot.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Pair plot selesai")


## 4. Data Preprocessing


In [None]:
# Separate the feature columns (X) from the integer class code (y).
X, y = df[iris.feature_names], df['target']

for description, frame in (("Shape fitur (X)", X), ("Shape target (y)", y)):
    print(f"{description}: {frame.shape}")


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed random_state so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Data berhasil dibagi:")
print(f"Training set: {len(X_train)} sampel")
print(f"Testing set: {len(X_test)} sampel")

# Per-class counts in each partition, ordered by class code.
partitions = (
    ("\nDistribusi kelas di training set:", y_train),
    ("\nDistribusi kelas di testing set:", y_test),
)
for heading, labels in partitions:
    print(heading)
    print(pd.Series(labels).value_counts().sort_index())


In [None]:
# Standardise the features. The scaler is fitted on the training rows only
# and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the per-feature mean and standard deviation of the scaled
# training data.
train_means = np.mean(X_train_scaled, axis=0)
train_stds = np.std(X_train_scaled, axis=0)

print("✓ Data berhasil distandardisasi")
print(f"\nMean setelah scaling (training): {train_means}")
print(f"Std setelah scaling (training): {train_stds}")


## 5. Model Training


### 5.1 Logistic Regression


In [None]:
# Fit a logistic-regression classifier on the scaled training data, then
# keep both hard predictions and class probabilities for the test set.
print("Training Logistic Regression...")
lr_model = LogisticRegression(max_iter=200, random_state=42).fit(
    X_train_scaled, y_train
)

y_pred_lr, y_pred_proba_lr = (
    lr_model.predict(X_test_scaled),
    lr_model.predict_proba(X_test_scaled),
)

print("✓ Logistic Regression selesai ditraining")


### 5.2 Decision Tree


In [None]:
# Fit a depth-limited decision tree on the scaled training data, then keep
# both hard predictions and class probabilities for the test set.
print("Training Decision Tree...")
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42).fit(
    X_train_scaled, y_train
)

y_pred_dt, y_pred_proba_dt = (
    dt_model.predict(X_test_scaled),
    dt_model.predict_proba(X_test_scaled),
)

print("✓ Decision Tree selesai ditraining")


### 5.3 K-Nearest Neighbors


In [None]:
# Fit a 5-nearest-neighbours classifier on the scaled training data, then
# keep both hard predictions and class probabilities for the test set.
print("Training K-Nearest Neighbors...")
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

y_pred_knn, y_pred_proba_knn = (
    knn_model.predict(X_test_scaled),
    knn_model.predict_proba(X_test_scaled),
)

print("✓ K-Nearest Neighbors selesai ditraining")


### 5.4 Support Vector Machine


In [None]:
# Fit an RBF-kernel SVM on the scaled training data. probability=True is
# required so that predict_proba is available below.
print("Training Support Vector Machine...")
svm_model = SVC(kernel='rbf', probability=True, random_state=42).fit(
    X_train_scaled, y_train
)

y_pred_svm, y_pred_proba_svm = (
    svm_model.predict(X_test_scaled),
    svm_model.predict_proba(X_test_scaled),
)

print("✓ Support Vector Machine selesai ditraining")


## 6. Model Evaluation


In [None]:
# Model evaluation helper
def evaluate_model(y_true, y_pred, model_name, target_names=None):
    """Print an evaluation report for one classifier and return its metrics.

    Prints accuracy, weighted precision/recall/F1 and the per-class
    classification report.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, same length as ``y_true``.
        model_name: Name shown in the report header.
        target_names: Display names of the classes, in label order. Defaults
            to the three Iris species. Labels are assumed to be encoded as
            ``0 .. len(target_names) - 1``.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1_score'`` (the last three are weighted averages).
    """
    if target_names is None:
        target_names = ['Setosa', 'Versicolor', 'Virginica']
    # Pin the label set explicitly: without `labels`, classification_report
    # raises ValueError when a class is missing from the evaluated data and
    # the number of target_names no longer matches.
    labels = list(range(len(target_names)))

    print(f"\n{'='*60}")
    print(f"EVALUASI MODEL: {model_name}")
    print(f"{'='*60}")

    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"\n1. Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # Precision, recall and F1 as support-weighted averages.
    # zero_division=0 yields the same value as the default but without the
    # undefined-metric warning when a class is never predicted.
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"2. Precision (weighted): {precision:.4f}")
    print(f"3. Recall (weighted): {recall:.4f}")
    print(f"4. F1-Score (weighted): {f1:.4f}")

    # Per-class breakdown
    print("\n5. Classification Report:")
    print(classification_report(y_true, y_pred,
                                labels=labels,
                                target_names=target_names,
                                zero_division=0))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }


In [None]:
# Evaluate every trained model on the held-out test set and collect the
# returned metric dictionaries, keyed by model name.
predictions_by_model = {
    'Logistic Regression': y_pred_lr,
    'Decision Tree': y_pred_dt,
    'K-Nearest Neighbors': y_pred_knn,
    'Support Vector Machine': y_pred_svm,
}

results = {}
for model_label, predicted in predictions_by_model.items():
    results[model_label] = evaluate_model(y_test, predicted, model_label)


### 6.1 Confusion Matrix


In [None]:
# Confusion-matrix heatmaps for the four models in a 2x2 grid, also written
# to 'confusion_matrices.png'.
models = [
    ('Logistic Regression', y_pred_lr),
    ('Decision Tree', y_pred_dt),
    ('K-Nearest Neighbors', y_pred_knn),
    ('Support Vector Machine', y_pred_svm)
]
species_labels = ['Setosa', 'Versicolor', 'Virginica']

fig, axes = plt.subplots(2, 2, figsize=(16, 14))

for ax, (name, y_pred) in zip(axes.flat, models):
    cm = confusion_matrix(y_test, y_pred)

    # Rows are true classes, columns are predicted classes.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=species_labels,
                yticklabels=species_labels,
                ax=ax, cbar_kws={'label': 'Count'})

    ax.set_title(f'Confusion Matrix: {name}',
                 fontsize=12, fontweight='bold', pad=10)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Confusion matrices berhasil divisualisasikan")


### 6.2 ROC Curve


In [None]:
# One-vs-rest ROC curves for the multiclass problem: the test labels are
# binarised into one indicator column per class, and each column is scored
# against the matching probability column of every model. The figure is also
# written to 'roc_curves.png'.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test_bin.shape[1]

models_proba = [
    ('Logistic Regression', y_pred_proba_lr),
    ('Decision Tree', y_pred_proba_dt),
    ('K-Nearest Neighbors', y_pred_proba_knn),
    ('Support Vector Machine', y_pred_proba_svm)
]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
class_names = ['Setosa', 'Versicolor', 'Virginica']

fig, axes = plt.subplots(2, 2, figsize=(16, 14))

for ax, (name, y_proba) in zip(axes.flat, models_proba):
    # One curve per class, annotated with its AUC.
    for i in range(n_classes):
        truth, score = y_test_bin[:, i], y_proba[:, i]
        fpr, tpr, _ = roc_curve(truth, score)
        roc_auc = roc_auc_score(truth, score)

        ax.plot(fpr, tpr, color=colors[i], lw=2,
                label=f'{class_names[i]} (AUC = {roc_auc:.3f})')

    # Chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=10)
    ax.set_ylabel('True Positive Rate', fontsize=10)
    ax.set_title(f'ROC Curve: {name}', fontsize=12, fontweight='bold')
    ax.legend(loc='lower right', fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ ROC curves berhasil divisualisasikan")


In [None]:
# Comparison table: one row per model, one column per metric (rounded to
# four decimals), plus the accuracy expressed as a percentage.
comparison_df = pd.DataFrame(results).T.round(4)
comparison_df['accuracy_pct'] = comparison_df['accuracy'].mul(100).round(2)

separator = "=" * 70
print("\n" + separator)
print("PERBANDINGAN PERFORMA MODEL")
print(separator)
print(comparison_df)
print("\n" + separator)


In [None]:
# Bar charts comparing the models, one subplot per metric (2x2 grid), also
# written to 'model_comparison.png'.
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors_bar = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#95E1D3']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for ax, metric, metric_name in zip(axes.flat, metrics, metric_names):
    data = comparison_df[metric]
    bars = ax.bar(data.index, data.values, color=colors_bar)

    # Write each bar's value just above it.
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        ax.text(x_center, height, f'{height:.4f}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

    ax.set_title(f'{metric_name} Comparison', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric_name)
    ax.set_ylim([0, 1.1])
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualisasi perbandingan model selesai")


In [None]:
# Radar (polar) chart overlaying the four metrics of every model, also
# written to 'radar_comparison.png'.
from math import pi

categories = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
N = len(categories)

# One evenly spaced angle per metric; the first angle is repeated at the
# end so each polygon closes.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles.append(angles[0])

colors_radar = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#95E1D3']

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='polar')

for model_name, color in zip(comparison_df.index, colors_radar):
    values = comparison_df.loc[model_name, metrics].values.tolist()
    values.append(values[0])

    ax.plot(angles, values, 'o-', linewidth=2, label=model_name, color=color)
    ax.fill(angles, values, alpha=0.15, color=color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, size=11)
ax.set_ylim(0, 1)
ax.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8', '1.0'], size=9)
ax.grid(True)

plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=10)
plt.title('Radar Chart - Perbandingan Performa Model', size=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('radar_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Radar chart selesai")


## 8. Cross-Validation


In [None]:
# 5-fold cross-validation of every model on the training partition.
#
# The scaler is placed inside a pipeline and the *unscaled* X_train is
# passed in, so the scaling statistics are re-estimated from the training
# part of each fold. Cross-validating on X_train_scaled instead would let
# every validation fold influence the preprocessing (data leakage), because
# that array was scaled with statistics from the full training set.
from sklearn.pipeline import make_pipeline

print("\nMelakukan 5-Fold Cross-Validation...\n")

cv_results = {}
models_cv = {
    'Logistic Regression': lr_model,
    'Decision Tree': dt_model,
    'K-Nearest Neighbors': knn_model,
    'Support Vector Machine': svm_model
}

for name, model in models_cv.items():
    # cross_val_score clones the estimator it is given, so the already
    # fitted models above are not refitted or modified.
    fold_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(fold_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_results[name] = {
        'scores': scores,
        'mean': scores.mean(),
        'std': scores.std()
    }
    print(f"{name}:")
    print(f"  Scores: {scores}")
    print(f"  Mean Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
    print()


In [None]:
# Bar chart of the mean cross-validation accuracy per model with the
# standard deviation as error bars, also written to 'cross_validation.png'.
# The bar colours come from `colors_bar`, defined in the metric-comparison
# cell.
model_names = list(cv_results.keys())
means = [stats['mean'] for stats in cv_results.values()]
stds = [stats['std'] for stats in cv_results.values()]
x_pos = np.arange(len(model_names))

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(x_pos, means, yerr=stds, capsize=10, color=colors_bar,
              alpha=0.8, edgecolor='black', linewidth=1.5)

# Label each bar above its error bar with "mean (±std)".
for bar, mean, std in zip(bars, means, stds):
    label_x = bar.get_x() + bar.get_width()/2.
    label_y = bar.get_height() + std
    ax.text(label_x, label_y,
            f'{mean:.4f}\n(±{std:.4f})',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('5-Fold Cross-Validation Results', fontsize=14, fontweight='bold', pad=15)
ax.set_xticks(x_pos)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.set_ylim([0, 1.1])
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('cross_validation.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Cross-validation visualization selesai")


## 9. Kesimpulan


In [None]:
# Pick the best model by test accuracy and print the summary.
# NOTE: idxmax returns the first row on ties, so when several models share
# the top accuracy the winner is decided by row order in comparison_df.
best_model_name = comparison_df['accuracy'].idxmax()
best_accuracy = comparison_df['accuracy'].max()
lowest_accuracy = comparison_df['accuracy'].min()

print("\n" + "="*70)
print("KESIMPULAN ANALISIS")
print("="*70)
print("\n1. Dataset Iris berhasil diklasifikasikan menggunakan 4 algoritma:")
print("   - Logistic Regression")
print("   - Decision Tree")
print("   - K-Nearest Neighbors")
print("   - Support Vector Machine")
print(f"\n2. MODEL TERBAIK: {best_model_name}")
print(f"   Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"   Precision: {comparison_df.loc[best_model_name, 'precision']:.4f}")
print(f"   Recall: {comparison_df.loc[best_model_name, 'recall']:.4f}")
print(f"   F1-Score: {comparison_df.loc[best_model_name, 'f1_score']:.4f}")

print("\n3. Perbandingan Performa (berdasarkan Accuracy):")
sorted_models = comparison_df.sort_values('accuracy', ascending=False)
for idx, (model, row) in enumerate(sorted_models.iterrows(), 1):
    print(f"   {idx}. {model}: {row['accuracy']:.4f} ({row['accuracy']*100:.2f}%)")

print("\n4. Insight:")
# Only claim "> 90%" when the measured results actually support it.
if lowest_accuracy > 0.90:
    print("   - Semua model menunjukkan performa yang sangat baik (accuracy > 90%)")
else:
    print(f"   - Tidak semua model mencapai accuracy > 90% (terendah: {lowest_accuracy*100:.2f}%)")
print("   - Dataset Iris relatif mudah untuk diklasifikasikan")
# NOTE(review): the overfitting statement below is a fixed string; it is not
# derived from cv_results. Confirm against the cross-validation output.
print("   - Tidak ada overfitting yang signifikan berdasarkan cross-validation")
print("   - Fitur-fitur dalam dataset memiliki daya diskriminatif yang tinggi")

print("\n5. Rekomendasi:")
print(f"   - Model {best_model_name} direkomendasikan untuk produksi")
print("   - Pertimbangkan ensemble methods untuk meningkatkan performa")
print("   - Lakukan hyperparameter tuning untuk optimasi lebih lanjut")

print("\n" + "="*70)


## 10. Simpan Model dan Hasil


In [None]:
# Persist the outputs: the comparison table as CSV, and the best model plus
# the fitted scaler as pickle files.
import pickle

comparison_df.to_csv('model_comparison_results.csv')
print("✓ Hasil perbandingan disimpan ke 'model_comparison_results.csv'")

# Lookup from display name to fitted estimator, used to resolve the winner.
best_models = {
    'Logistic Regression': lr_model,
    'Decision Tree': dt_model,
    'K-Nearest Neighbors': knn_model,
    'Support Vector Machine': svm_model
}

# The model file name is derived from the model's display name
# (spaces become underscores, lower-cased).
model_filename = f'best_model_{best_model_name.replace(" ", "_").lower()}.pkl'
artifacts = (
    (model_filename, best_models[best_model_name]),
    ('scaler.pkl', scaler),
)
for target_path, artifact in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(artifact, f)

print(f"✓ Model terbaik ({best_model_name}) dan scaler disimpan")
print("\n" + "="*70)
print("ANALISIS SELESAI!")
print("="*70)
