In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Simulated data based on the provided sample rows.
# In real use, the data would be read from a CSV file instead.
data = {
    'Switch_ID': ['of:0000000000000006', 'of:0000000000000006', 'of:0000000000000009', 
                  'of:0000000000000009', 'of:0000000000000009'],
    'Port_Number': [2, 3, 1, 2, 3],
    'Received_Packets': [228301957, 106418962, 104225252, 149012143, 228303772],
    'Received_Bytes': [857046145141, 205637732338, 201278569761, 601152950921, 525860011138],
    'Sent_Bytes': [276, 276, 451402760, 276, 276],
    'Sent_Packets': [0, 0, 0, 0, 0],
    'Port_Alive_Duration': [0, 0, 0, 0, 0],
    'Packets_Rx_Dropped': [0, 0, 0, 0, 0],
    'Packets_Tx_Dropped': [0, 0, 0, 0, 0],
    'Packets_Rx_Errors': [0, 0, 0, 0, 0],
    'is_valid': [True, True, True, True, True],
    'Table_ID': [0, 0, 0, 0, 0],
    'Active_Flow_Entries': [6147563, 6147563, 8295511, 8295511, 8295511],
    'Packets_Looked_Up': [147454, 147454, 295404, 295404, 295404],
    'Packets_Matched': [-1, -1, -1, -1, -1],
    'Max_Size': [-1, -1, -1, -1, -1],
    'Label': ['PortScanAttack', 'PortScanAttack', 'PortScanAttack', 'PortScanAttack', 'PortScanAttack']
}

# Build the DataFrame and print a quick overview of it
df = pd.DataFrame(data)
print("Dataset Asli:")
print(df.head())
print(f"\nShape dataset: {df.shape}")
print("\nInfo dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# ===============================
# 1. PREPROCESSING AND ENCODING
# ===============================

print("\n" + "="*50)
print("1. PREPROCESSING DAN ENCODING")
print("="*50)

# Work on a copy so the original DataFrame stays untouched
df_processed = df.copy()

# Split the feature columns (everything except the target 'Label') by dtype.
# pandas' dtype helpers are used instead of comparing against the strings
# 'object' / 'bool', so string and other extension dtypes are classified as
# categorical rather than silently falling into the numeric list.
categorical_columns = []
numerical_columns = []

for col in df.columns:
    if col == 'Label':  # Skip the target column
        continue
    if pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col]):
        numerical_columns.append(col)
    else:
        # Strings, categories, booleans and any other non-numeric dtype
        categorical_columns.append(col)

print(f"Kolom kategorikal: {categorical_columns}")
print(f"Kolom numerik: {numerical_columns}")

# Fitted encoders are kept, keyed by original column name, for later reuse
label_encoders = {}

# Label-encode every categorical column into a new '<name>_encoded' column.
# Values are cast to str first so mixed/boolean values encode uniformly.
for col in categorical_columns:
    le = LabelEncoder()
    df_processed[col + '_encoded'] = le.fit_transform(df_processed[col].astype(str))
    label_encoders[col] = le
    print(f"Encoding {col}: {len(le.classes_)} unique values")

# Label-encode the target variable
if 'Label' in df_processed.columns:
    le_target = LabelEncoder()
    df_processed['Label_encoded'] = le_target.fit_transform(df_processed['Label'])
    label_encoders['Label'] = le_target
    print(f"Target encoding: {dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))}")

# Drop the original columns that now have an encoded counterpart
columns_to_drop = list(categorical_columns)
if 'Label' in df_processed.columns:
    columns_to_drop.append('Label')
df_processed = df_processed.drop(columns=columns_to_drop)

print("\nDataset setelah encoding:")
print(df_processed.head())
print(f"Shape: {df_processed.shape}")

# ===============================
# 2. FEATURE SELECTION
# ===============================

print("\n" + "="*50)
print("2. FEATURE SELECTION")
print("="*50)

# Pick the target column; fall back to the last column when the encoded
# label is not present in the processed frame.
target_column = 'Label_encoded'
if target_column not in df_processed.columns:
    print("Warning: Target column 'Label_encoded' not found. Using last column as target.")
    target_column = df_processed.columns[-1]

# Separate the feature matrix from the target vector
y = df_processed[target_column]
X = df_processed.drop(columns=[target_column])

print(f"Features: {list(X.columns)}")
print(f"Target: {target_column}")
print(f"Shape X: {X.shape}, Shape y: {y.shape}")

# Safety net: every feature column must be numeric before scaling, so any
# remaining non-numeric column is label-encoded in place.
non_numeric_features = list(X.select_dtypes(exclude="number").columns)
if len(non_numeric_features) > 0:
    print(f"Warning: Non-numeric features found: {non_numeric_features}")
    print("Converting to numeric...")
    for col in non_numeric_features:
        le = LabelEncoder()
        as_text = X[col].astype(str)
        X[col] = le.fit_transform(as_text)
        print(f"Converted {col} to numeric")

# Menghapus kolom dengan nilai konstan atau hampir konstan
def remove_constant_features(X, threshold=0.95):
    """Drop numeric features whose values are constant or nearly constant.

    A numeric (non-boolean) column is considered (near-)constant when its most
    frequent non-null value accounts for at least ``threshold`` of all rows.
    A numeric column that contains only missing values is also treated as
    constant, since it has no varying values at all. Non-numeric and boolean
    columns are never removed.

    Args:
        X: pandas DataFrame of features.
        threshold: Minimum fraction of rows (0-1) that the most frequent value
            must cover for the column to be removed.

    Returns:
        A tuple ``(X_reduced, constant_features)`` where ``X_reduced`` is a new
        DataFrame without the removed columns and ``constant_features`` is the
        list of removed column names in their original order.
    """
    n_rows = len(X)
    constant_features = []

    # With zero rows there is nothing to measure; keep every column.
    if n_rows:
        for col in X.columns:
            series = X[col]
            # Accept every numeric width (int32, float32, unsigned, ...), not
            # only int64/float64; booleans are left alone as before.
            if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
                continue

            # value_counts() is sorted by frequency and ignores missing values
            counts = series.value_counts()
            if counts.empty:
                # Column holds only missing values
                constant_features.append(col)
                continue

            most_frequent_pct = counts.iloc[0] / n_rows
            if most_frequent_pct >= threshold:
                constant_features.append(col)

    print(f"Fitur dengan nilai konstan/hampir konstan (>{threshold*100}%): {constant_features}")
    return X.drop(columns=constant_features), constant_features

# Remove features whose most frequent value covers at least 80% of the rows
X_filtered, removed_features = remove_constant_features(X, threshold=0.8)
print(f"Fitur setelah menghapus konstan: {list(X_filtered.columns)}")

# ===============================
# 3. NORMALIZATION
# ===============================

print("\n" + "="*50)
print("3. NORMALISASI")
print("="*50)

# Partition the remaining columns into numeric and non-numeric ones
numeric_columns = list(X_filtered.select_dtypes(include="number").columns)
non_numeric_columns = list(X_filtered.select_dtypes(exclude="number").columns)

print(f"Kolom numerik: {numeric_columns}")
print(f"Kolom non-numerik: {non_numeric_columns}")

if not non_numeric_columns:
    # Everything is already numeric: scale the filtered frame directly
    X_for_scaling = X_filtered[numeric_columns]
else:
    # Label-encode leftover non-numeric columns on a copy before scaling
    print(f"Melakukan encoding tambahan untuk kolom: {non_numeric_columns}")
    X_filtered_encoded = X_filtered.copy()

    for col in non_numeric_columns:
        if col not in X_filtered_encoded.columns:
            continue
        le = LabelEncoder()
        X_filtered_encoded[col] = le.fit_transform(X_filtered_encoded[col].astype(str))
        print(f"Encoded {col}: {len(le.classes_)} unique values")

    # Recompute the numeric column list now that the encoding is done
    numeric_columns = list(X_filtered_encoded.select_dtypes(include="number").columns)
    X_for_scaling = X_filtered_encoded[numeric_columns]

print(f"Kolom yang akan dinormalisasi: {list(X_for_scaling.columns)}")

# StandardScaler: z-score normalization (zero mean, unit variance per column)
scaler_standard = StandardScaler()
X_standard = scaler_standard.fit_transform(X_for_scaling)
X_standard_df = pd.DataFrame(X_standard, columns=X_for_scaling.columns)

print("Statistik sebelum normalisasi:")
print(X_for_scaling.describe())

print("\nStatistik setelah StandardScaler:")
print(X_standard_df.describe())

# MinMaxScaler: rescale every column into the [0, 1] range
scaler_minmax = MinMaxScaler()
X_minmax = scaler_minmax.fit_transform(X_for_scaling)
X_minmax_df = pd.DataFrame(X_minmax, columns=X_for_scaling.columns)

print("\nStatistik setelah MinMaxScaler:")
print(X_minmax_df.describe())

# ===============================
# 4. PRINCIPAL COMPONENT ANALYSIS (PCA)
# ===============================

print("\n" + "="*50)
print("4. PRINCIPAL COMPONENT ANALYSIS (PCA)")
print("="*50)

# Full PCA on the standardized data, used to inspect the variance spectrum
pca = PCA()
X_pca = pca.fit_transform(X_standard)

# Per-component and running-total share of explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

print("Explained Variance Ratio per komponen:")
for i, ratio in enumerate(explained_variance_ratio):
    pct = ratio * 100
    print(f"PC{i+1}: {ratio:.4f} ({pct:.2f}%)")

print("\nCumulative Explained Variance Ratio:")
for i, cum_ratio in enumerate(cumulative_variance_ratio):
    cum_pct = cum_ratio * 100
    print(f"PC1-PC{i+1}: {cum_ratio:.4f} ({cum_pct:.2f}%)")

# Smallest number of components whose cumulative variance reaches 95%:
# argmax on the boolean mask gives the index of the first True entry.
n_components_95 = (cumulative_variance_ratio >= 0.95).argmax() + 1
print(f"\nJumlah komponen untuk 95% varians: {n_components_95}")

# Refit PCA keeping only that many components
pca_optimal = PCA(n_components=n_components_95)
X_pca_optimal = pca_optimal.fit_transform(X_standard)

print(f"Shape data setelah PCA: {X_pca_optimal.shape}")

# Wrap the reduced data in a DataFrame with PC1..PCk column names
pca_columns = [f'PC{k}' for k in range(1, n_components_95 + 1)]
X_pca_df = pd.DataFrame(X_pca_optimal, columns=pca_columns)

print("\nData setelah PCA:")
print(X_pca_df.head())

# ===============================
# 5. VISUALIZATION
# ===============================

print("\n" + "="*50)
print("5. VISUALISASI")
print("="*50)

# One figure holding a 2x3 grid of diagnostic plots. Each plt.subplot() call
# makes that grid cell the current axes, so the plt.* calls that follow draw
# into it; the order of the statements below therefore matters.
plt.figure(figsize=(15, 10))

# Plot 1: explained variance ratio of each principal component (full PCA fit)
plt.subplot(2, 3, 1)
plt.bar(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio)
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance Ratio per PC')
plt.xticks(range(1, len(explained_variance_ratio) + 1))

# Plot 2: cumulative explained variance with the 95% threshold marked
plt.subplot(2, 3, 2)
plt.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, 'bo-')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Cumulative Explained Variance')
plt.legend()
plt.grid(True)

# Plot 3: feature importance in PC1, measured as the absolute loading of each
# feature on the first component
plt.subplot(2, 3, 3)
if len(pca.components_) > 0 and len(X_for_scaling.columns) > 0:
    pc1_importance = np.abs(pca.components_[0])
    feature_names = X_for_scaling.columns
    if len(pc1_importance) == len(feature_names):
        # Indices of the features ordered from largest to smallest loading
        sorted_idx = np.argsort(pc1_importance)[::-1]
        # Show at most the top 10 features
        n_features_to_show = min(10, len(pc1_importance))
        plt.barh(range(n_features_to_show), pc1_importance[sorted_idx[:n_features_to_show]])
        plt.yticks(range(n_features_to_show), [feature_names[sorted_idx[i]] for i in range(n_features_to_show)])
        plt.xlabel('Absolute Loading')
        plt.title('Top Feature Importance in PC1')
    else:
        # Loadings and feature names disagree in length: show a placeholder
        plt.text(0.5, 0.5, 'Feature names mismatch', ha='center', va='center', transform=plt.gca().transAxes)
else:
    plt.text(0.5, 0.5, 'No PCA components available', ha='center', va='center', transform=plt.gca().transAxes)

# Plot 4: distribution of the first scaled feature before and after
# standardization, drawn as overlaid density histograms
plt.subplot(2, 3, 4)
if len(X_for_scaling.columns) > 0:
    feature_example = X_for_scaling.columns[0]
    plt.hist(X_for_scaling[feature_example], alpha=0.7, label='Original', bins=20, density=True)
    plt.hist(X_standard_df[feature_example], alpha=0.7, label='Standardized', bins=20, density=True)
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.title(f'Distribution: {feature_example}')
    plt.legend()
else:
    plt.text(0.5, 0.5, 'No features available', ha='center', va='center', transform=plt.gca().transAxes)

# Plot 5: correlation between the standardized features
plt.subplot(2, 3, 5)
if X_standard_df.shape[1] > 1:
    correlation_matrix = X_standard_df.corr()
    # Limit the heatmap size when there are too many features
    if correlation_matrix.shape[0] > 10:
        # Keep only the first 10 features
        correlation_matrix = correlation_matrix.iloc[:10, :10]
    
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
                square=True, fmt='.2f', cbar_kws={'shrink': 0.8})
    plt.title('Feature Correlation Matrix (Normalized)')
else:
    plt.text(0.5, 0.5, 'Insufficient features for correlation', ha='center', va='center', transform=plt.gca().transAxes)

# Plot 6: samples projected onto the retained principal components
# (scatter of PC1 vs PC2 when at least 2 components were kept)
plt.subplot(2, 3, 6)
if X_pca_optimal.shape[1] >= 2:
    # Scatter plot coloured by the target values in y
    scatter = plt.scatter(X_pca_optimal[:, 0], X_pca_optimal[:, 1], c=y, cmap='viridis', alpha=0.7)
    plt.xlabel(f'PC1 ({explained_variance_ratio[0]*100:.1f}%)')
    plt.ylabel(f'PC2 ({explained_variance_ratio[1]*100:.1f}%)')
    plt.title('PCA: First Two Components')
    plt.colorbar(scatter, label='Label')
elif X_pca_optimal.shape[1] == 1:
    # Only one component was kept: show its distribution instead
    plt.hist(X_pca_optimal[:, 0], bins=20, alpha=0.7)
    plt.xlabel(f'PC1 ({explained_variance_ratio[0]*100:.1f}%)')
    plt.ylabel('Frequency')
    plt.title('PCA: First Component Distribution')
else:
    plt.text(0.5, 0.5, 'No PCA components available', ha='center', va='center', transform=plt.gca().transAxes)

# Adjust subplot spacing, then display the figure
plt.tight_layout()
plt.show()

# ===============================
# 6. SUMMARY AND RECOMMENDATIONS
# ===============================

print("\n" + "="*50)
print("6. SUMMARY DAN REKOMENDASI")
print("="*50)

# Shape of the data at each stage of the pipeline
print(f"Dataset Original: {df.shape}")
print(f"Setelah preprocessing: {df_processed.shape}")
print(f"Setelah feature selection: {X_filtered.shape}")
print(f"Setelah PCA: {X_pca_optimal.shape}")
# The arrow was previously stored as mis-encoded bytes ("â†’"); use the real character
print(f"Dimensi reduction: {X_filtered.shape[1]} → {X_pca_optimal.shape[1]} fitur")
print(f"Varians yang dipertahankan: {cumulative_variance_ratio[n_components_95-1]*100:.2f}%")

print(f"\nFitur yang dihapus karena konstan: {removed_features}")
print(f"Fitur akhir setelah PCA: {pca_columns}")

# Bundle every intermediate result and fitted transformer for later use
final_features = {
    'original_data': df,
    'processed_data': df_processed,
    'X_filtered': X_filtered,
    'X_standard': X_standard_df,
    'X_minmax': X_minmax_df,
    'X_pca': X_pca_df,
    'y': y,
    'scalers': {
        'standard': scaler_standard,
        'minmax': scaler_minmax
    },
    'pca_model': pca_optimal,
    'label_encoders': label_encoders,
    'removed_features': removed_features,
    'n_components': n_components_95
}

print("\nData siap untuk machine learning!")
print("Gunakan X_pca_df sebagai features dan y sebagai target")

# Example train/test split. Stratify on the target only when it has more than
# one class; stratify=None is train_test_split's default (no stratification).
X_train, X_test, y_train, y_test = train_test_split(
    X_pca_df,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y if len(y.unique()) > 1 else None,
)

print("\nContoh split data:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Classes in dataset: {y.unique()}")