# Data Preprocessing

In [1]:
# Data Preprocessing
# penanganan outliers yang terdeteksi oleh multi-model PyCaret (ABOD, KNN, COF).

print("DATA PREPROCESSING NOTEBOOK")
print("=" * 50)
print("Tahap preprocessing data untuk dataset Iris")
print("Berdasarkan hasil analisis dari DataUnderstanding notebook")
print("Penanganan outliers dari multi-model PyCaret (ABOD, KNN, COF)")

DATA PREPROCESSING NOTEBOOK
Tahap preprocessing data untuk dataset Iris
Berdasarkan hasil analisis dari DataUnderstanding notebook
Penanganan outliers dari multi-model PyCaret (ABOD, KNN, COF)


## Import Libraries dan Setup Environment

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

try:
    from pycaret.datasets import get_data
    from pycaret.classification import *
    from pycaret.anomaly import *
    print("PyCaret berhasil diimport")
except ImportError:
    print("PyCaret tidak tersedia. Install dengan: pip install pycaret")

plt.style.use('default')
sns.set_palette("husl")

print("Libraries berhasil diimport untuk preprocessing")
print("Libraries yang tersedia:")
print("   • Pandas & NumPy: Data manipulation")
print("   • Matplotlib & Seaborn: Visualisasi")
print("   • Scikit-learn: Preprocessing tools")
print("   • PyCaret: Advanced ML preprocessing")

PyCaret berhasil diimport
Libraries berhasil diimport untuk preprocessing
Libraries yang tersedia:
   • Pandas & NumPy: Data manipulation
   • Matplotlib & Seaborn: Visualisasi
   • Scikit-learn: Preprocessing tools
   • PyCaret: Advanced ML preprocessing


## Load Data dan Hasil Outlier Detection


In [3]:
# Load dataset Iris dengan hasil outlier detection dari DataUnderstanding
print("=== LOADING DATA DAN HASIL OUTLIER DETECTION ===")

try:
    # Load data dari file CSV atau PyCaret
    try:
        # Coba load dari file lokal
        df = pd.read_csv('data_iris.csv', delimiter=';')
        
        # Konversi kolom numerik yang menggunakan koma sebagai decimal separator
        numeric_columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
        
        for col in numeric_columns:
            if col in df.columns:
                df[col] = df[col].astype(str).str.replace(',', '.').astype(float)
        
        # Buat kolom species numerik dan species name jika 'Class' ada
        if 'Class' in df.columns:
            df['species'] = df['Class'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})
            df['species_name'] = df['Class'].map({'Iris-setosa': 'setosa', 'Iris-versicolor': 'versicolor', 'Iris-virginica': 'virginica'})
            df = df.drop('Class', axis=1)
        else:
            print("Kolom 'Class' tidak ditemukan di data, lewati mapping species.")
        
        # Rename kolom untuk konsistensi dengan format sklearn
        df = df.rename(columns={
            'sepal length': 'sepal length (cm)',
            'sepal width': 'sepal width (cm)', 
            'petal length': 'petal length (cm)',
            'petal width': 'petal width (cm)'
        })
        
        # Drop kolom yang tidak diperlukan
        if 'id' in df.columns:
            df = df.drop('id', axis=1)
        
        print("Dataset Iris berhasil dimuat dari data_iris.csv")
        
    except FileNotFoundError:
        print("File lokal tidak ditemukan. Menggunakan dataset Iris dari PyCaret...")
        try:
            df = get_data('iris')
            df['species'] = df['species'].map({'setosa': 0, 'versicolor': 1, 'virginica': 2})
            df['species_name'] = df['species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
            print("Dataset Iris berhasil dimuat dari PyCaret")
        except:
            print("Error: Tidak dapat memuat dataset dari PyCaret")
    
    features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
    
    print(f"\nInfo Dataset:")
    print(f"   • Ukuran: {df.shape[0]} baris, {df.shape[1]} kolom")
    print(f"   • Features: {features}")
    print(f"   • Target: species (0=setosa, 1=versicolor, 2=virginica)")
    
    print(f"\nSample Data:")
    print(df.head())
    
except Exception as e:
    print(f"Error loading data: {e}")
    df = None


=== LOADING DATA DAN HASIL OUTLIER DETECTION ===
Kolom 'Class' tidak ditemukan di data, lewati mapping species.
Dataset Iris berhasil dimuat dari data_iris.csv

Info Dataset:
   • Ukuran: 150 baris, 1 kolom
   • Features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
   • Target: species (0=setosa, 1=versicolor, 2=virginica)

Sample Data:
  id,Class,sepal length,sepal width,petal length,petal width
0                      1,Iris-setosa,5.1,3.5,1.4,0.2        
1                        2,Iris-setosa,4.9,3,1.4,0.2        
2                      3,Iris-setosa,4.7,3.2,1.3,0.2        
3                      4,Iris-setosa,4.6,3.1,1.5,0.2        
4                        5,Iris-setosa,5,3.6,1.4,0.2        


In [4]:
# === IMPORT LIBRARY ===
%pip install openpyxl
import pandas as pd
import numpy as np
from scipy import stats

# PyCaret Anomaly
from pycaret.anomaly import setup, create_model, assign_model

# === LOAD DATASET ===
df = pd.read_csv("data_iris.csv")

# Pastikan hanya ambil fitur numerik, buang 'id' kalau tidak perlu
numeric_features = [col for col in df.select_dtypes(include=['int64', 'float64']).columns if col != 'id']
print("Fitur numerik:", numeric_features)


# === 1. DETEKSI OUTLIER MENGGUNAKAN METODE STATISTIK ===
print("\n=== DETEKSI OUTLIER DENGAN METODE STATISTIK ===")

# Metode Z-Score
z_scores = np.abs(stats.zscore(df[numeric_features]))
outliers_z = (z_scores > 3).any(axis=1)

# Metode IQR
Q1 = df[numeric_features].quantile(0.25)
Q3 = df[numeric_features].quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((df[numeric_features] < (Q1 - 1.5 * IQR)) | (df[numeric_features] > (Q3 + 1.5 * IQR))).any(axis=1)

df['Outlier_Statistik'] = outliers_z | outliers_iqr
print("Jumlah outlier (statistik):", df['Outlier_Statistik'].sum())


# === 2. DETEKSI OUTLIER MENGGUNAKAN PYCARET ===
print("\n=== DETEKSI OUTLIER DENGAN PYCARET ===")

try:
    # Setup PyCaret versi baru (tanpa silent/train_size)
    s = setup(data=df[numeric_features], session_id=123, verbose=False)

    models = ["abod", "knn", "cof"]
    for m in models:
        model = create_model(m)
        results = assign_model(model)
        print(f"\nModel {m.upper()}:")
        print(results['Anomaly'].value_counts())

        # Simpan hasil ke dataframe utama
        df[f"Outlier_{m.upper()}"] = results['Anomaly']

except Exception as e:
    print("Error PyCaret:", e)
    print("Lanjut menggunakan metode statistik saja...")


# === SIMPAN HASIL KE FILE BARU ===
df.to_excel("hasil_outlier.xlsx", index=False)
print("\n✓ Deteksi outlier selesai, hasil disimpan ke hasil_outlier.xlsx")


Note: you may need to restart the kernel to use updated packages.
Fitur numerik: ['sepal length', 'sepal width', 'petal length', 'petal width']

=== DETEKSI OUTLIER DENGAN METODE STATISTIK ===
Jumlah outlier (statistik): 4

=== DETEKSI OUTLIER DENGAN PYCARET ===



Model ABOD:
Anomaly
0    142
1      8
Name: count, dtype: int64



Model KNN:
Anomaly
0    142
1      8
Name: count, dtype: int64



Model COF:
Anomaly
0    142
1      8
Name: count, dtype: int64



✓ Deteksi outlier selesai, hasil disimpan ke hasil_outlier.xlsx
