In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif, SelectKBest, f_classif, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from scipy.stats import f_oneway


In [2]:
# Load Data
df = pd.read_csv('KlasifikasiUTS.csv')
df.drop_duplicates(inplace=True)

In [3]:
# Feature Encoding - Label Encoding for categorical columns
cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

# Feature Selection - Variance Threshold
selector = VarianceThreshold(threshold=0.01)
selector.fit(df.drop(columns=['Class']))
df.drop(columns=df.drop(columns=['Class']).columns[~selector.get_support()], inplace=True)

# Feature Selection - Remove duplicate features
duplicated_features = []
cols = df.drop(columns=['Class']).columns
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        if np.allclose(df[cols[i]].values, df[cols[j]].values):
            duplicated_features.append(cols[j])
df.drop(columns=np.unique(duplicated_features), inplace=True)

In [4]:
# Correlation Analysis
corr_matrix = df.corr()
high_corr_var = set()
threshold = 0.95
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            colname = corr_matrix.columns[i]
            high_corr_var.add(colname)
df.drop(columns=high_corr_var, inplace=True)

In [5]:
# Train-Test Split
X = df.drop(columns=['Class'])
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [6]:
# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Selection - Mutual Information
mi = mutual_info_classif(X_train_scaled, y_train)
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)

# Feature Selection - Chi-Square
X_chi2 = SelectKBest(score_func=chi2, k=20).fit(X_train_scaled, y_train)
chi2_features = X.columns[X_chi2.get_support()]

# Feature Selection - ANOVA F-test
anova_selector = SelectKBest(score_func=f_classif, k=20)
anova_selector.fit(X_train_scaled, y_train)
anova_features = X.columns[anova_selector.get_support()]

# Combine features from MI, Chi2, and ANOVA (intersection)
selected_features = list(set(mi_series.head(20).index) & set(chi2_features) & set(anova_features))
X_train_sel = X_train_scaled[:, [X.columns.get_loc(f) for f in selected_features]]
X_test_sel = X_test_scaled[:, [X.columns.get_loc(f) for f in selected_features]]

ValueError: Input X must be non-negative.