#**Import dan Instal Library**

In [None]:
!pip install matplotlib-venn
!pip install jcopml

In [None]:
# Import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # Untuk membuat plot
import seaborn as sns
from jcopml.plot import plot_missing_value # Untuk plot missing value
from sklearn.model_selection import train_test_split # Untuk splitting dataset
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Mount Google Drive so the dataset CSV files under /content/drive can be read.
from google.colab import drive
# Flush and unmount first, then mount again with force_remount=True.
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)


#**Import Data**

In [None]:
# German dataset: load the CSV and rename its 'Status' column to 'TARGET'.
# NOTE(review): every dataset cell rebinds `df`; presumably only one of them is run per session.
df = pd.read_csv('/content/drive/MyDrive/Dataset/german.csv')
df.rename(columns={'Status':'TARGET'}, inplace=True)

In [None]:
# Japanese dataset: load the CSV and rename its 'A16' column to 'TARGET'.
# NOTE(review): every dataset cell rebinds `df`; presumably only one of them is run per session.
df = pd.read_csv('/content/drive/MyDrive/Dataset/japanese baru.csv')
df.rename(columns={'A16':'TARGET'}, inplace=True)

In [None]:
# HMEQ dataset: load the CSV and rename its 'BAD' column to 'TARGET'.
# NOTE(review): every dataset cell rebinds `df`; presumably only one of them is run per session.
df = pd.read_csv('/content/drive/MyDrive/Dataset/hmeq.csv')
df.rename(columns={'BAD':'TARGET'}, inplace=True)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

##**Statistika Deskriptif**

In [None]:
## Descriptive statistics
df.info() # Show the number of entries, the dtypes, and the non-null counts
print(df.nunique()) # Show the number of unique values per column

In [None]:
df['TARGET']=pd.Categorical(df.TARGET) # Convert the target column to a categorical dtype
# Re-inspect the frame after the dtype change and summarise the columns.
df.info()
df.describe()

#**Preprocessing**

##**Outlier**

In [None]:
# Keep only the numeric columns of the dataset (selected via np.number)
numerical_df = df.select_dtypes(include=[np.number])
numerical_df.head()

In [None]:
# Check the numeric data for outliers using boxplots
def num_plot(df, var):
    """Draw and show a boxplot of a single column.

    Args:
        df: DataFrame that holds the column.
        var: Label of the column to plot on the y axis.
    """
    # A bare `plt.subplot` attribute access used to sit here; it was never
    # called, so it had no effect and has been removed.
    sns.boxplot(y = df[var])
    plt.title("Boxplot")
    plt.show()

# Iterating a DataFrame yields its column labels: one boxplot per numeric column.
for var in numerical_df:
    num_plot(df, var)

In [None]:
# Remove outliers using the IQR method
def remove_outliers(df, column_list):
    """Return a copy of ``df`` without rows that are IQR outliers.

    The columns are processed one after another. For each column, rows whose
    value lies outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are dropped, and
    the quartiles of the next column are computed on the rows that remain.

    Rows whose value is missing (NaN) are kept: a missing value is not an
    outlier, and comparing NaN against the bounds is always False, which would
    otherwise drop every incomplete row.

    Args:
        df: Input DataFrame; it is not modified.
        column_list: Iterable of column labels to filter on. A DataFrame may
            be passed, since iterating it yields its column labels.

    Returns:
        A filtered copy of ``df``.
    """
    cleaned_df = df.copy()
    for column in column_list:
        values = cleaned_df[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Keep in-range values and missing values; drop only true outliers.
        within_bounds = (values >= lower_bound) & (values <= upper_bound)
        cleaned_df = cleaned_df[within_bounds | values.isna()]

    return cleaned_df

# Apply the function to the numeric columns (numerical_df is iterated as column labels)
df = remove_outliers(df, numerical_df)

In [None]:
## Descriptive statistics after outlier removal
df.info() # Show the number of entries, the dtypes, and the non-null counts

In [None]:
# Redraw the boxplots on the filtered data to inspect the effect of the outlier removal.
for var in numerical_df:
    num_plot(df, var)

##**Cek Data Imbalance**

In [None]:
# Check class imbalance: number of rows per TARGET value
df["TARGET"].value_counts()

In [None]:
import seaborn as sns
# Pass the column by keyword: current seaborn treats the first positional
# argument of countplot as `data`, not as the variable to count.
sns.countplot(x=df["TARGET"])

##**Cek Missing Value**

In [None]:
# Check missing values: count the nulls in each column
missing_values = df.isnull().sum(axis=0)
print(missing_values)
# Visualise the missing values with the jcopml helper.
plot_missing_value(df)

In [None]:
# Create a bar plot of missing values: one bar per column, height = null count
plt.figure(figsize=(10,5))
plt.bar(missing_values.index, missing_values.values)
plt.title('Missing Value Count')
plt.xlabel('Features')
plt.ylabel('Count')
plt.show()

##**Penanganan Missing Value, Standarisasi, dan One-Hot Encoding**

In [None]:
# Data preprocessing
# Column groups are selected by dtype. TARGET was converted to a categorical
# dtype earlier, so it matches neither selection.
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df.select_dtypes(include=['object']).columns

# Preprocessing for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the column mean
    ('scaler', StandardScaler())  # Standardise
])
# Preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the mode
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Categories unseen during fit encode as all zeros
])
# Bundle the numeric and categorical preprocessing.
# sparse_threshold=0 forces a dense ndarray as output: the one-hot step can
# otherwise make the combined result a scipy sparse matrix, which the later
# np.isnan checks and the Keras model cannot consume.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

#**Split Data**

In [None]:
# Separate the features from the label
X = df.drop(columns="TARGET")
y = df.TARGET
# Split into training and testing sets (30% test, stratified on the label, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y,
random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Check the per-column missing-value counts before preprocessing
print("Missing values in training data before preprocessing:\n", X_train.isna().sum())
print("Missing values in testing data before preprocessing:\n", X_test.isna().sum())

In [None]:
# Apply the preprocessing: fit on the training split only, then reuse the
# fitted transformer on the test split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Check the total missing-value count after preprocessing (on the transformed arrays)
# NOTE(review): np.isnan needs a dense ndarray; if the ColumnTransformer returned a
# scipy sparse matrix these calls would presumably fail — confirm the output type.
print("Missing values in training data after preprocessing:", np.isnan(X_train).sum())
print("Missing values in testing data after preprocessing:", np.isnan(X_test).sum())

#**SMOTE**

In [None]:
# Initialise SMOTE and resample the training data
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
# X_train / y_train are overwritten, so every later cell that uses them sees
# the resampled training set.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Count the rows per class after resampling. The top-level pd.value_counts()
# is deprecated in recent pandas; the Series method gives the same result.
class_counts = pd.Series(y_train).value_counts()
print(class_counts)

In [None]:
# Draw a horizontal bar chart of the class counts
plt.barh(class_counts.index, class_counts.values)
plt.xlabel('Count')
plt.ylabel('Class')
plt.title('Class Distribution after SMOTE')
# NOTE(review): the ticks are fixed at 0 and 1, which assumes the class labels are 0/1 — confirm per dataset.
plt.yticks([0, 1])
plt.show()

#**MIXED MODEL**

##**Deep Neural Network**

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.metrics import accuracy_score

def build_dnn_for_feature_extraction(input_shape):
    """Build a binary classifier DNN plus a model exposing its last hidden layer.

    The network is a stack of ReLU Dense layers with 32, 16 and 8 units,
    followed by a single sigmoid output unit. Both returned models share the
    same input and layers.

    Args:
        input_shape: Shape tuple passed to the Keras ``Input`` layer.

    Returns:
        A ``(model, feature_model)`` tuple: ``model`` is the compiled
        classifier ending in the sigmoid unit; ``feature_model`` outputs the
        8-unit hidden activations and is not compiled.
    """
    inputs = Input(shape=input_shape)

    # Hidden stack 32 -> 16 -> 8; the final activation is the feature vector.
    hidden = inputs
    for units in (32, 16, 8):
        hidden = Dense(units, activation='relu')(hidden)
    features = hidden
    prediction = Dense(1, activation='sigmoid')(features)

    classifier = Model(inputs=inputs, outputs=prediction)
    extractor = Model(inputs=inputs, outputs=features)

    classifier.compile(
        loss='binary_crossentropy',
        optimizer='Adam',
        metrics=['accuracy'],
    )

    return classifier, extractor

# Build the model and extract features on the full training set (no subset)
model, feature_model = build_dnn_for_feature_extraction((X_train.shape[1],))
model.fit(X_train, y_train, epochs=50, batch_size=30)

# feature_model shares layers with the trained model; its outputs are the
# 8-unit hidden activations used as inputs for the XGBoost models below.
extracted_features_train = feature_model.predict(X_train)
extracted_features_test = feature_model.predict(X_test)

###**Evaluasi dan Hasil**

In [None]:
# Evaluate the DNN on the test split; evaluate() returns the loss and the compiled 'accuracy' metric.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")

In [None]:
# Hard class predictions of the DNN: threshold the sigmoid output at 0.5.
# (This was previously assigned y_test itself, i.e. the ground truth.)
y_pred_dnn = (model.predict(X_test) > 0.5).astype(int).ravel()

In [None]:
print(extracted_features_train[:5])  # Show the first 5 rows of the extracted training features

In [None]:
import pandas as pd

# Convert the first 5 rows of the extracted feature array into a DataFrame
df_train = pd.DataFrame(extracted_features_train[:5])

# Display the DataFrame
print(df_train)

##**Extreme Gradient Boosting**

###**Tanpa Imbalanced**

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialise the XGBoost model
xgb_model = XGBClassifier(
   booster='gbtree',
   n_estimators=500,
   learning_rate=0.1,
   max_depth=6,
   subsample=0.8,
   colsample_bytree=0.8,
   gamma=1,
   reg_alpha=0.1,
   reg_lambda=1,
   objective='binary:logistic'
)

# Train XGBoost on the DNN-extracted features.
# NOTE(review): y_train and extracted_features_train come from the training set that
# SMOTE already resampled earlier in the notebook, so this run is presumably not a
# true "no imbalance handling" baseline — confirm.
xgb_model.fit(extracted_features_train, y_train)

# Predict on the extracted test features
predictions = xgb_model.predict(extracted_features_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set:", accuracy)

# Show the classification report
print(classification_report(y_test, predictions))


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


# Compute the confusion matrix from the hard class predictions
conf_matrix = confusion_matrix(y_test, predictions)

# Extract TP, TN, FP, FN (rows are true labels, columns are predicted labels)
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, xgb_model.predict_proba(extracted_features_test)[:, 1])

# Print the results
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

###Smote

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Resample the DNN-extracted training features with SMOTE.
# NOTE(review): extracted_features_train / y_train already derive from the training set
# that SMOTE resampled earlier in the notebook, so this second pass presumably changes
# little — confirm.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(extracted_features_train, y_train)

# Initialise the XGBoost model
xgb_model = XGBClassifier(
   n_estimators=100,
   learning_rate=0.1,
   max_depth=6,
   subsample=0.8,
   colsample_bytree=0.8,
   gamma=1,
   reg_alpha=0.1,
   reg_lambda=1
)

# Train XGBoost on the resampled extracted features
xgb_model.fit(X_train_resampled, y_train_resampled)

# Predict on the extracted test features
predictions = xgb_model.predict(extracted_features_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set:", accuracy)

# Show the classification report
print(classification_report(y_test, predictions))


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


# Compute the confusion matrix from the hard class predictions
conf_matrix = confusion_matrix(y_test, predictions)

# Extract TP, TN, FP, FN (rows are true labels, columns are predicted labels)
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, xgb_model.predict_proba(extracted_features_test)[:, 1])

# Print the results
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

###Scale Pos Weight

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Ratio of class-0 rows to class-1 rows in y_train, used as scale_pos_weight.
# NOTE(review): y_train was already replaced by the SMOTE output earlier in the
# notebook, so this ratio is presumably 1 — confirm that is intended.
class_ratio = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Initialise the XGBoost model with scale_pos_weight
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=class_ratio
)

# Train XGBoost on the DNN-extracted features
xgb_model.fit(extracted_features_train, y_train)

# Predict on the extracted test features
predictions = xgb_model.predict(extracted_features_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy on test set:", accuracy)

# Show the classification report
print(classification_report(y_test, predictions))


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


# Compute the confusion matrix from the hard class predictions
conf_matrix = confusion_matrix(y_test, predictions)

# Extract TP, TN, FP, FN (rows are true labels, columns are predicted labels)
TP = conf_matrix[1, 1]
TN = conf_matrix[0, 0]
FP = conf_matrix[0, 1]
FN = conf_matrix[1, 0]

# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, xgb_model.predict_proba(extracted_features_test)[:, 1])

# Print the results
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

#**SINGLE MODEL**

##XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix, roc_auc_score

# Initialise the model with default hyperparameters
model_xgb = xgb.XGBClassifier()

# Train the model on the training data
model_xgb.fit(X_train, y_train)

# Predict on the test data
y_pred_xgb = model_xgb.predict(X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_xgb)

# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = conf_matrix.ravel()

# Type I and type II error counts
type_i_error = FP
type_ii_error = FN

# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, model_xgb.predict_proba(X_test)[:, 1])

# Print the results
print("XGBoost Model Metrics:")
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

##GBM

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialise the model with default hyperparameters
model_gbm = GradientBoostingClassifier()

# Train the model on the training data
model_gbm.fit(X_train, y_train)

# Predict on the test data
y_pred_gbm = model_gbm.predict(X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_gbm)

# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = conf_matrix.ravel()

# Type I and type II error counts
type_i_error = FP
type_ii_error = FN
# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, model_gbm.predict_proba(X_test)[:, 1])

# Print the results
print("Gradient Boosting Machine (GBM) Model Metrics:")
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

##Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Initialise the model with default hyperparameters.
# NOTE: this rebinds `model`, which earlier in the notebook holds the Keras DNN.
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred_lr = model.predict(X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_lr)

# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = conf_matrix.ravel()

# Type I and type II error counts
type_i_error = FP
type_ii_error = FN
# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# Print the results
print("Logistic Regression Model Metrics:")
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

##Decision Tree

In [None]:
from sklearn import tree, metrics, model_selection, preprocessing
from IPython.display import Image, display

In [None]:
# Train a decision tree limited to depth 3, using entropy as the split criterion
dtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)
dtree.fit(X_train, y_train)

In [None]:
# Hard class predictions of the decision tree on the test split
y_pred_dt = dtree.predict(X_test)

In [None]:
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_dt)

# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = conf_matrix.ravel()

# Type I and type II error counts
type_i_error = FP
type_ii_error = FN
# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, dtree.predict_proba(X_test)[:, 1])

# Print the results
print("Decision Tree Model Metrics:")
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

##Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest with default hyperparameters (no random_state is set).
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
# Hard class predictions of the random forest on the test split
y_pred_rf = rf.predict(X_test)

In [None]:
# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_rf)

# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = conf_matrix.ravel()

# Type I and type II error counts
type_i_error = FP
type_ii_error = FN
# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

# Print the results
print("Random Forest Model Metrics:")
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

##Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Initialise the SVM model with default hyperparameters
svm_model = SVC()

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Predict on the test data
y_pred_svm = svm_model.predict(X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_svm)

# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = conf_matrix.ravel()

# Type I and type II error counts
type_i_error = FP
type_ii_error = FN
# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with continuous scores, not with the
# thresholded 0/1 predictions. SVC() is built without probability=True, so
# the signed distance from decision_function is used as the score.
roc_auc = roc_auc_score(y_test, svm_model.decision_function(X_test))

# Print the results
print("Support Vector Machine Model Metrics:")
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)


##ADA BOOST

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Initialise the model with default hyperparameters
model_adaboost = AdaBoostClassifier()

# Train the model on the training data
model_adaboost.fit(X_train, y_train)

# Predict on the test data
y_pred_adaboost = model_adaboost.predict(X_test)

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_adaboost)

# Unpack the 2x2 confusion matrix in row-major order
TN, FP, FN, TP = conf_matrix.ravel()

# Type I and type II error counts
type_i_error = FP
type_ii_error = FN
# Compute the remaining metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * (precision * recall) / (precision + recall)
# ROC AUC is a ranking metric: score it with the positive-class probabilities,
# not with the already-thresholded 0/1 predictions.
roc_auc = roc_auc_score(y_test, model_adaboost.predict_proba(X_test)[:, 1])

# Print the results
print("Ada Boost Model Metrics:")
print("Confusion Matrix:")
print(conf_matrix)
print("\nTrue Positive (TP):", TP)
print("True Negative (TN):", TN)
print("False Positive (FP):", FP)
print("False Negative (FN):", FN)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)