# **1. Import Library**

Pada tahap ini, Anda perlu mengimpor beberapa pustaka (library) Python yang dibutuhkan untuk analisis data dan pembangunan model machine learning.

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# **2. Memuat Dataset dari Hasil Clustering**

Memuat dataset hasil clustering dari file CSV ke dalam variabel DataFrame.

In [76]:
from google.colab import drive

drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Submission Dicoding/hasil_clustering_frauds_detection.csv'
df = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
df.head()

Unnamed: 0,TransactionID,AccountID,TransactionAmount,TransactionDate,TransactionType,Location,DeviceID,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,Cluster_DBSCAN
0,TX000001,AC00128,14.09,2023-04-11 16:29:14,Debit,San Diego,D000380,162.198.218.92,M015,ATM,70,Doctor,81,1,5112.21,2024-11-04 08:08:08,-1
1,TX000002,AC00455,376.24,2023-06-27 16:44:19,Debit,Houston,D000051,13.149.61.4,M052,ATM,68,Doctor,141,1,13758.91,2024-11-04 08:09:35,-1
2,TX000003,AC00019,126.29,2023-07-10 18:16:08,Debit,Mesa,D000235,215.97.143.157,M009,Online,19,Student,56,1,1122.35,2024-11-04 08:07:04,-1
3,TX000004,AC00070,184.5,2023-05-05 16:32:11,Debit,Raleigh,D000187,200.13.225.150,M002,Online,26,Student,25,1,8569.06,2024-11-04 08:09:06,-1
4,TX000005,AC00411,13.45,2023-10-16 17:51:24,Credit,Atlanta,D000308,65.164.3.100,M091,Online,26,Student,198,1,7429.4,2024-11-04 08:06:39,-1


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282 entries, 0 to 2281
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            2282 non-null   object 
 1   AccountID                2282 non-null   object 
 2   TransactionAmount        2282 non-null   float64
 3   TransactionDate          2282 non-null   object 
 4   TransactionType          2282 non-null   object 
 5   Location                 2282 non-null   object 
 6   DeviceID                 2282 non-null   object 
 7   IP Address               2282 non-null   object 
 8   MerchantID               2282 non-null   object 
 9   Channel                  2282 non-null   object 
 10  CustomerAge              2282 non-null   int64  
 11  CustomerOccupation       2282 non-null   object 
 12  TransactionDuration      2282 non-null   int64  
 13  LoginAttempts            2282 non-null   int64  
 14  AccountBalance          

In [79]:
X = df.drop(columns='Cluster_DBSCAN')
y = df['Cluster_DBSCAN']
y

Unnamed: 0,Cluster_DBSCAN
0,-1
1,-1
2,-1
3,-1
4,-1
...,...
2277,-1
2278,-1
2279,-1
2280,-1


In [80]:
# Group 3 cluster anomaly (0, 1 , 2) menjadi 1 saja
def regroup_cluster(Cluster_DBSCAN):
    if Cluster_DBSCAN == -1:
        return 0
    else:
        return 1

y_regroup = y.apply(regroup_cluster).values

In [81]:
print("SEBELUM REGROUP:")
print(y.value_counts().sort_index())

print("\nSESUDAH REGROUP:")
unique, counts = np.unique(y_regroup, return_counts=True)
for val, count in zip(unique, counts):
    print(f"Label {val}: {count} ({count/len(y_regroup)*100:.2f}%)")

SEBELUM REGROUP:
Cluster_DBSCAN
-1    2103
 0      59
 1      66
 2      54
Name: count, dtype: int64

SESUDAH REGROUP:
Label 0: 2103 (92.16%)
Label 1: 179 (7.84%)


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y_regroup, test_size=0.2, random_state=42)

# drop beberapa fitur yang nilainya unik
X_train = X_train.drop(columns=['TransactionID', 'AccountID', 'TransactionDate', 'DeviceID', 'IP Address', 'MerchantID', 'PreviousTransactionDate'])
X_test = X_test.drop(columns=['TransactionID', 'AccountID', 'TransactionDate', 'DeviceID', 'IP Address', 'MerchantID', 'PreviousTransactionDate'])

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1825, 9) (457, 9) (1825,) (457,)


In [83]:
# Encoding data kategorikal
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_cols),
        ('cat', OneHotEncoder(), cat_cols)
    ],
    remainder='drop'
)

X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

print("Setelah encoding:")
print("X_train_enc shape:", X_train_enc.shape)

Setelah encoding:
X_train_enc shape: (1825, 57)


In [84]:
# Menggunakan Model Random Forest yg tidak memerlukan scaling dan robust terhadap outlier
model = RandomForestClassifier(
    n_estimators=100,       #jumlah pohon yang diuji coba dengan subsample ada 100
    max_depth=10,           #jumlah kedalaman tiap pohon ada 10, ini hyperparameter yang umum agar tidak overfit
    min_samples_split=2,    #ukuran split tiap node dalam pohon sebanyak 2 kali
    min_samples_leaf=1,     #ukuran minimum leaf node
    random_state=42,        #langkah umum untuk memastikan hasil konsisten
    max_features = 'sqrt',  #fitur yang digunakan dalam node-node tiap pohon dibatasi sebanyak 'sqrt' dari total fitur
    bootstrap =True         #memastikan tiap pohon mendapat subsample yang berbeda
)

model.fit(X_train_enc, y_train)  #melatih data fitur x (independent variabel) dan y (dependent variabel)
y_pred = model.predict(X_test_enc)  #prediksi data fitur x

In [85]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       431
           1       0.00      0.00      0.00        26

    accuracy                           0.94       457
   macro avg       0.47      0.50      0.49       457
weighted avg       0.89      0.94      0.92       457



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [86]:
# Model Support Vector Machine
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__degree': [2, 3, 4]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train_enc, y_train)
best_params = grid_search.best_params_
print(f'Model terbaik: {best_params}')

ValueError: 
All the 1080 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1080 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/joblib/memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_data.py", line 894, in fit
    return self.partial_fit(X, y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/sklearn/preprocessing/_data.py", line 959, in partial_fit
    raise ValueError(
ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.


In [None]:
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))