In [95]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

## Import Dataset

In [107]:
# take dataset from github
data_url = "https://raw.githubusercontent.com/propixxx/anomaly_detection/refs/heads/main/dummy_data_properti.csv"


In [108]:
# import dataset
data = pd.read_csv(data_url)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Kategori,Harga,LT,LB,Sertifikat
0,Rumah,2019600000,283,278,Hak Guna Bangunan (HGB)
1,Gedung,907200000,58,50,Hak Guna Bangunan (HGB)
2,Apartemen,822000000,0,137,Hak Guna Bangunan (HGB)
3,Kost,789600000,122,66,Lainnya
4,Apartemen,408000000,0,68,Lainnya


## Preprocess Data

In [98]:
## how to get dataframe column
df.columns

Index(['Kategori', 'Harga', 'LT', 'LB', 'Sertifikat'], dtype='object')

In [99]:
from sklearn.preprocessing import LabelEncoder

# # Inisialisasi LabelEncoder
# # Lakukan label encoding untuk setiap kolom secara terpisah
encoders = {}
for col in ['Kategori', 'Sertifikat']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

joblib.dump(encoders, "label_encoders.pkl")


['label_encoders.pkl']

In [100]:
df.head()

Unnamed: 0,Kategori,Harga,LT,LB,Sertifikat
0,4,2019600000,283,278,0
1,1,907200000,58,50,0
2,0,822000000,0,137,0
3,2,789600000,122,66,2
4,0,408000000,0,68,2


# Model

## Gaussian Mixture

In [101]:
import numpy as np
import joblib
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

# Normalisasi data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)

# Simpan scaler untuk digunakan nanti
joblib.dump(scaler, 'scaler.pkl')

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
log_likelihoods = []

# Loop melalui setiap fold
for train_index, val_index in kf.split(df):
    train_data, val_data = df.iloc[train_index], df.iloc[val_index]

    # Melatih GMM pada training fold
    gmm = GaussianMixture(n_components=5, covariance_type='full', random_state=42)
    gmm.fit(train_data)

    # Hitung log-likelihood pada validation fold
    log_likelihoods.extend(gmm.score_samples(val_data))

# Menentukan threshold sebagai percentile ke-5 dari semua log-likelihood CV
threshold = np.percentile(log_likelihoods, 5)

# Latih model akhir pada seluruh data dan simpan
gmm_final = GaussianMixture(n_components=3, covariance_type='full', random_state=42)
gmm_final.fit(data_scaled)
joblib.dump(gmm_final, 'gmm_model.pkl')
joblib.dump(threshold, 'threshold.pkl')

print(f"Threshold optimal berdasarkan CV: {threshold}")
print("Model dan threshold telah disimpan.")


Threshold optimal berdasarkan CV: -35.85404851589889
Model dan threshold telah disimpan.


In [102]:
# import numpy as np
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt
# from sklearn.mixture import GaussianMixture


# def plot_gmm_scatter(X_train, covariance_types):
#     # Create a figure with subplots for each covariance type
#     # fig, axes = plt.subplots(2, 2, figsize=(10, 4))

#     # Create and fit a GMM model for each covariance type '''axes.ravel()'''
#     for cov_type in (covariance_types ):
#         gmm = GaussianMixture(n_components=5, n_init=10, covariance_type=cov_type)
#         gmm.fit(X_train)

#         # Get the predicted labels
#         y_train_pred = gmm.predict(X_train)

#         # Plot
#         # sns.scatterplot(x=X_train[:, 0], y=X_train[:, 1], hue=y_train_pred, palette='deep', alpha=0.8, ax=ax)
#         # ax.set_title(f'Scatter plot - Covariance Type: {cov_type}')
#         # ax.set_xlabel('x0')
#         # ax.set_ylabel('x1')
#         # ax.legend(title='Class', loc='upper right')

#     # Adjust spacing between subplots
#     # plt.tight_layout()
#     # plt.show()

In [103]:
# covariance_types = ['full', 'tied', 'diag', 'spherical']

# # Call the function to generate scatter plots
# plot_gmm_scatter(X_train, covariance_types)

# # Convert training data to a Pandas DataFrame for display
# X_train_pd = pd.DataFrame(X_train)
# print('\nDataset')
# X_train_pd.head()

In [104]:
# # JOBLIB
# import joblib

In [105]:
# # Hitung probabilitas log-likelihood setiap sampel
# log_likelihood = gmm.score_samples(df)

# # Menentukan threshold anomali (misal, ambil kuantil ke-5)
# threshold = np.percentile(log_likelihood, 5)

# joblib.dump(gmm, 'gmm_model.pkl')
# joblib.dump(scaler, 'scaler.pkl')
# joblib.dump(threshold, 'threshold.pkl')

In [106]:
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder

# Contoh data kategori sebelum encoding (diambil dari training data)
categorical_columns = ["Kategori", "Sertifikat"]  # Misalnya, kolom ke-0 dan ke-2 adalah kategori

# Load encoder yang telah disimpan dari training
encoders = joblib.load("label_encoders.pkl")
scaler = joblib.load("scaler.pkl")
gmm = joblib.load("gmm_model.pkl")
threshold = joblib.load("threshold.pkl")

def preprocess_input(new_input, categorical_columns, encoders):
    """
    - Melakukan label encoding untuk kolom kategori dalam new_input menggunakan encoder yang telah disimpan.
    - Melakukan normalisasi menggunakan scaler yang telah disimpan.
    - Mengembalikan data yang sudah siap diproses oleh model.
    """
    new_input = np.array(new_input).reshape(1, -1)  # Pastikan bentuk array (1, n_features)

    # Lakukan label encoding untuk kolom kategori
    for col in categorical_columns:
        encoder = encoders[col]  # Ambil encoder yang digunakan pada training
        if col == "Kategori":
            new_input[:, 0] = encoder.transform(new_input[:, 0])  # Gunakan transform() agar mapping tetap sama
        else:
            new_input[:, 4] = encoder.transform(new_input[:, 4])  # Gunakan transform() agar mapping tetap sama

    # Lakukan normalisasi
    new_input_scaled = scaler.transform(new_input.astype(float))  # Pastikan dalam bentuk float sebelum transformasi
    return new_input_scaled

def check_anomaly(new_input):
    """
    Mengecek apakah new_input merupakan anomali atau tidak.
    """
    new_input_scaled = preprocess_input(new_input, categorical_columns, encoders)
    log_likelihood = gmm.score_samples(new_input_scaled)[0]
    return "Anomali" if log_likelihood < threshold else "Normal"

# Contoh input baru (kategori masih dalam bentuk string)
new_input = ["Kost", 18000000, 100, 66, "Lainnya"]  # Kolom 0 dan 2 adalah kategori

# Cek apakah input merupakan anomali atau tidak
result = check_anomaly(new_input)
print(f"Input {new_input} diklasifikasikan sebagai: {result}")


Input ['Kost', 18000000, 100, 66, 'Lainnya'] diklasifikasikan sebagai: Normal


