# Data Preprocessing

Pada tahapan ini dilakukan preprocessing data agar dataset siap digunakan untuk pemodelan.

## Import Dataset

In [1]:
import pickle

# Load the pickled dataset from disk into `df`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open('../data/dataset.pkl', mode='rb') as dataset_file:
    df = pickle.load(dataset_file)

print("Dataset berhasil dibaca, contoh data:")
df.head()

Dataset berhasil dibaca, contoh data:


Unnamed: 0,latitude,longitude,date,label,weather_code,temperature_2m_mean,temperature_2m_max,temperature_2m_min,apparent_temperature_mean,apparent_temperature_max,...,sunset,daylight_duration,sunshine_duration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,tanggal,gi,jarak_ke_beban
0,-6.92195,108.68166,31/12/2023 17:00,0,63,27.5927,33.7365,23.8365,32.8545,40.6505,...,1704107036,45059.2,36187.2,12.9,12.9,0,8,31/12/2023,GI BABAKAN,10.06346
1,-6.92195,108.68166,01/01/2024 17:00,0,63,26.999,32.3865,23.8365,32.7597,37.9859,...,1704193462,45053.1,30011.6,12.4,12.4,0,16,01/01/2024,GI BABAKAN,10.06346
2,-6.92195,108.68166,02/01/2024 17:00,0,63,26.3761,30.8365,23.5365,31.8353,37.4942,...,1704279888,45046.5,29641.9,8.2,8.2,0,3,02/01/2024,GI BABAKAN,10.06346
3,-6.92195,108.68166,03/01/2024 17:00,0,65,25.3427,29.5365,22.9865,30.7851,35.1786,...,1704366313,45039.4,19045.9,36.3,36.3,0,15,03/01/2024,GI BABAKAN,10.06346
4,-6.92195,108.68166,04/01/2024 17:00,0,63,26.4928,31.9865,23.6365,31.7316,37.8391,...,1704452737,45031.8,33153.9,15.1,15.1,0,17,04/01/2024,GI BABAKAN,10.06346


## Clustering Lokasi

In [2]:
import numpy as np
import hdbscan
import pandas as pd

# Cluster each distinct (latitude, longitude) pair once instead of every row;
# the resulting label is joined back onto `df` at the end of the cell.
df_unique_coords = df[['latitude', 'longitude']].drop_duplicates().reset_index(drop=True)

# The haversine metric works on [lat, lon] pairs expressed in radians.
coords_rad = np.radians(df_unique_coords[['latitude', 'longitude']].values)

clusterer = hdbscan.HDBSCAN(
    metric='haversine',
    min_cluster_size=5,
    min_samples=3,
    core_dist_n_jobs=-1
)

labels = clusterer.fit_predict(coords_rad)

df_unique_coords['cluster'] = labels

# Points labelled -1 are noise. Rather than lumping them into one pseudo
# cluster, give every noise point its own id, numbered after the largest
# real cluster id.
noise_mask = df_unique_coords['cluster'] == -1
noise_count = noise_mask.sum()

max_cluster_id = df_unique_coords['cluster'].max()
new_ids = range(max_cluster_id + 1, max_cluster_id + 1 + noise_count)

df_unique_coords.loc[noise_mask, 'cluster'] = new_ids

# Remove a 'cluster' column left over from a previous run of this cell before
# merging. Without this, re-running the cell makes merge() emit
# 'cluster_x'/'cluster_y' and later lookups of df['cluster'] fail.
df = df.drop(columns='cluster', errors='ignore').merge(
    df_unique_coords, on=['latitude', 'longitude'], how='left'
)



In [3]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap keeps the same (name, lut) call signature.
import matplotlib.pyplot as plt

df_plot = df_unique_coords.copy()

# Resample 'tab20' to one entry per cluster and map cluster ids onto [0, 1].
num_clusters = df_plot['cluster'].nunique()
colormap = plt.get_cmap('tab20', num_clusters)
norm = colors.Normalize(vmin=0, vmax=num_clusters - 1)

# Centre the map on the mean coordinate of all unique locations.
map_center = [df_plot['latitude'].mean(), df_plot['longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=6)

# itertuples avoids building a Series per row (as iterrows does).
for row in df_plot.itertuples(index=False):
    cluster_id = int(row.cluster)

    # Grey is a fallback for any point still labelled as noise (-1).
    if cluster_id == -1:
        color = "#999999"
    else:
        color = colors.rgb2hex(colormap(norm(cluster_id)))

    folium.CircleMarker(
        location=[row.latitude, row.longitude],
        radius=3,
        color=color,
        fill=True,
        fill_opacity=0.6,
        popup=f"Cluster: {cluster_id}"
    ).add_to(m)

m

  colormap = cm.get_cmap('tab20', num_clusters)


## Drop Missing Values

In [4]:
# Remove every row that has at least one missing value, then print the
# per-column null counts of the result as a sanity check.
df = df.dropna(axis=0, how='any')
print("Jumlah Data Null:", df.isna().sum())

Jumlah Data Null: latitude                       0
longitude                      0
date                           0
label                          0
weather_code                   0
temperature_2m_mean            0
temperature_2m_max             0
temperature_2m_min             0
apparent_temperature_mean      0
apparent_temperature_max       0
apparent_temperature_min       0
wind_speed_10m_max             0
wind_gusts_10m_max             0
wind_direction_10m_dominant    0
shortwave_radiation_sum        0
et0_fao_evapotranspiration     0
sunrise                        0
sunset                         0
daylight_duration              0
sunshine_duration              0
precipitation_sum              0
rain_sum                       0
snowfall_sum                   0
precipitation_hours            0
tanggal                        0
gi                             0
jarak_ke_beban                 0
cluster                        0
dtype: int64


## Drop Duplicates

In [5]:
# Keep the first occurrence of each fully identical row, then confirm that no
# duplicated rows remain.
df = df.drop_duplicates(keep='first')
remaining_duplicates = df.duplicated().sum()
print("Jumlah Data Duplikat:", remaining_duplicates)

Jumlah Data Duplikat: 0


In [7]:
# Keep the int64/float64 columns whose absolute correlation with `label`
# exceeds `threshold`.
# NOTE(review): the correlation is computed on the full `df`, before the
# train/test split done in a later cell, so held-out rows influence which
# features are kept.
threshold = 0.05
df_numerik = df.select_dtypes(include=['int64', 'float64'])
correlation_matrix = df_numerik.corr()

label_corr = correlation_matrix["label"].abs()
high_corr_features = label_corr[label_corr > threshold].index.tolist()
# `label` correlates perfectly with itself, so strip it from the result.
high_corr_features.remove("label")
print(high_corr_features)

['wind_direction_10m_dominant', 'shortwave_radiation_sum', 'et0_fao_evapotranspiration', 'daylight_duration', 'sunshine_duration', 'precipitation_sum', 'rain_sum', 'precipitation_hours']


## Splitting Dataset

In [9]:
# Model inputs: the eight columns printed by the correlation filter, plus
# 'jarak_ke_beban' and the location 'cluster' id added by hand.
# NOTE(review): 'cluster' is a nominal id but is treated as a numeric feature
# by the scaling and resampling cells further down - confirm this is intended.
selected_features = [
    'wind_direction_10m_dominant',
    'shortwave_radiation_sum',
    'et0_fao_evapotranspiration',
    'daylight_duration',
    'sunshine_duration',
    'precipitation_sum',
    'rain_sum',
    'precipitation_hours',
    'jarak_ke_beban',
    'cluster',
]

# Feature matrix (an independent copy of the selected columns) and target.
X = df.loc[:, selected_features].copy()
y = df['label']

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on `y` keeps the class
# proportions in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [11]:
import os
import pickle

import pandas as pd
from sklearn.preprocessing import PowerTransformer

# Columns to transform with Yeo-Johnson.
yeojohnson_cols = ['precipitation_sum', 'rain_sum']

pt = PowerTransformer(method='yeo-johnson')

# Work on explicit copies: the frames returned by train_test_split are derived
# from `X`, and assigning columns on them can raise SettingWithCopyWarning.
# NOTE: this cell overwrites the columns in X_train/X_test, so re-running it
# without re-running the split cell applies the transform a second time.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training rows only, then apply the fitted transform to the test
# rows so no test-set statistics leak into the transformer.
X_train[yeojohnson_cols] = pt.fit_transform(X_train[yeojohnson_cols])
X_test[yeojohnson_cols] = pt.transform(X_test[yeojohnson_cols])

# Persist the fitted transformer together with the column list it applies to.
to_save = {
    'power_transformer': pt,
    'yeojohnson_cols': yeojohnson_cols
}

# Create the output directory if needed so the cell also works on a fresh
# checkout where '../transform' does not exist yet.
transformer_path = '../transform/power_transformer.pkl'
os.makedirs(os.path.dirname(transformer_path), exist_ok=True)
with open(transformer_path, 'wb') as f:
    pickle.dump(to_save, f)

# Inspect the result.
print(X_train.head())
print(X_test.head())
print("PowerTransformer berhasil disimpan ke 'power_transformer.pkl' ✅")

        wind_direction_10m_dominant  shortwave_radiation_sum  \
228852                      35.3949                    15.45   
370396                     342.6130                    20.07   
496452                     168.6900                    20.89   
217356                     223.8770                    23.06   
19252                      109.7290                    20.25   

        et0_fao_evapotranspiration  daylight_duration  sunshine_duration  \
228852                     3.07992            43076.6            25455.7   
370396                     3.95461            45031.8            31527.6   
496452                     3.97736            42260.8            39062.1   
217356                     4.64791            44743.4            39892.7   
19252                      4.03741            42661.5            39753.8   

        precipitation_sum  rain_sum  precipitation_hours  jarak_ke_beban  \
228852           1.165091  1.165091                   16       30.506635   
370396

## Robust Scaler

In [12]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only and reuse the fitted
# statistics for the test features.
scaler = RobustScaler()
X_train_scaled_array = scaler.fit_transform(X_train)
X_test_scaled_array = scaler.transform(X_test)

# Wrap the scaled output back into DataFrames, restoring the original column
# names and row index.
X_train_scaled = pd.DataFrame(
    data=X_train_scaled_array,
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    data=X_test_scaled_array,
    index=X_test.index,
    columns=X_test.columns,
)

# Persist the fitted scaler together with the training column names.
scaler_save = dict(
    scaler=scaler,
    feature_names=list(X_train.columns),
)

with open('../transform/robust_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler_save, scaler_file)

print("✅ RobustScaler dan nama kolom berhasil disimpan ke 'robust_scaler.pkl'.")

✅ RobustScaler dan nama kolom berhasil disimpan ke 'robust_scaler.pkl'.


## Balancing Data

In [13]:
from imblearn.over_sampling import SMOTE

# Resample the scaled training set with SMOTE (fixed seed); only training
# data is passed in, the test set is not resampled.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train_scaled,
    y_train,
)

In [14]:
from imblearn.over_sampling import ADASYN

# Resample the scaled training set with ADASYN (fixed seed) as an alternative
# balanced training set.
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(
    X_train_scaled,
    y_train,
)

In [15]:
from imblearn.combine import SMOTEENN

# Resample the scaled training set with the combined SMOTEENN resampler
# (fixed seed) as a third balanced training set.
smote_enn = SMOTEENN(random_state=42)
X_train_enn, y_train_enn = smote_enn.fit_resample(
    X_train_scaled,
    y_train,
)

In [16]:
import pickle

# Every artefact to persist, paired with its destination path. The order
# matches the order in which the files are written.
datasets_to_save = [
    ('../data/X_train.pkl', X_train),
    ('../data/y_train.pkl', y_train),
    ('../data/X_train_scaled.pkl', X_train_scaled),
    ('../data/X_train_smote.pkl', X_train_smote),
    ('../data/y_train_smote.pkl', y_train_smote),
    ('../data/X_train_adasyn.pkl', X_train_adasyn),
    ('../data/y_train_adasyn.pkl', y_train_adasyn),
    ('../data/X_train_enn.pkl', X_train_enn),
    ('../data/y_train_enn.pkl', y_train_enn),
    ('../data/X_test_scaled.pkl', X_test_scaled),
    ('../data/y_test.pkl', y_test),
]

# Pickle each object to its own file.
for output_path, dataset in datasets_to_save:
    with open(output_path, 'wb') as f:
        pickle.dump(dataset, f)

print("Semua dataset berhasil disimpan!")

Semua dataset berhasil disimpan!
