# Feature Engineering untuk Prediksi Gagal Panen

Notebook ini menjelaskan proses feature engineering yang dilakukan pada data panen dan cuaca untuk mempersiapkan data training model GRU.


In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import data_processing as dp
import config

# Notebook-wide plotting defaults: apply the matplotlib stylesheet first,
# then set the seaborn color palette (kept in this order).
plt.style.use(style='seaborn-v0_8')
sns.set_palette(palette="husl")


## 1. Memuat Data


In [None]:
def _show_frame(title, frame):
    """Print *title*, then the first rows, shape, and column names of *frame*."""
    print(title)
    print(frame.head())
    print(f"\nShape: {frame.shape}")
    print(f"\nKolom: {frame.columns.tolist()}")


# Load the harvest and weather tables through the project's CSV loader.
df_harvest, df_weather = dp.load_data_from_csv()

_show_frame("Data Panen:", df_harvest)

# Visual separator between the two previews.
print("\n" + "=" * 60)
_show_frame("\nData Cuaca:", df_weather)


## 2. Pembersihan Data Panen


In [None]:
# Short, stable column names used by the rest of the notebook.
column_aliases = {
    config.TARGET_COLUMN: "Produktivitas",
    config.REGION_COLUMN: "Wilayah",
    "Luas Panen Tanaman Padi (ha) (Ha)": "LuasPanen",
}
numeric_columns = ["Produktivitas", "LuasPanen"]

df_harvest_clean = df_harvest.rename(columns=column_aliases)

# Normalize the number formatting of each numeric column
# (e.g. "54 987,79" -> 54987.79) using the project's helper.
for column_name in numeric_columns:
    raw_values = df_harvest_clean[column_name]
    df_harvest_clean[column_name] = raw_values.apply(dp._clean_numeric_string)

# Discard rows where either numeric column is missing.
df_harvest_clean = df_harvest_clean.dropna(subset=numeric_columns)

print("Data Panen Setelah Pembersihan:")
print(df_harvest_clean.head())
print(f"\nShape: {df_harvest_clean.shape}")
print(f"\nStatistik Deskriptif:")
summary_columns = ["Produktivitas", "LuasPanen", "Tahun"]
print(df_harvest_clean[summary_columns].describe())


## 3. Pembuatan Label (Target Variable)


In [None]:
def _region_z_score(values):
    """Return population z-scores (ddof=0) of one region's productivity values.

    When the region has no spread (for example a single observation), the
    z-score is undefined (0/0). In that case every value is treated as being
    exactly at the regional mean and 0.0 is returned, so that no NaN values
    reach the histogram below.
    """
    spread = values.std(ddof=0)
    if not spread > 0:  # also true when spread is NaN
        return pd.Series(0.0, index=values.index)
    return (values - values.mean()) / spread


# Z-score of productivity, computed separately for every region.
df_harvest_clean['z_score'] = (
    df_harvest_clean.groupby('Wilayah')['Produktivitas'].transform(_region_z_score)
)

# Label 1 (crop failure) when the z-score falls below the configured threshold.
df_harvest_clean['GagalPanen'] = (df_harvest_clean['z_score'] < config.Z_SCORE_THRESHOLD).astype(int)

print("Distribusi Label:")
print(df_harvest_clean['GagalPanen'].value_counts())
print(f"\nPersentase Gagal Panen: {df_harvest_clean['GagalPanen'].mean()*100:.2f}%")

# Visualize the z-score distribution and the resulting label balance.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(df_harvest_clean['z_score'], bins=50, edgecolor='black', alpha=0.7)
plt.axvline(x=config.Z_SCORE_THRESHOLD, color='r', linestyle='--', label=f'Threshold: {config.Z_SCORE_THRESHOLD}')
plt.xlabel('Z-Score Produktivitas')
plt.ylabel('Frekuensi')
plt.title('Distribusi Z-Score Produktivitas')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
# value_counts() orders by frequency, so pin the order to [0, 1]; otherwise the
# colors and tick labels below would be attached to the wrong class whenever
# class 1 is the more frequent one or one of the classes is absent.
label_counts = df_harvest_clean['GagalPanen'].value_counts().reindex([0, 1], fill_value=0)
label_counts.plot(kind='bar', color=['green', 'red'])
plt.xlabel('Gagal Panen')
plt.ylabel('Jumlah')
plt.title('Distribusi Label Gagal Panen')
plt.xticks([0, 1], ['Normal', 'Gagal Panen'], rotation=0)
plt.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## 4. Feature Engineering Data Cuaca


In [None]:
# Use the shared "Wilayah" region name and parse the date column.
date_col = config.DATE_COLUMN
df_weather_clean = df_weather.rename(columns={config.REGION_COLUMN: "Wilayah"})
df_weather_clean[date_col] = pd.to_datetime(df_weather_clean[date_col])

print("Data Cuaca Setelah Pembersihan:")
print(df_weather_clean.head())
print(f"\nShape: {df_weather_clean.shape}")

# Inspect the most frequent raw values of the extreme-weather column.
event_text = df_weather_clean[config.WEATHER_EVENT_COLUMN]
print("\nNilai unik Cuaca Ekstrem:")
print(event_text.value_counts().head(10))

# Multi-label one-hot encoding: events are separated by ', ' and
# impacts by ' / ' inside a single cell.
impact_text = df_weather_clean[config.WEATHER_IMPACT_COLUMN]
df_weather_events = event_text.str.get_dummies(sep=', ')
df_weather_impacts = impact_text.str.get_dummies(sep=' / ')

n_event_features = df_weather_events.shape[1]
n_impact_features = df_weather_impacts.shape[1]
print(f"\nJumlah fitur cuaca ekstrem: {n_event_features}")
print(f"Jumlah fitur dampak: {n_impact_features}")

# Put the key columns and both indicator tables side by side.
key_columns = df_weather_clean[['Wilayah', date_col]]
df_weather_proc = pd.concat(
    [key_columns, df_weather_events, df_weather_impacts],
    axis=1,
)

print(f"\nShape setelah encoding: {df_weather_proc.shape}")


## 5. Agregasi Temporal (Harian ke Mingguan)


In [None]:
# Aggregate the daily indicator columns per region into the configured
# resampling period by summing them, then derive the year of each period.
df_weather_weekly = (
    df_weather_proc
    .set_index(config.DATE_COLUMN)
    .groupby('Wilayah')
    .resample(config.TIME_AGGREGATION_RULE)
    .sum(numeric_only=True)
    .reset_index()
)
df_weather_weekly['Tahun'] = df_weather_weekly[config.DATE_COLUMN].dt.year

print("Data Cuaca Setelah Agregasi Mingguan:")
print(df_weather_weekly.head())
print(f"\nShape: {df_weather_weekly.shape}")

# Visualize the number of extreme-weather events per period for one region
# (the region of the first row).
sample_region = df_weather_weekly['Wilayah'].iloc[0]
sample_data = df_weather_weekly[df_weather_weekly['Wilayah'] == sample_region].set_index(config.DATE_COLUMN)

# Take the first five indicator columns for the plot.
weather_cols = [col for col in df_weather_weekly.columns if col not in ['Wilayah', config.DATE_COLUMN, 'Tahun']][:5]

# Create exactly one figure and draw on its axes. Calling plt.figure() and then
# DataFrame.plot(figsize=...) would open a second figure and leave the first
# one empty in the notebook output.
fig, ax = plt.subplots(figsize=(14, 6))
sample_data[weather_cols].plot(kind='area', stacked=True, alpha=0.7, ax=ax)
ax.set_title(f'Kejadian Cuaca Ekstrem per Minggu - {sample_region}')
ax.set_xlabel('Tanggal')
ax.set_ylabel('Jumlah Kejadian')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


## 6. Penggabungan Data Panen dan Cuaca


In [None]:
# Attach the yearly harvest rows to every weekly weather row of the same
# region and year. Weather rows without a matching harvest row are kept
# (left join) and get NaN in the harvest columns.
df_merged = pd.merge(
    df_weather_weekly,
    df_harvest_clean,
    on=['Wilayah', 'Tahun'],
    how='left'
)

# Fill gaps in the harvest columns forward, then backward, strictly within each
# region. Both fills run inside transform(): chaining .bfill() after
# groupby(...).ffill() would operate on the ungrouped result and copy values
# from the next region into a region that has no harvest data at its start.
cols_to_fill = ['LuasPanen', 'GagalPanen']
df_merged[cols_to_fill] = (
    df_merged.groupby('Wilayah')[cols_to_fill]
    .transform(lambda column: column.ffill().bfill())
)

# Regions with no harvest data at all remain NaN after filling; drop them.
df_merged = df_merged.dropna(subset=cols_to_fill)

print("Data Setelah Penggabungan:")
print(df_merged.head())
print(f"\nShape: {df_merged.shape}")
print(f"\nWilayah yang ada: {df_merged['Wilayah'].nunique()}")
print(f"Rentang tahun: {df_merged['Tahun'].min()} - {df_merged['Tahun'].max()}")


## 7. Normalisasi Fitur


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Drop identifier columns and everything derived from the target. 'GagalPanen'
# is the label itself, so it must not be part of the model input; keeping it
# would leak the answer into the features. It is still read from df_merged
# when the labels are built.
cols_to_drop = [
    'Wilayah',
    config.DATE_COLUMN,
    'Tahun',
    'Produktivitas',
    'Rekap Produksi Padi (ton)',
    'z_score',
    'GagalPanen',
]
features_df = df_merged.drop(columns=cols_to_drop, errors='ignore')

print("Fitur yang digunakan:")
print(features_df.columns.tolist())
print(f"\nJumlah fitur: {len(features_df.columns)}")

# Scale every feature column to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = scaler.fit_transform(features_df)

print(f"\nShape setelah normalisasi: {scaled_features.shape}")
print(f"Range nilai: [{scaled_features.min():.3f}, {scaled_features.max():.3f}]")

# Histograms of the first (up to six) features BEFORE normalization; the
# scaled values are summarized by the range printed above.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
sample_cols = features_df.columns[:6]

for ax, col in zip(axes.flat, sample_cols):
    ax.hist(features_df[col].values, bins=30, alpha=0.7, color='blue', edgecolor='black')
    ax.set_title(f'{col}\n(Sebelum Normalisasi)')
    ax.set_xlabel('Nilai')
    ax.set_ylabel('Frekuensi')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 8. Persiapan Data untuk Model GRU (Time Series Windowing)


In [None]:
from tensorflow.keras.utils import timeseries_dataset_from_array

# Build sliding-window sequences over the scaled feature matrix.
# NOTE(review): windows are cut from the row order of scaled_features, so a
# window can span rows of more than one 'Wilayah' where regions meet. Per the
# Keras documentation, targets[i] is paired with the window STARTING at row i.
# Confirm both are intended before training.
labels = df_merged['GagalPanen'].values
dataset = timeseries_dataset_from_array(
    data=scaled_features,
    targets=labels,
    sequence_length=config.SEQUENCE_LENGTH,
    sequence_stride=config.SEQUENCE_STRIDE,
    batch_size=config.BATCH_SIZE,
    shuffle=True
)

# Inspect the shape of one batch.
sample_batch = next(iter(dataset))
X_sample, y_sample = sample_batch

print("Bentuk Dataset:")
print(f"  Input shape (X): {X_sample.shape}")
print(f"  Target shape (y): {y_sample.shape}")
print(f"  Sequence length: {config.SEQUENCE_LENGTH} minggu")
print(f"  Number of features: {X_sample.shape[2]}")

# Count the sequences batch by batch: the last batch may be smaller than
# BATCH_SIZE, so "number of batches * BATCH_SIZE" would overcount.
total_samples = sum(int(batch_targets.shape[0]) for _, batch_targets in dataset)
print(f"\nTotal samples dalam dataset: {total_samples}")

# Plot up to four sequences from the sample batch. Positions 0, 5, 10, 15 are
# used when the batch is large enough; smaller batches fall back to a smaller
# step so that no position is out of range.
batch_len = int(X_sample.shape[0])
n_features = int(X_sample.shape[2])
step = 5 if batch_len >= 16 else max(batch_len // 4, 1)
seq_positions = [i * step for i in range(4) if i * step < batch_len]
feature_indices = range(min(4, n_features))  # first (up to) four features

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
panels = axes.ravel()
for ax, seq_idx in zip(panels, seq_positions):
    # Pull one sequence and its label out of the batch.
    X_seq = X_sample[seq_idx].numpy()
    y_label = y_sample[seq_idx].numpy()

    for feat_idx in feature_indices:
        ax.plot(X_seq[:, feat_idx], alpha=0.7, label=f'Fitur {feat_idx}')

    ax.set_title(f'Sequence {seq_idx} - Label: {"Gagal Panen" if y_label == 1 else "Normal"}')
    ax.set_xlabel('Time Step (Minggu)')
    ax.set_ylabel('Nilai Normalisasi')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide panels that received no sequence (only happens for very small batches).
for ax in panels[len(seq_positions):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

print("\n" + "="*60)
print("FEATURE ENGINEERING SELESAI!")
print("="*60)
print(f"\nData siap untuk training model dengan:")
print(f"  - Sequence length: {config.SEQUENCE_LENGTH} minggu")
print(f"  - Number of features: {X_sample.shape[2]}")
print(f"  - Batch size: {config.BATCH_SIZE}")
