In [None]:
import os
import sys
sys.path.append('..')

from warnings import filterwarnings
filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pytesseract
# Location of the Tesseract executable used by pytesseract. Defaults to the
# previously hard-coded Windows path; set the TESSERACT_CMD environment
# variable to point at a different binary without editing this cell.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [None]:
# Read the four external monthly datasets in one pass; the tuple order on the
# left matches the path order on the right.
dipole_mode_index, air_quality_index, oceanic_nino_index, relative_humidity = map(
    pd.read_csv,
    (
        '../data/raw/Data Eksternal/Dipole Mode Index (DMI).csv',
        '../data/raw/Data Eksternal/AirQualityIndex_Google Trends.csv',
        '../data/raw/Data Eksternal/OceanicNinoIndex (ONI).csv',
        '../data/raw/Data Eksternal/RelativeHumidityMonthlyMean.csv',
    ),
)

In [None]:
print("Memulai pra-pemrosesan 4 file data eksternal...")

# --- 1. DMI preprocessing ---
print("Memproses DMI...")
dmi = dipole_mode_index.copy()

# The second column holds the index value; give it a stable name.
dmi = dmi.rename(columns={dmi.columns[1]: 'dmi_value'})
# Treat -9999 as missing and carry the last valid value forward. The result is
# assigned back to the column: calling replace(..., inplace=True) on a column
# selection is not guaranteed to modify the parent frame (pandas Copy-on-Write).
dmi['dmi_value'] = dmi['dmi_value'].replace(-9999, np.nan).ffill()

# Build the 'tahun_bulan' (YYYY-MM) merge key.
# NOTE(review): the date format is inferred here, while ONI is parsed with an
# explicit '%d/%m/%Y' — confirm the DMI file is unambiguous.
dmi['date_dt'] = pd.to_datetime(dmi['Date'])
dmi['tahun_bulan'] = dmi['date_dt'].dt.strftime('%Y-%m')

# Lag and rolling-mean features, computed in the file's row order.
# NOTE(review): assumes rows are already in chronological order — TODO confirm.
for lag in [1, 2, 3, 6]:
    dmi[f'dmi_lag_{lag}'] = dmi['dmi_value'].shift(lag)
for window in [3, 6, 12]:
    dmi[f'dmi_rolling_mean_{window}'] = dmi['dmi_value'].rolling(window, min_periods=1).mean()

# Keep only the merge key and the engineered columns.
dmi_final = dmi.drop(columns=['Date', 'date_dt'])

# --- 2. ONI preprocessing ---
print("Memproses ONI...")
oni = oceanic_nino_index.copy()

# The second column holds the index value; give it a stable name.
oni = oni.rename(columns={oni.columns[1]: 'oni_value'})
# Treat -9999 as missing and carry the last valid value forward. The result is
# assigned back to the column: calling replace(..., inplace=True) on a column
# selection is not guaranteed to modify the parent frame (pandas Copy-on-Write).
oni['oni_value'] = oni['oni_value'].replace(-9999, np.nan).ffill()

# Build the 'tahun_bulan' (YYYY-MM) merge key from day/month/year dates.
oni['date_dt'] = pd.to_datetime(oni['Date'], format='%d/%m/%Y')
oni['tahun_bulan'] = oni['date_dt'].dt.strftime('%Y-%m')

# Lag and rolling-mean features, computed in the file's row order.
# NOTE(review): assumes rows are already in chronological order — TODO confirm.
for lag in [1, 2, 3, 6]:
    oni[f'oni_lag_{lag}'] = oni['oni_value'].shift(lag)
for window in [3, 6, 12]:
    oni[f'oni_rolling_mean_{window}'] = oni['oni_value'].rolling(window, min_periods=1).mean()

# Keep only the merge key and the engineered columns.
oni_final = oni.drop(columns=['Date', 'date_dt'])

# --- 3. Relative Humidity preprocessing ---
print("Memproses Relative Humidity...")

# rename() returns a new frame, so the raw `relative_humidity` stays untouched.
rh = relative_humidity.rename(columns={'month': 'tahun_bulan', 'mean_rh': 'rh_value'})
rh['rh_value'] = rh['rh_value'].ffill()  # guard against any missing values

# Attach lag columns first, then rolling means, all derived from the filled series.
rh_series = rh['rh_value']
rh = rh.assign(
    **{f'rh_lag_{step}': rh_series.shift(step) for step in (1, 2, 3, 6)},
    **{
        f'rh_rolling_mean_{span}': rh_series.rolling(span, min_periods=1).mean()
        for span in (3, 6, 12)
    },
)

# Final frame: every column is kept.
rh_final = rh.copy()

# --- 4. Air Quality Index preprocessing ---
print("Memproses Air Quality Index...")

# First column becomes the merge key, second column the value; rename()
# returns a new frame, so the raw `air_quality_index` stays untouched.
aqi_source_cols = air_quality_index.columns
aqi = air_quality_index.rename(
    columns={aqi_source_cols[0]: 'tahun_bulan', aqi_source_cols[1]: 'aqi_value'}
)
aqi['aqi_value'] = aqi['aqi_value'].ffill()

# Attach lag columns first, then rolling means, all derived from the filled series.
aqi_series = aqi['aqi_value']
aqi = aqi.assign(
    **{f'aqi_lag_{step}': aqi_series.shift(step) for step in (1, 2, 3, 6)},
    **{
        f'aqi_rolling_mean_{span}': aqi_series.rolling(span, min_periods=1).mean()
        for span in (3, 6, 12)
    },
)

# Final frame: every column is kept.
aqi_final = aqi.copy()

# --- 5. Combine all external data into a single DataFrame ---
print("Menggabungkan semua data eksternal...")
# DMI is the base table; every other source is left-joined onto it by month key.
external_features = dmi_final
for monthly_frame in (oni_final, rh_final, aqi_final):
    external_features = pd.merge(external_features, monthly_frame, on='tahun_bulan', how='left')

# Fill remaining NaN (leading rows from lag features, unmatched months) with a
# forward fill, then a backward fill.
# NOTE(review): the backward fill copies later values into earlier rows —
# confirm this is acceptable for the earliest months.
external_features = external_features.ffill().bfill()
external_features['tahun_bulan'] = pd.to_datetime(external_features['tahun_bulan'])

# The check mark was stored as mojibake ("âœ…"); restored to the intended character.
print("\n✅ Pra-pemrosesan data eksternal selesai.")
print(f"Total fitur eksternal yang dibuat: {external_features.shape[1] - 1}")
print("\nContoh 5 baris pertama dari data eksternal yang siap digabung:")
external_features.head()

In [None]:
# Inspect column dtypes and non-null counts of the merged external features.
external_features.info()

In [None]:
# Load the training CSV from the data/clean folder and preview the first rows.
train_clean = pd.read_csv('../data/clean/train.csv')
train_clean.head()

In [None]:
# Parse the date column (rows may use different date formats).
train_clean['date'] = pd.to_datetime(train_clean['date'], format='mixed')

# Build the month-level join key the same way external_features builds it:
# 'YYYY-MM' string -> timestamp at the start of that month.
train_clean['tahun_bulan'] = pd.to_datetime(train_clean['date'].dt.strftime('%Y-%m'))

# Left-join the monthly external features onto the training rows. A left join
# multiplies rows when the right side has duplicate keys, so check the row
# count before overwriting train_clean.
train_with_external = pd.merge(train_clean, external_features, on='tahun_bulan', how='left')
assert len(train_with_external) == len(train_clean), (
    "Merge changed the number of training rows; "
    "external_features has duplicate 'tahun_bulan' keys."
)

# The join key is no longer needed once the features are attached.
train_clean = train_with_external.drop(columns=['tahun_bulan'])

In [None]:
def _safe_ratio(numerator, denominator, fill_value=1):
    """Divide two Series element-wise, replacing +/-inf and NaN results with fill_value."""
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(fill_value)


def create_features(df):
    """
    Build engineered features on a weather DataFrame.

    The input is not modified; a copy is processed and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: 'date',
        'maximum_temperature_c', 'minimum_temperature_c',
        'mean_temperature_c', 'max_wind_speed_kmh', 'mean_wind_speed_kmh',
        'highest_60_min_rainfall_mm', 'highest_30_min_rainfall_mm',
        'oni_value', 'dmi_value', 'rh_value' and 'aqi_value'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by 'date' with a fresh 0..n-1 index and the
        new feature columns added.

    Notes
    -----
    Prints a progress message for each feature group.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Make sure 'date' is a datetime column.
    df['date'] = pd.to_datetime(df['date'])

    # --- 1. Calendar features ---
    print("Membuat fitur waktu...")
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_year'] = df['date'].dt.dayofyear
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)

    # --- 2. Daily temperature range ---
    print("Membuat rentang temperatur harian...")
    df['temp_range_c'] = df['maximum_temperature_c'] - df['minimum_temperature_c']

    # --- 3. Wind gust factor ---
    print("Membuat faktor hembusan angin...")
    # Division by zero (inf) or an undefined ratio (NaN) becomes 1. The result
    # is assigned back to the column instead of using chained inplace=True
    # calls, which are not guaranteed to modify df (pandas Copy-on-Write).
    df['wind_gust_factor'] = _safe_ratio(df['max_wind_speed_kmh'], df['mean_wind_speed_kmh'])

    # --- 4. Rain intensity ratio ---
    print("Membuat rasio intensitas hujan...")
    # Same inf/NaN -> 1 handling as the wind gust factor.
    df['rain_intensity_ratio'] = _safe_ratio(
        df['highest_60_min_rainfall_mm'], df['highest_30_min_rainfall_mm']
    )

    # --- 5. Lag features (previous row after sorting by date) ---
    print("Membuat fitur lag...")
    # Sort first so shift(1) refers to the preceding date.
    df = df.sort_values(by='date').reset_index(drop=True)
    lag_features = ['mean_temperature_c', 'highest_60_min_rainfall_mm', 'mean_wind_speed_kmh']
    for feature in lag_features:
        df[f'{feature}_lag1'] = df[feature].shift(1)

    # --- 6. Rolling statistics (windows counted in rows) ---
    print("Membuat statistik bergulir...")
    df['mean_temp_roll_7d'] = df['mean_temperature_c'].rolling(window=7, min_periods=1).mean()
    df['max_rain_roll_3d'] = df['highest_60_min_rainfall_mm'].rolling(window=3, min_periods=1).max()
    df['mean_wind_roll_7d'] = df['mean_wind_speed_kmh'].rolling(window=7, min_periods=1).mean()

    # --- Cyclical encoding and first interaction term ---
    days_in_year = 366  # period used for the sin/cos encoding of day_of_year
    df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / days_in_year)
    df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / days_in_year)
    df['wind_x_rain'] = df['mean_wind_speed_kmh'] * df['highest_60_min_rainfall_mm']

    # --- Interaction features built from the external data columns ---
    print("Menambahkan fitur interaksi dari data eksternal...")

    # ONI x mean temperature
    df['oni_x_temp'] = df['oni_value'] * df['mean_temperature_c']

    # DMI x 60-minute rainfall
    df['dmi_x_rainfall'] = df['dmi_value'] * df['highest_60_min_rainfall_mm']

    # Heat-index proxy: relative humidity x mean temperature
    df['heat_index_proxy'] = df['rh_value'] * df['mean_temperature_c']

    # AQI x daily temperature range
    df['aqi_x_temp_range'] = df['aqi_value'] * df['temp_range_c']
    print("-" * 30)
    return df

# Run the feature pipeline on the training and test frames.
# NOTE(review): `test_merged` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. Presumably it is
# the test CSV merged with external_features the same way as train_clean —
# TODO confirm and add the cell that builds it.
train_featured = create_features(train_clean)
test_featured = create_features(test_merged)

In [None]:
# Inspect column dtypes and non-null counts of the engineered training frame.
train_featured.info()

In [None]:
# Inspect column dtypes and non-null counts of the engineered test frame.
test_featured.info()