In [None]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import warnings

# NOTE(review): this hides every warning for the rest of the session,
# including deprecation notices from pandas / LightGBM.
warnings.filterwarnings('ignore')

# Stage 1: take working copies of the feature-engineered frames and make sure
# the 'date' column is a real datetime so it can be sorted and decomposed.
print("Tahap 1: Memuat dataset...")
try:
    # Copies keep the upstream frames untouched by the steps below.
    train_df = train_featured.copy()
    test_df = test_featured.copy()

    train_df['date'] = pd.to_datetime(train_df['date'])
    test_df['date'] = pd.to_datetime(test_df['date'])

    print("‚úÖ Dataset berhasil dimuat.")
    print(f"Jumlah baris data training asli: {len(train_df)}")
    print(f"Jumlah baris data testing asli: {len(test_df)}")

except FileNotFoundError as e:
    # NOTE(review): nothing in the try body reads a file, so this handler looks
    # like a leftover from a pd.read_csv version — confirm whether it is needed.
    print(f"‚ùå Error: File tidak ditemukan. Pastikan path '/kaggle/input/nama-dataset-anda/...' sudah benar. Detail: {e}")
    # Stop with a non-zero status. The bare exit() used before reports success
    # (status 0), depends on the `site` module, and shuts down a Jupyter kernel.
    raise SystemExit(1) from e

print("\nTahap 2: Mengisi nilai yang hilang (Imputasi)...")

# Order rows by location and date so forward/backward fill follows time
# within each location.
train_df = train_df.sort_values(by=['lokasi', 'date'])
test_df = test_df.sort_values(by=['lokasi', 'date'])

# Every training column except the keys and the target gets imputed.
kolom_dikecualikan = ('date', 'daily_rainfall_total_mm', 'lokasi')
kolom_fitur_imputasi = [nama for nama in train_df.columns if nama not in kolom_dikecualikan]

# Apply the same per-location fill to both frames; the test frame only
# receives it for columns it actually has.
for frame in (train_df, test_df):
    for col in kolom_fitur_imputasi:
        if col not in frame.columns:
            continue
        per_lokasi = frame.groupby('lokasi')[col]
        # Forward fill first, then backward fill whatever is still missing
        # at the start of a location's series.
        frame[col] = per_lokasi.transform(lambda nilai: nilai.ffill().bfill())

print("‚úÖ Proses imputasi selesai.")
print("\nTahap 3: Membuat fitur-fitur baru untuk model...")

# Tag each row with its origin so the stacked frame can be split again later.
for asal, frame in (('train', train_df), ('test', test_df)):
    frame['source'] = asal

# Stack train on top of test with a fresh 0..n-1 index.
combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

def create_time_series_features(df, lags=(7, 14, 30), windows=(7, 14, 30)):
    """Add calendar, lag and rolling-window features to the combined frame.

    The input is copied, sorted by ``lokasi`` then ``date`` and re-indexed
    from 0, so the returned frame is in that order and the caller's frame is
    left unchanged.

    Args:
        df: Frame with a datetime ``date`` column, a ``lokasi`` column and the
            columns ``daily_rainfall_total_mm``, ``mean_temperature_c`` and
            ``maximum_temperature_c``.
        lags: Row offsets for the ``<feature>_lag_<n>`` columns.
        windows: Window sizes for the ``<feature>_rolling_mean_<n>`` and
            ``<feature>_rolling_std_<n>`` columns.

    Returns:
        A new DataFrame containing the original columns plus the date, lag
        and rolling features.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = df.copy()
    df = df.sort_values(by=['lokasi', 'date']).reset_index(drop=True)

    # Calendar features
    dates = df['date'].dt
    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['dayofweek'] = dates.dayofweek
    df['dayofyear'] = dates.dayofyear
    df['weekofyear'] = dates.isocalendar().week.astype(int)

    lokasi = df['lokasi']
    features_to_process = ['daily_rainfall_total_mm', 'mean_temperature_c', 'maximum_temperature_c']
    for feature in features_to_process:
        per_lokasi = df[feature].groupby(lokasi)

        for lag in lags:
            df[f'{feature}_lag_{lag}'] = per_lokasi.shift(lag)

        # Shift by one row so a window never contains the current row's value.
        shifted = per_lokasi.shift(1)
        for window in windows:
            # Roll inside each location. Rolling over the flat shifted series
            # would let the first rows of one location absorb the last rows
            # of the location sorted before it.
            rolling = shifted.groupby(lokasi).rolling(window, min_periods=1)
            # The grouped result is indexed by (lokasi, row); dropping the
            # lokasi level restores alignment with df's index.
            df[f'{feature}_rolling_mean_{window}'] = rolling.mean().reset_index(level=0, drop=True)
            df[f'{feature}_rolling_std_{window}'] = rolling.std().reset_index(level=0, drop=True)
    return df

# Derive the time-series features on the stacked frame, then one-hot encode
# the location column into integer 'lokasi_<name>' indicator columns.
full_df = pd.get_dummies(
    create_time_series_features(combined_df),
    columns=['lokasi'],
    prefix='lokasi',
    dtype=int,
)

print("‚úÖ Rekayasa fitur selesai.")
print("\nTahap 4: Mempersiapkan data dan melatih model LightGBM...")

# Split back into train and test using the origin tag, dropping the tag
# itself; drop() returns new frames, so neither result aliases full_df.
train_final = full_df[full_df['source'].eq('train')].drop(columns=['source'])
test_final = full_df[full_df['source'].eq('test')].drop(columns=['source'])

for pesan, frame in (
    ("Jumlah baris data training setelah diproses", train_final),
    ("Jumlah baris data testing setelah diproses", test_final),
):
    print(f"{pesan}: {len(frame)}")

# Define the target and the feature list (everything except date and target).
TARGET = 'daily_rainfall_total_mm'
kolom_bukan_fitur = ('date', TARGET)
features = [nama for nama in train_final.columns if nama not in kolom_bukan_fitur]

# Remaining gaps (e.g. lag/rolling values at the start of a series) are
# replaced with the sentinel -999 in both feature matrices.
X_train = train_final[features].fillna(-999)
y_train = train_final[TARGET]
X_test = test_final[features].fillna(-999)

lgbm_params = {
    'objective': 'regression_l1',
    'metric': 'rmse',
    'n_estimators': 2500,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 40,
    'verbose': -1,
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
}

# Early stopping only means something when it is scored on rows the model was
# not fitted on. Evaluating on the training set (as before) measures fit, not
# generalisation, so the stopping point was arbitrary.
# Here the most recent share of training dates is held out to choose the
# number of trees, and the final model is then refitted on every training row.
VALIDATION_FRACTION = 0.2
EARLY_STOPPING_ROUNDS = 150

# X_train / y_train were sliced from train_final, so a mask built from its
# 'date' column lines up with them row for row.
unique_dates = train_final['date'].dropna().drop_duplicates().sort_values().reset_index(drop=True)
cutoff_pos = int(len(unique_dates) * (1 - VALIDATION_FRACTION))

if 0 < cutoff_pos < len(unique_dates):
    is_validation = (train_final['date'] >= unique_dates.iloc[cutoff_pos]).to_numpy()

    probe_model = lgb.LGBMRegressor(**lgbm_params)
    probe_model.fit(X_train[~is_validation], y_train[~is_validation],
                    eval_set=[(X_train[is_validation], y_train[is_validation])],
                    eval_metric='rmse',
                    callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)])
    # Fall back to the configured tree count if no best iteration was recorded.
    best_n_estimators = probe_model.best_iteration_ or lgbm_params['n_estimators']
else:
    # Too few distinct dates to carve out a hold-out: train the full schedule.
    best_n_estimators = lgbm_params['n_estimators']

# Final model: same hyper-parameters, tree count fixed by the hold-out run,
# fitted on all training rows so the most recent data is not wasted.
model = lgb.LGBMRegressor(**{**lgbm_params, 'n_estimators': best_n_estimators})
model.fit(X_train, y_train)

print("‚úÖ Model berhasil dilatih.")
print("\nTahap 5: Melakukan prediksi dan membuat file submission...")

# Predict the target for the test rows and floor negative values at zero.
predictions = model.predict(X_test)
predictions[predictions < 0] = 0

# Recover each row's location name from its one-hot indicator columns.
# 'lokasi_asal' is excluded so re-running this step on the same frame does not
# treat the recovered-name column as an indicator.
LOKASI_PREFIX = 'lokasi_'
lokasi_cols = [
    col for col in test_final.columns
    if col.startswith(LOKASI_PREFIX) and col != 'lokasi_asal'
]
# Slice the prefix off instead of str.replace: replace would also delete any
# later 'lokasi_' occurrence inside a location name.
test_final['lokasi_asal'] = test_final[lokasi_cols].idxmax(axis=1).str[len(LOKASI_PREFIX):]

# Submission id has the form '<location>_<YYYY-MM-DD>'.
submission_id = test_final['lokasi_asal'] + '_' + test_final['date'].dt.strftime('%Y-%m-%d')

# predictions is positional and test_final's rows are in the same order as
# X_test, so pairing them here keeps ids and predictions aligned.
submission_df = pd.DataFrame({
    'id': submission_id,
    'Daily Rainfall Total (mm)': predictions
})

submission_filename = 'submission_LGBM_final.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"\nüéâ File submission berhasil dibuat: '{submission_filename}'")
print(f"Jumlah baris file submission: {len(submission_df)}")
print("\nContoh 5 baris pertama hasil prediksi:")
print(submission_df.head())

In [None]:
# Display the submission frame as the cell output.
submission_df

In [None]:
# Read 'submission_LGBM_final.csv' back from disk and print its
# column / dtype / non-null summary.
final = pd.read_csv('submission_LGBM_final.csv')
final.info()

In [None]:
# Show the first rows of the reloaded submission.
final.head()

In [None]:
# Spot-check the row at position 700 (raises IndexError if the frame has
# 700 rows or fewer).
final.iloc[700]

In [None]:
# Prediction file to convert. It must contain the columns
# 'id' and 'Daily Rainfall Total (mm)'.
file_untuk_diubah = 'submission_LGBM_final.csv'

# Name of the reformatted output file.
file_submission_final = 'submission.csv'

# --- PROCESS ---
try:
    nama_kolom_sumber = 'id'

    # 0. Load the input from disk. Previously this cell transformed whatever
    #    `final` an earlier cell had left behind and never read
    #    file_untuk_diubah, so a second run failed on the already-dropped 'id'
    #    column and the FileNotFoundError handler below could never fire.
    final = pd.read_csv(file_untuk_diubah)

    # 1. Split the id on its last underscore into the raw city name and the
    #    raw date (temporary columns 'kota_raw' and 'tanggal_raw').
    final[['kota_raw', 'tanggal_raw']] = final[nama_kolom_sumber].str.rsplit('_', n=1, expand=True)

    # 2. Parse the raw date into a datetime (temporary column 'tanggal_dt').
    final['tanggal_dt'] = pd.to_datetime(final['tanggal_raw'])

    # 3. Build the output columns: lower-cased city plus the date with
    #    underscores, and the year / month / day parts.
    final['ID (kota)'] = final['kota_raw'].str.lower() + '_' + final['tanggal_raw'].str.replace('-', '_', regex=False)
    final['tahun'] = final['tanggal_dt'].dt.year
    final['bulan'] = final['tanggal_dt'].dt.month
    final['hari'] = final['tanggal_dt'].dt.day
    final.rename(columns = {'Daily Rainfall Total (mm)': 'prediksi'}, inplace = True)

    # 4. Drop the temporary columns and the original id.
    final.drop(columns=['kota_raw', 'tanggal_raw', 'tanggal_dt', nama_kolom_sumber], inplace=True)

    # 5. Save to the new CSV file.
    print(f"Menyimpan hasil ke '{file_submission_final}'...")
    final.to_csv(file_submission_final, index=False)

    # 6. The file is now ready to be downloaded.
    print("\n‚úÖ Proses selesai. File siap untuk di-download.")
    # Note: files.download() will trigger a download in the browser when run in Colab
    # files.download(file_submission_final) # Uncomment this line if you want to auto-download

except FileNotFoundError:
    print(f"‚ùå Error: File '{file_untuk_diubah}' tidak ditemukan.")
    print("Pastikan Anda sudah meng-upload file tersebut atau nama filenya sudah benar.")
except Exception as e:
    # Best-effort notebook cell: report the problem instead of raising.
    print(f"‚ùå An unexpected error occurred: {e}")

In [None]:
# Load the reformatted submission and lift the prediction column out of the
# frame; pop() returns the column and removes it in place.
final_submission = pd.read_csv('submission.csv')
prediksi = final_submission.pop('prediksi')

In [None]:
# Put the prediction column back. It was removed from the frame earlier, so
# assigning it again appends it as the last column. Then display the frame.
final_submission['prediksi'] = prediksi
final_submission

In [None]:
# Print the column / dtype / non-null summary of the final submission frame.
final_submission.info()

In [None]:
# Create the figure before drawing so the requested size applies to this plot.
# Calling plt.figure() after the plot only opened a second, empty figure.
plt.figure(figsize = (100,20))

# Line plot of the predicted values in row order.
sns.lineplot(final_submission['prediksi'])

# Tighten the layout once the axes have content.
plt.tight_layout()

In [None]:
# Write the final submission to 'submission_final.csv' without the index column.
final_submission.to_csv('submission_final.csv', index = False)

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
final_submission.describe().round(2)