In [68]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Single seed reused by NumPy's global RNG and by the scikit-learn calls below
# (train_test_split, RandomForestClassifier).
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [86]:
# Load the labelled training table and the unlabelled table that is predicted
# later in the notebook. Both paths are relative to the working directory.
df_train = pd.read_csv('./file_created/merged_ispu.csv')
df_test = pd.read_csv('./file_created/testdata_scraping.csv')

In [87]:
# Keep only the rows whose target is an actual category; 'TIDAK ADA DATA'
# rows carry no usable label.
has_category = df_train['categori'].ne('TIDAK ADA DATA')
df_train = df_train[has_category]
df_train

Unnamed: 0,periode_data,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,201001,2010-01-01,DKI1 (Bunderan HI),60,0,4,73,27,14,73,CO,SEDANG
5,201001,2010-01-02,DKI1 (Bunderan HI),32,0,2,16,33,9,33,O3,BAIK
10,201001,2010-01-03,DKI1 (Bunderan HI),27,0,2,19,20,9,27,PM10,BAIK
15,201001,2010-01-04,DKI1 (Bunderan HI),22,0,2,16,15,6,22,PM10,BAIK
20,201001,2010-01-05,DKI1 (Bunderan HI),25,0,2,17,15,8,25,PM10,BAIK
...,...,...,...,...,...,...,...,...,...,...,...,...
16897,202508,2025-08-31,DKI1 Bundaran Hotel Indonesia HI,42.0,70.0,29.0,12.0,15.0,24.0,70.0,PM25,SEDANG
16898,202508,2025-08-31,DKI2 Kelapa Gading,0,72.0,45.0,16.0,21.0,16.0,72.0,PM25,SEDANG
16899,202508,2025-08-31,DKI3 Jagakarsa,28.0,60.0,53.0,8.0,19.0,39.0,60.0,PM25,SEDANG
16900,202508,2025-08-31,DKI4 Lubang Buaya,47.0,59.0,27.0,10.0,18.0,17.0,59.0,PM25,SEDANG


In [88]:
# Rows whose 'stasiun' cell holds a category label or '0' instead of a station
# name are dropped (presumably shifted/corrupt source rows -- verify upstream).
invalid_station_values = ['SANGAT TIDAK SEHAT', 'TIDAK SEHAT', 'SEDANG', 'BAIK', '0']

# .copy() detaches the result from the frame it was sliced from: the following
# cells assign new columns onto df_train, which would otherwise be chained
# assignment on a slice (SettingWithCopyWarning, currently silenced by
# warnings.filterwarnings('ignore')).
df_train = df_train[~df_train['stasiun'].isin(invalid_station_values)].copy()

In [89]:
# Collapse the free-text station names to a compact code: upper-case the
# text, pull out the "DKI<n>" token (n in 1-5, optional whitespace before the
# digit) and remove the spaces inside it. Names without such a token become NaN.
station_upper = df_train['stasiun'].str.upper()
station_code = station_upper.str.extract(r'(DKI\s*[1-5])')[0]
df_train['stasiun'] = station_code.str.replace(' ', '', regex=False)


In [90]:
# Pollutant columns shared by the training and test frames.
cols = ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2']

# Convert the training pollutant columns to float.
# Cells consisting only of dashes (e.g. '-' or '---') are treated as 0, as before.
# Unlike the previous `.str.replace('-', '0')`, this:
#   * also works when a column is already numeric (so the cell can be re-run),
#   * keeps non-string entries instead of turning them into NaN,
#   * does not touch a '-' that is part of a value (e.g. a minus sign).
# Any other unparseable text still raises, so bad data is not silently hidden.
for c in cols:
    without_placeholders = df_train[c].replace(r'^\s*-+\s*$', '0', regex=True)
    df_train[c] = pd.to_numeric(without_placeholders).astype(float)

# Derived features: the pollutant with the largest value in each row and that
# value. These overwrite the 'critical'/'max' columns read from the CSV so that
# train and test are computed the same way.
df_train['critical'] = df_train[cols].idxmax(axis=1)
df_train['max'] = df_train[cols].max(axis=1)

# The test pollutant columns are used as loaded (no placeholder cleaning here).
df_test['critical'] = df_test[cols].idxmax(axis=1)
df_test['max'] = df_test[cols].max(axis=1)

In [91]:
# Parse the date column, then expose its calendar components as features.
df_train['tanggal'] = pd.to_datetime(df_train['tanggal'])
for date_part in ('year', 'month', 'day'):
    df_train[date_part] = getattr(df_train['tanggal'].dt, date_part)

In [92]:
# Test ids look like "<date>_DKI<n>": strip the station suffix to recover the
# date, parse it, and derive the same calendar features as for training.
date_text = df_test['id'].str.replace(r'_DKI[1-5]$', '', regex=True)
df_test['tanggal'] = pd.to_datetime(date_text)

for date_part in ('year', 'month', 'day'):
    df_test[date_part] = getattr(df_test['tanggal'].dt, date_part)

# The station code is the "DKI<n>" token embedded in the id.
df_test['stasiun'] = df_test['id'].str.extract(r'(DKI[1-5])')

In [93]:
# Integer-encode the categorical columns.
# Each column gets its own LabelEncoder, kept in `encoders`, instead of
# refitting one shared instance: previously `le` only ended up holding the
# target mapping because 'categori' happened to be last in the list.
cols_feature = ['stasiun', 'critical', 'categori']
encoders = {}
for col in cols_feature:
    encoder = LabelEncoder()
    df_train[col] = encoder.fit_transform(df_train[col])
    if col != 'categori':
        # The test frame has no target column; feature columns reuse the
        # mapping learned on the training data (unseen values raise ValueError).
        df_test[col] = encoder.transform(df_test[col])
    encoders[col] = encoder

# Later cells decode predictions with `le`, so bind it explicitly to the
# target encoder.
le = encoders['categori']

In [94]:
# Inspect the training frame after cleaning, feature derivation and encoding.
df_train

Unnamed: 0,periode_data,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori,year,month,day
0,201001,2010-01-01,0,60.0,0.0,4.0,73.0,27.0,14.0,73.0,0,4,2010,1,1
5,201001,2010-01-02,0,32.0,0.0,2.0,16.0,33.0,9.0,33.0,2,1,2010,1,2
10,201001,2010-01-03,0,27.0,0.0,2.0,19.0,20.0,9.0,27.0,3,1,2010,1,3
15,201001,2010-01-04,0,22.0,0.0,2.0,16.0,15.0,6.0,22.0,3,1,2010,1,4
20,201001,2010-01-05,0,25.0,0.0,2.0,17.0,15.0,8.0,25.0,3,1,2010,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16897,202508,2025-08-31,0,42.0,70.0,29.0,12.0,15.0,24.0,70.0,4,4,2025,8,31
16898,202508,2025-08-31,1,0.0,72.0,45.0,16.0,21.0,16.0,72.0,4,4,2025,8,31
16899,202508,2025-08-31,2,28.0,60.0,53.0,8.0,19.0,39.0,60.0,4,4,2025,8,31
16900,202508,2025-08-31,3,47.0,59.0,27.0,10.0,18.0,17.0,59.0,4,4,2025,8,31


In [95]:
# Inspect the test frame after the same feature derivation and encoding.
df_test

Unnamed: 0,pm10,pm25,so2,co,o3,no2,id,critical,max,tanggal,year,month,day,stasiun
0,53,86,35,10,19,26,2025-09-01_DKI1,4,86,2025-09-01,2025,9,1,0
1,0,84,44,14,24,15,2025-09-01_DKI2,4,84,2025-09-01,2025,9,1,1
2,42,80,55,11,22,45,2025-09-01_DKI3,4,80,2025-09-01,2025,9,1,2
3,44,78,28,12,23,16,2025-09-01_DKI4,4,78,2025-09-01,2025,9,1,3
4,49,83,30,9,24,34,2025-09-01_DKI5,4,83,2025-09-01,2025,9,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,56,87,36,15,21,32,2025-11-30_DKI1,4,87,2025-11-30,2025,11,30,0
451,56,82,51,18,22,11,2025-11-30_DKI2,4,82,2025-11-30,2025,11,30,1
452,37,66,54,11,19,76,2025-11-30_DKI3,1,76,2025-11-30,2025,11,30,2
453,59,62,33,10,26,29,2025-11-30_DKI4,4,62,2025-11-30,2025,11,30,3


In [96]:
# Model inputs: encoded station, the six pollutant readings, the derived
# max/critical columns and the calendar parts of the date.
features = [
    'stasiun',
    'pm10', 'pm25', 'so2', 'co', 'o3', 'no2',
    'max', 'critical',
    'year', 'month', 'day',
]
X = df_train.loc[:, features]
y = df_train.loc[:, 'categori']

# shuffle=False keeps the row order, so the final 20% of rows form the
# hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    shuffle=False,
)

In [97]:
# Hyper-parameters gathered in one mapping so they can be read and tuned in
# one place; class_weight='balanced_subsample' reweights classes per tree.
rf_params = {
    'n_estimators': 500,
    'max_depth': 15,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'class_weight': 'balanced_subsample',
    'n_jobs': -1,
    'random_state': RANDOM_STATE,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

In [98]:
# Score the model on the hold-out rows (X_test/y_test come from the training
# file, not from df_test). Classes in the report are the encoded labels.
test_rf = rf_model.predict(X_test)
holdout_report = classification_report(y_test, test_rf)
print(holdout_report)

              precision    recall  f1-score   support

           1       1.00      0.95      0.97       381
           3       1.00      1.00      1.00         1
           4       0.99      1.00      1.00      2377
           5       0.99      1.00      1.00       324

    accuracy                           0.99      3083
   macro avg       1.00      0.99      0.99      3083
weighted avg       0.99      0.99      0.99      3083



In [99]:
# Pair every input column with the forest's importance score and rank them
# from most to least important.
importance_table = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': rf_model.feature_importances_,
    }
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

top_features = feature_importance.head(15)
print(top_features.to_string(index=False))

 feature  importance
     max    0.378600
      o3    0.244440
    pm10    0.095854
     no2    0.084618
    pm25    0.042710
      co    0.035136
     day    0.029358
    year    0.023992
   month    0.022931
 stasiun    0.014602
     so2    0.014445
critical    0.013315


In [102]:
# Predict the encoded category for every test row, using the same feature
# columns (and column order) the model was trained on.
test_data = df_test.loc[:, features]

df_test['preds'] = preds = rf_model.predict(test_data)

In [104]:
# Submission template; the preview below shows an 'id' column and an empty
# 'category' column to fill in.
sub = pd.read_csv('../penyisihan-datavidia-10/sample_submission.csv')

In [106]:
# Preview the test frame with the new 'preds' column.
df_test.head()

Unnamed: 0,pm10,pm25,so2,co,o3,no2,id,critical,max,tanggal,year,month,day,stasiun,preds
0,53,86,35,10,19,26,2025-09-01_DKI1,4,86,2025-09-01,2025,9,1,0,4
1,0,84,44,14,24,15,2025-09-01_DKI2,4,84,2025-09-01,2025,9,1,1,4
2,42,80,55,11,22,45,2025-09-01_DKI3,4,80,2025-09-01,2025,9,1,2,4
3,44,78,28,12,23,16,2025-09-01_DKI4,4,78,2025-09-01,2025,9,1,3,4
4,49,83,30,9,24,34,2025-09-01_DKI5,4,83,2025-09-01,2025,9,1,4,4


In [107]:
# Preview the submission template before it is filled in.
sub.head()

Unnamed: 0,id,category
0,2025-09-01_DKI1,
1,2025-09-01_DKI2,
2,2025-09-01_DKI3,
3,2025-09-01_DKI4,
4,2025-09-01_DKI5,


In [122]:
# Decode the integer predictions back to category names. This relies on `le`
# being the encoder that was last fitted on the 'categori' column.
df_test['category'] = le.inverse_transform(df_test['preds'])

In [124]:
# Show the test rows predicted as encoded class 3, which decodes to
# 'SANGAT TIDAK SEHAT' in this run.
df_test[df_test['preds'] == 3]

Unnamed: 0,pm10,pm25,so2,co,o3,no2,id,critical,max,tanggal,year,month,day,stasiun,preds,category
114,53,276,26,10,39,44,2025-09-23_DKI5,4,276,2025-09-23,2025,9,23,4,3,SANGAT TIDAK SEHAT


In [108]:
# Look up each submission id in the test frame and copy over its encoded
# prediction; ids without a match end up as NaN.
preds_by_id = df_test.set_index('id')['preds']
sub['category'] = sub['id'].map(preds_by_id)

In [120]:
# Decode the encoded predictions in the submission to category names.
# The cell overwrites its own input, so it only decodes while the column is
# still numeric; re-running it after decoding is then a no-op instead of an
# error.
encoded_category = sub['category']
if pd.api.types.is_numeric_dtype(encoded_category):
    unmatched = encoded_category.isna()
    if unmatched.any():
        # NaN here means a submission id had no prediction to map from.
        missing_ids = sub.loc[unmatched, 'id'].tolist()
        raise ValueError(
            f"No prediction found for {len(missing_ids)} submission id(s), "
            f"e.g. {missing_ids[:5]}"
        )
    sub['category'] = le.inverse_transform(encoded_category.astype(int))
elif not encoded_category.isin(le.classes_).all():
    # Neither encoded integers nor already-decoded names: refuse to continue.
    raise ValueError(
        "sub['category'] contains values that are neither encoded predictions "
        "nor known category names"
    )

In [121]:
# Sanity check: list the distinct category names present in the submission.
sub['category'].unique()

array(['SEDANG', 'TIDAK SEHAT', 'BAIK', 'SANGAT TIDAK SEHAT'],
      dtype=object)

In [127]:
# Write the submission to the working directory; index=False leaves the row
# index out of the file.
sub.to_csv('submission_1.csv', index=False)