In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Modelling
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')

SEED = 42

In [31]:
# Load Dataset ISPU
from pathlib import Path

files = sorted(Path("ISPU").glob("*.csv"))
dataframes = [pd.read_csv(file) for file in files]


In [32]:
# Gabungkan semua file ISPU
df_ispu = pd.concat(dataframes, ignore_index=True)
print(df_ispu.shape)

(16902, 23)


In [33]:
# Tampilkan informasi dasar tentang dataset
df_ispu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16902 entries, 0 to 16901
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   periode_data               16902 non-null  int64  
 1   tanggal                    16902 non-null  object 
 2   stasiun                    14000 non-null  object 
 3   pm_sepuluh                 4684 non-null   object 
 4   pm_duakomalima             5182 non-null   object 
 5   sulfur_dioksida            4817 non-null   object 
 6   karbon_monoksida           4826 non-null   object 
 7   ozon                       4827 non-null   object 
 8   nitrogen_dioksida          4804 non-null   object 
 9   max                        16895 non-null  object 
 10  parameter_pencemar_kritis  4795 non-null   object 
 11  kategori                   4870 non-null   object 
 12  bulan                      3045 non-null   float64
 13  pm10                       11667 non-null  obj

In [34]:
# Tampilkan 10 baris pertama dari dataset
df_ispu.head(10)

Unnamed: 0,periode_data,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,bulan,pm10,so2,co,o3,no2,critical,categori,lokasi_spku,pm25,pm_10
0,202302,2023-02-25,DKI5 Kebon Jeruk Jakarta Barat,35,-,13,12,31,18,35,PM10,BAIK,,,,,,,,,,,
1,202302,2023-02-26,DKI5 Kebon Jeruk Jakarta Barat,23,-,14,9,32,11,32,O3,BAIK,,,,,,,,,,,
2,202302,2023-02-27,DKI5 Kebon Jeruk Jakarta Barat,20,-,13,8,33,13,33,O3,BAIK,,,,,,,,,,,
3,202302,2023-02-28,DKI5 Kebon Jeruk Jakarta Barat,30,-,21,11,28,18,30,PM10,BAIK,,,,,,,,,,,
4,202303,2023-03-01,DKI1 Bunderan HI,38,44,50,8,19,27,50,3,BAIK,,,,,,,,,,,
5,202303,2023-03-02,DKI1 Bunderan HI,29,33,47,11,21,27,47,3,BAIK,,,,,,,,,,,
6,202303,2023-03-03,DKI1 Bunderan HI,38,46,49,9,16,25,49,3,BAIK,,,,,,,,,,,
7,202303,2023-03-04,DKI1 Bunderan HI,31,40,50,---,18,21,50,3,BAIK,,,,,,,,,,,
8,202303,2023-03-05,DKI1 Bunderan HI,33,41,47,11,21,22,47,3,BAIK,,,,,,,,,,,
9,202303,2023-03-06,DKI1 Bunderan HI,31,44,46,9,20,19,46,3,BAIK,,,,,,,,,,,


In [35]:
# Pisahkan data latih dan data uji
train_df = df_ispu[df_ispu['kategori'].notna()].copy()

In [39]:
# Konversi kolom numerik ke tipe data numerik
num_cols = ['pm10','pm25','so2','co','o3','no2']

for col in num_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')

In [None]:
# Cek missing values pada kolom numerik
train_df[num_cols].isna().mean()

pm10    1.0
pm25    1.0
so2     1.0
co      1.0
o3      1.0
no2     1.0
dtype: float64

In [None]:
# Filter hanya kelas yang valid
valid_classes = ['BAIK','SEDANG','TIDAK SEHAT']
train_df = train_df[train_df['kategori'].isin(valid_classes)]

In [None]:
# Cek tipe data kolom tanggal
train_df['tanggal'].dtype
train_df['tanggal'].head(10)


0    2023-02-25
1    2023-02-26
2    2023-02-27
3    2023-02-28
4    2023-03-01
5    2023-03-02
6    2023-03-03
7    2023-03-04
8    2023-03-05
9    2023-03-06
Name: tanggal, dtype: object

In [None]:
# cek tipe data setiap entri di kolom tanggal
train_df['tanggal'].apply(type).value_counts()

tanggal
<class 'int'>    3026
<class 'str'>    1801
Name: count, dtype: int64

In [None]:
# Konversi kolom tanggal ke tipe datetime
train_df['tanggal'] = pd.to_datetime(
    train_df['tanggal'],
    errors='coerce'
)

In [None]:
# cek jumlah missing values setelah konversi
train_df['tanggal'].isna().sum()

np.int64(3026)

In [None]:
# buang baris dengan nilai tanggal yang tidak valid
train_df = train_df.dropna(subset=['tanggal'])

In [None]:
# sort berdasarkan tanggal
train_df = train_df.sort_values('tanggal')

In [None]:
# cek tipe data setiap entri di kolom tanggal
train_df['tanggal'].apply(type).value_counts()

tanggal
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    1801
Name: count, dtype: int64

In [None]:
# split data latih dan data validasi berdasarkan tanggal
train_df = train_df.sort_values('tanggal')

split_date = train_df['tanggal'].quantile(0.8)

train_data = train_df[train_df['tanggal'] <= split_date]
val_data   = train_df[train_df['tanggal'] >  split_date]


In [37]:
# Persiapan fitur dan target
features = ['pm10','pm25','so2','co','o3','no2']
target = 'kategori'

X_train = train_data[features]
y_train = train_data[target]

X_val = val_data[features]
y_val = val_data[target]

In [None]:
# baseline model
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight='balanced'
)

model.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# evaluasi makro F1
from sklearn.metrics import f1_score

y_pred = model.predict(X_val)
f1 = f1_score(y_val, y_pred, average='macro')
print("Macro F1:", f1)

Macro F1: 0.0


In [40]:
# Simpan train_df untuk keperluan lain
train_df = df_ispu[['tanggal','pm10','pm25','so2','co','o3','no2','kategori']].copy()

In [41]:
# cek tipe data kolom tanggal
# pastikan string
train_df['tanggal'] = train_df['tanggal'].astype(str)

# konversi ke datetime
train_df['tanggal'] = pd.to_datetime(
    train_df['tanggal'],
    format='%Y-%m-%d',
    errors='coerce'
)

# buang tanggal rusak
train_df = train_df.dropna(subset=['tanggal'])


In [42]:
# cek tipe data setiap entri di kolom tanggal
train_df['tanggal'].apply(type).value_counts()

tanggal
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    13385
Name: count, dtype: int64

In [43]:
# split data latih dan data validasi berdasarkan tanggal 
train_df = train_df.sort_values('tanggal')

split_date = train_df['tanggal'].quantile(0.8)

train_data = train_df[train_df['tanggal'] <= split_date]
val_data   = train_df[train_df['tanggal'] >  split_date]

In [44]:
# tambahkan fitur tanggal
train_df['dayofweek'] = train_df['tanggal'].dt.dayofweek  # 0=Senin
train_df['day']       = train_df['tanggal'].dt.day
train_df['month']     = train_df['tanggal'].dt.month
train_df['is_weekend'] = train_df['dayofweek'].isin([5,6]).astype(int)

In [45]:
# tampilkan beberapa baris
train_df[['tanggal','dayofweek','is_weekend']].head()

Unnamed: 0,tanggal,dayofweek,is_weekend
5088,2010-01-01,4,0
4914,2010-01-01,4,0
4883,2010-01-01,4,0
5057,2010-01-01,4,0
4972,2010-01-01,4,0


In [46]:
# encoding target
allowed = ['BAIK','SEDANG','TIDAK SEHAT']
train_df = train_df[train_df['kategori'].isin(allowed)]


In [47]:
# encoding target
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['target'] = le.fit_transform(train_df['kategori'])

le.classes_


array(['BAIK', 'SEDANG', 'TIDAK SEHAT'], dtype=object)

In [48]:
# menyiapkan fitur dan target
features = [
    'pm10','pm25','so2','co','o3','no2',
    'dayofweek','day','month','is_weekend'
]

X = train_df[features]
y = train_df['target']

In [49]:
# split data latih dan data validasi berdasarkan tanggal
split_date = train_df['tanggal'].quantile(0.8)

X_train = X[train_df['tanggal'] <= split_date]
X_val   = X[train_df['tanggal'] >  split_date]

y_train = y[train_df['tanggal'] <= split_date]
y_val   = y[train_df['tanggal'] >  split_date]

In [50]:
# baseline model 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'
)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)

print("Macro F1:", f1_score(y_val, y_pred, average='macro'))

Macro F1: 0.31494953694080696


In [51]:
# confusion matrix
from sklearn.metrics import confusion_matrix
import pandas as pd

cm = confusion_matrix(y_val, y_pred)
pd.DataFrame(cm, index=le.classes_, columns=le.classes_)

Unnamed: 0,BAIK,SEDANG,TIDAK SEHAT
BAIK,0,0,0
SEDANG,39,223,27
TIDAK SEHAT,5,54,8
