In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

import matplotlib.pyplot as plt
import seaborn as sns

# Import Datasets

In [2]:
# Load the daily per-city air-quality records.
df = pd.read_csv("datasets/City_day.csv")

# Keep only the rows whose target label (Air_quality) is present, then
# renumber the remaining rows from 0. drop=True discards the old index
# directly instead of materialising it as an "index" column that has to be
# dropped again afterwards (and it cannot clash with an existing "index"
# column in the CSV).
df = df[df["Air_quality"].notna()].reset_index(drop=True)

# Display the dataframe
df

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,Air_quality
0,Ahmedabad,1/29/2015,83.13,,6.93,28.71,33.72,,6.93,49.52,59.76,0.02,0.00,3.14,209.0,Poor
1,Ahmedabad,1/30/2015,79.84,,13.85,28.68,41.08,,13.85,48.49,97.07,0.04,0.00,4.81,328.0,Very Poor
2,Ahmedabad,1/31/2015,94.52,,24.39,32.66,52.61,,24.39,67.39,111.33,0.24,0.01,7.67,514.0,Severe
3,Ahmedabad,2/1/2015,135.99,,43.48,42.08,84.57,,43.48,75.23,102.70,0.40,0.04,25.87,782.0,Severe
4,Ahmedabad,2/2/2015,178.33,,54.56,35.31,72.80,,54.56,55.04,107.38,0.46,0.06,35.61,914.0,Severe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21932,Thiruvananthapuram,4/27/2020,14.13,34.27,5.60,8.98,12.48,5.65,0.49,5.50,42.41,,,,63.0,Satisfactory
21933,Thiruvananthapuram,4/28/2020,23.84,44.32,6.27,10.01,13.80,5.73,0.44,5.62,44.55,,,,60.0,Satisfactory
21934,Thiruvananthapuram,4/29/2020,18.54,34.48,6.17,9.67,13.35,5.93,0.51,5.52,38.97,,,,57.0,Satisfactory
21935,Thiruvananthapuram,4/30/2020,20.57,48.19,6.28,9.52,13.56,5.84,0.46,5.32,39.23,,,,57.0,Satisfactory


Karena siklus angin, tahun dan bulan biasanya juga dapat memengaruhi kualitas udara. Oleh karena itu, kita lakukan ekstraksi fitur pada kolom Date dengan mengambil data tahun dan data bulan.

In [3]:
# Parse the Date strings a single time; the year and month are both read from
# this one DatetimeIndex instead of re-parsing the column for each feature.
date_index = pd.DatetimeIndex(df["Date"])

# Build the new frame in one expression: add the extracted year and month
# columns, then drop Date because it is no longer needed.
df = df.assign(
    year=date_index.year,
    month=date_index.month,
).drop(columns="Date")

# Display the dataframe
df

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,Air_quality,year,month
0,Ahmedabad,83.13,,6.93,28.71,33.72,,6.93,49.52,59.76,0.02,0.00,3.14,209.0,Poor,2015,1
1,Ahmedabad,79.84,,13.85,28.68,41.08,,13.85,48.49,97.07,0.04,0.00,4.81,328.0,Very Poor,2015,1
2,Ahmedabad,94.52,,24.39,32.66,52.61,,24.39,67.39,111.33,0.24,0.01,7.67,514.0,Severe,2015,1
3,Ahmedabad,135.99,,43.48,42.08,84.57,,43.48,75.23,102.70,0.40,0.04,25.87,782.0,Severe,2015,2
4,Ahmedabad,178.33,,54.56,35.31,72.80,,54.56,55.04,107.38,0.46,0.06,35.61,914.0,Severe,2015,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21932,Thiruvananthapuram,14.13,34.27,5.60,8.98,12.48,5.65,0.49,5.50,42.41,,,,63.0,Satisfactory,2020,4
21933,Thiruvananthapuram,23.84,44.32,6.27,10.01,13.80,5.73,0.44,5.62,44.55,,,,60.0,Satisfactory,2020,4
21934,Thiruvananthapuram,18.54,34.48,6.17,9.67,13.35,5.93,0.51,5.52,38.97,,,,57.0,Satisfactory,2020,4
21935,Thiruvananthapuram,20.57,48.19,6.28,9.52,13.56,5.84,0.46,5.32,39.23,,,,57.0,Satisfactory,2020,4


# Dataset Splitting

In [4]:
# Separate the target label from the predictor columns.
y = df["Air_quality"]
X = df.drop(columns="Air_quality")

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17549, 16), (4388, 16), (17549,), (4388,))

# Preprocessor

In [6]:
# Peek at the first four training rows to see which columns feed the preprocessor.
X_train.head(n=4)

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,year,month
17277,Lucknow,163.42,,23.57,60.3,47.29,30.05,0.49,13.52,38.72,0.56,0.85,,319.0,2018,4
12640,Hyderabad,9.75,37.86,7.21,11.87,6.08,3.36,0.13,6.13,31.55,0.03,0.34,0.01,66.0,2017,7
12331,Hyderabad,16.93,,7.2,12.04,19.24,,8.34,5.98,8.35,0.21,1.84,0.19,247.0,2016,7
21794,Thiruvananthapuram,34.11,59.92,10.47,24.62,21.56,11.78,1.4,9.49,40.34,,,,91.0,2019,12


In [5]:
from xgboost import XGBClassifier

In [8]:
# Columns routed through the numeric branch: the pollutant measurements plus AQI.
# NOTE(review): in the rows displayed earlier, Air_quality looks like a bucketed
# version of AQI, so keeping AQI as a feature may leak the target — confirm.
numeric_features = [
    "PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO",
    "SO2", "O3", "Benzene", "Toluene", "Xylene", "AQI",
]

# Columns routed through the categorical branch: the city and the calendar
# parts extracted from Date.
categoric_features = ["City", "year", "month"]

# jcopml helper pipelines, configured exactly as passed here.
numeric_branch = num_pipe(poly=2, scaling="robust", transform="yeo-johnson")
categoric_branch = cat_pipe(encoder="onehot")

preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_branch, numeric_features),
        ("categoric", categoric_branch, categoric_features),
    ]
)

# NOTE(review): y holds string class labels (e.g. "Poor"); recent xgboost
# releases may require integer-encoded classes — verify before fitting.
classifier = XGBClassifier(n_jobs=-1, random_state=42)

# Full model: preprocessing step followed by the classifier.
pipeline = Pipeline(steps=[("prep", preprocessor), ("algo", classifier)])