# Air Quality Dataset Preparation

### Imports

In [8]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.preprocessing import StandardScaler

### Fetch Dataset

In [3]:
# Identifier passed to fetch_ucirepo to select the dataset this notebook prepares.
AIR_QUALITY_DATASET_ID = 360

# Fetch the dataset by id and keep a handle on its feature table.
air_quality = fetch_ucirepo(id=AIR_QUALITY_DATASET_ID)
X = air_quality.data.features

# Preview the first rows to see which columns were loaded.
X.head(n=5)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


### Data Processing

In [5]:
# Remove the timestamp columns by name, always starting from the fetched
# feature table. Slicing by position (`X.iloc[:, 1:]`) strips one more column
# each time the cell is executed, so the result depended on how many times it
# had been run; on a fresh top-to-bottom run only `Date` was removed and
# `Time` stayed in the features. This form yields the same frame on every run.
# NOTE(review): the UCI description of this dataset says missing readings are
# tagged with -200; nothing here masks them — confirm whether they should be
# handled before the features are scaled.
X = air_quality.data.features.drop(columns=["Date", "Time"])
X.head()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888


In [6]:
# Set the `T` column aside as `Y` (presumably the prediction target — confirm
# against the modelling code that consumes it).
Y = X.loc[:, "T"]

# Show the first few values of the extracted column.
Y.head(n=5)

0    13.6
1    13.3
2    11.9
3    11.0
4    11.2
Name: T, dtype: float64

In [7]:
# Keep every column except `T`, which was copied into `Y`, so it is not also
# present among the features. `errors="ignore"` makes a repeat run a no-op,
# just like filtering on the column names.
X = X.drop(columns=["T"], errors="ignore")
X.head(n=5)

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH
0,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,48.9,0.7578
1,2.0,1292,112,9.4,955,103,1174,92,1559,972,47.7,0.7255
2,2.2,1402,88,9.0,939,131,1140,114,1555,1074,54.0,0.7502
3,2.2,1376,80,9.2,948,172,1092,122,1584,1203,60.0,0.7867
4,1.6,1272,51,6.5,836,131,1205,116,1490,1110,59.6,0.7888


### Convert Non-Numeric Columns to Numeric using one-hot encoding

In [9]:
# One-hot encode any object/string/category columns still present in X;
# numeric columns are passed through unchanged by get_dummies.
X = pd.get_dummies(data=X)

### Normalize Data

In [15]:
# Standardise every feature column with a default StandardScaler fitted on X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The scaled result carries no column labels (a bare DataFrame around it shows
# 0..N-1), so re-attach X's column names and index for the preview only.
# X_scaled itself is left exactly as fit_transform returned it.
pd.DataFrame(X_scaled, columns=X.columns, index=X.index).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.474,0.942983,2.211236,0.242504,0.442297,-0.010166,0.810649,0.432124,0.643258,0.641101,0.183831,0.194881
1,0.466273,0.736807,1.939383,0.182085,0.176459,-0.254902,1.177136,0.266684,0.358574,-0.006723,0.1604,0.194052
2,0.468849,1.070327,1.767687,0.172418,0.129719,-0.146131,1.071538,0.440002,0.350012,0.216513,0.283414,0.194686
3,0.468849,0.991495,1.710454,0.177252,0.15601,0.013142,0.922459,0.503027,0.412086,0.498842,0.400571,0.195622
4,0.461122,0.676167,1.502988,0.112,-0.171174,-0.146131,1.273416,0.455759,0.210881,0.295303,0.392761,0.195676
