In [1]:
!pip install sweetviz
!pip install h2o

Collecting sweetviz
  Downloading sweetviz-2.2.1-py3-none-any.whl (15.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sweetviz
Successfully installed sweetviz-2.2.1
Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=903e890e33b6c698342ec3230818ed87d5329926ff1eea5f7695ec686d8b6089
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


**Data Preprocessing and Cleaning**

In [4]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/CMPE-255/Assignment 5/Data Set/Task-2/AirQualityUCI.csv', delimiter=';', decimal=',')

# Display the first few rows of the dataset
data.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,


In [5]:
# Remove unnecessary columns
data_cleaned = data.drop(columns=['Unnamed: 15', 'Unnamed: 16'])

# Check for missing values
missing_values = data_cleaned.isnull().sum()

missing_values


Date             114
Time             114
CO(GT)           114
PT08.S1(CO)      114
NMHC(GT)         114
C6H6(GT)         114
PT08.S2(NMHC)    114
NOx(GT)          114
PT08.S3(NOx)     114
NO2(GT)          114
PT08.S4(NO2)     114
PT08.S5(O3)      114
T                114
RH               114
AH               114
dtype: int64

In [6]:
# Drop rows with missing Date and Time values
data_cleaned = data_cleaned.dropna(subset=['Date', 'Time'])

# Convert Date and Time columns to a single datetime format
data_cleaned['Datetime'] = pd.to_datetime(data_cleaned['Date'] + ' ' + data_cleaned['Time'], format='%d/%m/%Y %H.%M.%S')

# Drop the original Date and Time columns
data_cleaned = data_cleaned.drop(columns=['Date', 'Time'])

# Set Datetime as the index
data_cleaned.set_index('Datetime', inplace=True)

# Display the first few rows of the cleaned dataset
data_cleaned.head()


Unnamed: 0_level_0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-03-10 18:00:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888


In [7]:
# Interpolate missing values using linear interpolation
data_interpolated = data_cleaned.interpolate(method='linear')

# Verify if there are any remaining missing values
remaining_missing = data_interpolated.isnull().sum()

remaining_missing

CO(GT)           0
PT08.S1(CO)      0
NMHC(GT)         0
C6H6(GT)         0
PT08.S2(NMHC)    0
NOx(GT)          0
PT08.S3(NOx)     0
NO2(GT)          0
PT08.S4(NO2)     0
PT08.S5(O3)      0
T                0
RH               0
AH               0
dtype: int64

**Auto EDA using the sweetviz**

In [8]:
import sweetviz as sv

# Create a report
report = sv.analyze(data_interpolated)

# Show the report
report.show_html("/content/drive/MyDrive/CMPE-255/Assignment 5/Data Set/Task-2/AirQualityReport.html")

                                             |          | [  0%]   00:00 -> (? left)

Report /content/drive/MyDrive/CMPE-255/Assignment 5/Data Set/Task-2/AirQualityReport.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


**Clustering and Anomaly Detection**

In [9]:
from sklearn.ensemble import IsolationForest

# Initialize the model
clf = IsolationForest(contamination=0.05)  # setting contamination to 5% to detect anomalies
anomalies = clf.fit_predict(data_interpolated)

# Convert -1 labels to 0 for anomalies and 1 to 0 for normal data
data_interpolated['Anomaly'] = [1 if i == -1 else 0 for i in anomalies]




**Feature Processing and Selection**

In [10]:
# Assuming we're predicting 'CO(GT)', generate lag features for it
lags = 3
for i in range(1, lags + 1):
    data_interpolated[f'CO(GT)_lag_{i}'] = data_interpolated['CO(GT)'].shift(i)


In [11]:
window_size = 3
data_interpolated['CO(GT)_rolling_mean'] = data_interpolated['CO(GT)'].rolling(window=window_size).mean()
data_interpolated['CO(GT)_rolling_std'] = data_interpolated['CO(GT)'].rolling(window=window_size).std()


In [12]:
from sklearn.ensemble import RandomForestRegressor

# Drop NA values generated due to lags and rolling window operations
data_for_feature_selection = data_interpolated.dropna()

# Initialize a random forest model
rf = RandomForestRegressor(n_estimators=100)
rf.fit(data_for_feature_selection.drop('CO(GT)', axis=1), data_for_feature_selection['CO(GT)'])

# Get feature importance
feature_importances = pd.Series(rf.feature_importances_, index=data_for_feature_selection.drop('CO(GT)', axis=1).columns)
top_features = feature_importances.nlargest(10).index.tolist()


In [13]:
!pip install tpot

Collecting tpot
  Downloading TPOT-0.12.1-py3-none-any.whl (87 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/87.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m81.9/87.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected 

In [14]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.20.1" 2023-08-24; OpenJDK Runtime Environment (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.20.1+1-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.10/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpesxctzau
  JVM stdout: /tmp/tmpesxctzau/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpesxctzau/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,05 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,13 days
H2O_cluster_name:,H2O_from_python_unknownUser_no1qdi
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.170 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Loading Data into H2O

In [15]:
h2o_df = h2o.H2OFrame(data_interpolated)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


Train-Test Split

In [16]:
train, test = h2o_df.split_frame(ratios=[.8], seed=1234)


Run H2O's AutoML

In [17]:
from h2o.automl import H2OAutoML

# Assuming we're predicting 'CO(GT)'
y = "CO(GT)"
x = train.columns
x.remove(y)

# Run AutoML
aml = H2OAutoML(max_models=20, seed=1, max_runtime_secs=600)
aml.train(x=x, y=y, training_frame=train)


AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,91.0,91.0,20821.0,4.0,4.0,4.0,7.0,16.0,13.593407

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.6098349,0.1409365,0.6080548,0.5759627,0.6657553,0.4060912,0.7933105
mean_residual_deviance,17.198385,13.943414,24.871641,7.8811603,14.854485,1.6062427,36.778397
mse,17.198385,13.943414,24.871641,7.8811603,14.854485,1.6062427,36.778397
r2,0.9971413,0.0022982,0.995732,0.9986797,0.9975614,0.9997341,0.993999
residual_deviance,17.198385,13.943414,24.871641,7.8811603,14.854485,1.6062427,36.778397
rmse,3.796107,1.8668008,4.987148,2.8073404,3.8541515,1.2673763,6.0645194
rmsle,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2023-10-30 08:08:21,40.751 sec,0.0,77.5394532,59.4631982,6012.3667981
,2023-10-30 08:08:21,40.812 sec,5.0,46.2911822,35.4645154,2142.8735459
,2023-10-30 08:08:21,40.871 sec,10.0,28.2068769,21.4648357,795.6279053
,2023-10-30 08:08:21,40.930 sec,15.0,17.1394098,12.8980504,293.7593669
,2023-10-30 08:08:21,40.989 sec,20.0,10.5200939,7.7330537,110.6723758
,2023-10-30 08:08:21,41.053 sec,25.0,6.6787806,4.6616725,44.6061108
,2023-10-30 08:08:21,41.114 sec,30.0,4.5879823,2.8616661,21.0495813
,2023-10-30 08:08:21,41.179 sec,35.0,3.3811488,1.7770372,11.4321675
,2023-10-30 08:08:21,41.248 sec,40.0,2.7835668,1.1654532,7.7482441
,2023-10-30 08:08:21,41.315 sec,45.0,2.4326627,0.8248736,5.9178478

variable,relative_importance,scaled_importance,percentage
CO(GT)_rolling_mean,149969136.0,1.0,0.6430686
CO(GT)_lag_1,50625980.0,0.337576,0.2170845
CO(GT)_lag_2,24265946.0,0.1618063,0.1040525
CO(GT)_rolling_std,6485030.5,0.0432424,0.0278079
NOx(GT),820145.125,0.0054688,0.0035168
CO(GT)_lag_3,737255.625,0.004916,0.0031614
NO2(GT),138311.484375,0.0009223,0.0005931
PT08.S1(CO),51330.7460938,0.0003423,0.0002201
PT08.S4(NO2),27163.6269531,0.0001811,0.0001165
PT08.S2(NMHC),25667.3984375,0.0001712,0.0001101


**Leaderboard and Model Selection**

In [18]:
lb = aml.leaderboard
print(lb.head(rows=lb.nrows))

model_id                                            rmse       mse       mae    rmsle    mean_residual_deviance
GBM_grid_1_AutoML_1_20231030_80132_model_2       4.14087   17.1468  0.61329       nan                   17.1468
XGBoost_grid_1_AutoML_1_20231030_80132_model_3   4.33813   18.8193  0.439404      nan                   18.8193
GBM_5_AutoML_1_20231030_80132                    4.47084   19.9884  0.785095      nan                   19.9884
GBM_1_AutoML_1_20231030_80132                    4.9023    24.0325  1.46729       nan                   24.0325
GBM_3_AutoML_1_20231030_80132                    5.41115   29.2805  0.974317      nan                   29.2805
GBM_4_AutoML_1_20231030_80132                    5.41723   29.3464  1.02921       nan                   29.3464
XGBoost_3_AutoML_1_20231030_80132                5.4173    29.3471  0.971359      nan                   29.3471
XGBoost_grid_1_AutoML_1_20231030_80132_model_2   5.61604   31.5399  0.909998      nan                   

In [19]:
preds = aml.leader.predict(test)


gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
