# H2O Auto ML

In [2]:
import h2o
from h2o.automl import H2OAutoML
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [3]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "19" 2022-09-20; OpenJDK Runtime Environment (build 19+36-2238); OpenJDK 64-Bit Server VM (build 19+36-2238, mixed mode, sharing)
  Starting server from /home/shishiriyer/.local/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpnbwp5cum
  JVM stdout: /tmp/tmpnbwp5cum/h2o_shishiriyer_started_from_python.out
  JVM stderr: /tmp/tmpnbwp5cum/h2o_shishiriyer_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 7 days
H2O_cluster_name:,H2O_from_python_shishiriyer_xxisoh
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.770 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [4]:
# Load the dataset
train_df = pd.read_csv('archive/train.csv')
test_df = pd.read_csv('archive/test_public.csv')
coord_lookup = pd.read_csv('archive/metaData_taxistandsID_name_GPSlocation.csv')

In [5]:
def get_lon(station_no):
    return float(coord_lookup.loc[coord_lookup['ID'] == station_no]['Longitude'].iloc[0])

def get_lat(station_no):
    return float(coord_lookup.loc[coord_lookup['ID'] == station_no]['Latitude'].iloc[0])

In [30]:
encoder = LabelEncoder()
train_df['CALL_TYPE'] = encoder.fit_transform(train_df['CALL_TYPE'])

# Normalize timestamp
train_df['TIMESTAMP'] = pd.to_datetime(train_df['TIMESTAMP'], unit='s')
train_df['MINUTE'] = train_df['TIMESTAMP'].dt.minute
train_df['HOUR'] = train_df['TIMESTAMP'].dt.hour
train_df['DAY'] = train_df['TIMESTAMP'].dt.dayofweek
train_df['MONTH'] = train_df['TIMESTAMP'].dt.month

# Split the training data based on whether CALL_TYPE == 'B' (i.e. there is an origin stand present)
has_origin = train_df['CALL_TYPE'] == 1
train_df_origin = train_df[has_origin]
train_df_nan = train_df[~has_origin]
train_df_origin = train_df_origin.dropna(subset=['ORIGIN_STAND'])

features_nan = pd.DataFrame({ 
    'MINUTE': train_df_nan['MINUTE'], 
    'HOUR': train_df_nan['HOUR'], 
    'DAY': train_df_nan['DAY'], 
    'MONTH': train_df_nan['MONTH'], 
    'TIME': train_df_nan['POLYLINE'].str.count(',') * 15 
})


features_origin = pd.DataFrame({
    'HOUR': train_df_origin['HOUR'], 
    'DAY': train_df_origin['DAY'], 
    'MONTH': train_df_origin['MONTH'], 
    'STAND': train_df_origin['ORIGIN_STAND'],
    'TIME': train_df_origin['POLYLINE'].str.count(',') * 15
})

print(features_nan)

         MINUTE  HOUR  DAY  MONTH  TIME
0             0     0    0      7   675
2             2     0    0      7  1935
3             0     0    0      7  1275
4             4     0    0      7   855
5             2     0    0      7   765
...         ...   ...  ...    ...   ...
1710663       0    11    3      1  5805
1710664      16    15    2      1     0
1710665      37    23    0      6   945
1710666      36    23    0      6   885
1710667      41    10    4      1     0

[892789 rows x 5 columns]


In [31]:
features_nan = h2o.H2OFrame(features_nan)
features_origin = h2o.H2OFrame(features_origin)
train_nan, test_nan = features_nan.split_frame(ratios=[0.8], seed = 1)
train_origin, test_origin = features_origin.split_frame(ratios=[0.8], seed=1)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [32]:
aml_nan = H2OAutoML(max_models=25, seed=1, nfolds=0)
aml_nan.train(training_frame=train_nan, y='TIME', validation_frame=test_nan)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,35.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2023-06-05 01:00:11,21.825 sec,0.0,2253.4684694,1541.5632562,5078120.1424972,2273.7888117,1539.7125048,5170115.5603483
,2023-06-05 01:00:12,22.975 sec,5.0,1648.5633876,764.7214839,2717761.2430582,1677.8720381,765.4679252,2815254.5761633
,2023-06-05 01:00:14,24.135 sec,10.0,1626.9911738,787.6667391,2647100.2796579,1657.3281533,788.5581109,2746736.6077854
,2023-06-05 01:00:15,25.804 sec,15.0,1625.621335,795.7782544,2642644.7247361,1656.6631748,796.9180599,2744532.8746385
,2023-06-05 01:00:17,27.347 sec,20.0,1624.9329711,797.0376492,2640407.1605517,1656.5844616,798.379748,2744272.0783545
,2023-06-05 01:00:18,28.982 sec,25.0,1624.4608414,797.0801581,2638873.0252674,1656.5068267,798.646437,2744014.8670161
,2023-06-05 01:00:20,30.823 sec,30.0,1623.9036317,796.9456323,2637063.0049549,1656.4967976,798.725362,2743981.640611
,2023-06-05 01:00:22,32.902 sec,35.0,1623.4588966,796.8260421,2635618.788807,1656.5267253,798.8788499,2744080.7916419

variable,relative_importance,scaled_importance,percentage
HOUR,65450803200.0,1.0,0.7069308
DAY,10781882368.0,0.1647326,0.1164546
MONTH,10663894016.0,0.1629299,0.1151802
MINUTE,5687882240.0,0.0869032,0.0614345


In [28]:
leaderboard = aml_nan.leaderboard
print(leaderboard.head())

model_id                                           rmse          mse      mae     rmsle    mean_residual_deviance
XGBoost_grid_1_AutoML_7_20230605_03457_model_3  1323.54  1.75176e+06  682.064  0.990681               1.75176e+06
GBM_3_AutoML_7_20230605_03457                   1323.72  1.75224e+06  682.091  0.99124                1.75224e+06
GBM_4_AutoML_7_20230605_03457                   1323.82  1.7525e+06   682.251  0.99122                1.7525e+06
GBM_1_AutoML_7_20230605_03457                   1323.95  1.75285e+06  682.545  0.990693               1.75285e+06
GBM_2_AutoML_7_20230605_03457                   1324.01  1.75301e+06  682.408  0.991608               1.75301e+06
XGBoost_3_AutoML_7_20230605_03457               1324.04  1.75309e+06  682.521  0.991391               1.75309e+06
XGBoost_grid_1_AutoML_7_20230605_03457_model_1  1324.08  1.7532e+06   682.744  0.991238               1.7532e+06
XGBoost_grid_1_AutoML_7_20230605_03457_model_5  1324.11  1.75326e+06  682.739  0.991619   

In [33]:
aml_origin = H2OAutoML(max_models=25, seed=1, nfolds=0)
aml_origin.train(training_frame=train_origin, y='TIME', validation_frame=test_origin)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,60.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2023-06-05 01:19:42,37.734 sec,0.0,1655.1045757,1342.4738736,2739371.1565223,1634.867761,1341.1984673,2672792.5957994
,2023-06-05 01:19:42,38.484 sec,5.0,975.417688,541.1375129,951439.6659682,942.5200223,539.2881226,888343.9923907
,2023-06-05 01:19:43,39.196 sec,10.0,945.1375966,540.2075834,893285.0765668,911.6603229,539.2423741,831124.5444261
,2023-06-05 01:19:44,40.061 sec,15.0,942.5540185,543.4088273,888408.0777005,909.1593796,542.5582634,826570.7774294
,2023-06-05 01:19:45,41.160 sec,20.0,941.0535942,542.9005627,885581.8670978,907.886078,542.1547642,824257.1305597
,2023-06-05 01:19:46,42.475 sec,25.0,939.9606802,542.1269209,883526.0803338,907.176366,541.4930616,822968.959068
,2023-06-05 01:19:48,43.887 sec,30.0,939.1980339,541.4886672,882092.9467925,906.7129376,540.9506647,822128.3511304
,2023-06-05 01:19:49,45.425 sec,35.0,938.5832321,541.0553733,880938.4836566,906.479031,540.6597961,821704.2336236
,2023-06-05 01:19:51,47.241 sec,40.0,938.0290089,540.5909699,879898.4215439,906.2130183,540.3493489,821222.0345393
,2023-06-05 01:19:53,49.212 sec,45.0,937.6076622,540.1977815,879108.1282075,906.0084927,540.1157445,820851.3888424

variable,relative_importance,scaled_importance,percentage
HOUR,35034750976.0,1.0,0.4591413
STAND,25633097728.0,0.7316478,0.3359297
DAY,7852457984.0,0.2241334,0.1029089
MONTH,7784632832.0,0.2221975,0.10202


In [37]:
test_df['CALL_TYPE'] = encoder.fit_transform(test_df['CALL_TYPE'])

# Normalize timestamp
test_df['TIMESTAMP'] = pd.to_datetime(test_df['TIMESTAMP'], unit='s')
test_df['MINUTE'] = test_df['TIMESTAMP'].dt.minute
test_df['HOUR'] = test_df['TIMESTAMP'].dt.hour
test_df['DAY'] = test_df['TIMESTAMP'].dt.dayofweek
test_df['MONTH'] = test_df['TIMESTAMP'].dt.month

# Split the training data based on whether CALL_TYPE == 'B' (i.e. there is an origin stand present)
has_origin = test_df['CALL_TYPE'] == 1
test_df_origin = test_df[has_origin]
test_df_nan = test_df[~has_origin]
test_df_origin = test_df_origin.dropna(subset=['ORIGIN_STAND'])

features_nan = pd.DataFrame({ 
    'MINUTE': test_df_nan['MINUTE'], 
    'HOUR': test_df_nan['HOUR'], 
    'DAY': test_df_nan['DAY'], 
    'MONTH': test_df_nan['MONTH']
})


features_origin = pd.DataFrame({
    'HOUR': test_df_origin['HOUR'], 
    'DAY': test_df_origin['DAY'], 
    'MONTH': test_df_origin['MONTH'], 
    'STAND': test_df_origin['ORIGIN_STAND']
})

print(features_origin)

     HOUR  DAY  MONTH  STAND
0      17    3      8   15.0
1      17    3      8   57.0
2      17    3      8   15.0
3      17    3      8   53.0
4      17    3      8   18.0
..    ...  ...    ...    ...
290    14    6     12   16.0
295    14    6     12   42.0
297    14    6     12   53.0
310    14    6     12   22.0
316    14    6     12   53.0

[123 rows x 4 columns]


## WIP

In [38]:
features_nan = h2o.H2OFrame(features_nan)
features_origin = h2o.H2OFrame(features_origin)

import csv

fields = ['TRIP_ID', 'TRAVEL_TIME']
rows = []

i_n = 0
i_o = 0

for i in range(320):
    trip_id = test_df.iloc[i, 0]
    row = test_df.iloc[i]
    # print(i_n, i_o)
    if row['CALL_TYPE'] == 1:
        # print(features_origin.iloc[i_o].to_numpy().reshape(1, -1))
        rows.append([trip_id, aml_origin.predict(features_origin.iloc[i_o].to_numpy().reshape(1, -1))[0]])
        i_o += 1
    else:
        # print(features_nan.iloc[i_n].to_numpy().reshape(1, -1))
        rows.append([trip_id, aml_nan.predict(features_nan.iloc[i_n].to_numpy().reshape(1, -1))[0]])
        i_n += 1
    
with open("submission.csv", 'w') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


AttributeError: 'H2OFrame' object has no attribute 'iloc'

In [None]:
h2o.shutdown()