# H2O AutoML

In [2]:
import h2o
import numpy as np
import pandas as pd
from h2o.automl import H2OAutoML
from sklearn.preprocessing import LabelEncoder

In [3]:
# Connect to an H2O instance on localhost, starting a local server if none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "19" 2022-09-20; OpenJDK Runtime Environment (build 19+36-2238); OpenJDK 64-Bit Server VM (build 19+36-2238, mixed mode, sharing)
  Starting server from /home/shishiriyer/.local/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpgtw3vqgb
  JVM stdout: /tmp/tmpgtw3vqgb/h2o_shishiriyer_started_from_python.out
  JVM stderr: /tmp/tmpgtw3vqgb/h2o_shishiriyer_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,00 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 8 days
H2O_cluster_name:,H2O_from_python_shishiriyer_j7z3ao
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.770 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [4]:
# Read the three source tables: labelled training trips, the public test
# trips, and the taxi-stand metadata table (bound to `coord_lookup`).
train_df, test_df, coord_lookup = (
    pd.read_csv(csv_path)
    for csv_path in (
        'archive/train.csv',
        'archive/test_public.csv',
        'archive/metaData_taxistandsID_name_GPSlocation.csv',
    )
)

In [5]:
def get_lon(station_no):
    """Return the longitude of the taxi stand whose ``ID`` equals ``station_no``.

    The stand is looked up in the module-level ``coord_lookup`` table and the
    first matching row is used. Raises ``IndexError`` when no row matches.
    """
    matching_longitudes = coord_lookup.loc[coord_lookup['ID'] == station_no, 'Longitude']
    return float(matching_longitudes.iloc[0])

def get_lat(station_no):
    """Return the latitude of the taxi stand whose ``ID`` equals ``station_no``.

    The stand is looked up in the module-level ``coord_lookup`` table and the
    first matching row is used. Raises ``IndexError`` when no row matches.
    """
    matching_latitudes = coord_lookup.loc[coord_lookup['ID'] == station_no, 'Latitude']
    return float(matching_latitudes.iloc[0])

In [54]:
# Integer-encode CALL_TYPE. LabelEncoder assigns codes in sorted class order;
# the `encoder` object is reused when the test set is encoded.
encoder = LabelEncoder()
encoder.fit(train_df['CALL_TYPE'])
train_df['CALL_TYPE'] = encoder.transform(train_df['CALL_TYPE'])

def polyline_to_trip_duration(polyline):
    """Return the trip duration encoded by a POLYLINE string.

    The result is ``(number of "[" characters - 2) * 15``, floored at 0, so a
    string with fewer than three "[" characters yields 0.
    """
    interval_count = polyline.count("[") - 2
    if interval_count < 0:
        interval_count = 0
    return interval_count * 15

# Turn the epoch-second TIMESTAMP into datetimes, then derive calendar features.
train_df['TIMESTAMP'] = pd.to_datetime(train_df['TIMESTAMP'], unit='s')
train_df['HOUR'] = train_df['TIMESTAMP'].dt.hour
# dayofweek is 0-based (Monday == 0); shift it so the feature runs 1..7.
train_df['DAY_OF_WEEK'] = 1 + train_df['TIMESTAMP'].dt.dayofweek
train_df['WEEK_OF_YEAR'] = train_df['TIMESTAMP'].dt.isocalendar()['week'].astype(int)

# Regression target: trip duration derived from the POLYLINE string.
train_df['TIME'] = train_df['POLYLINE'].apply(polyline_to_trip_duration)

# NOTE: rows with TIME == 0 are not filtered out here.

# Partition on CALL_TYPE code 1 (expected to correspond to 'B', i.e. an origin
# stand is present). Code-1 rows that lack ORIGIN_STAND are dropped.
has_origin = train_df['CALL_TYPE'] == 1
train_df_nan = train_df.loc[~has_origin]
train_df_origin = train_df.loc[has_origin].dropna(subset=['ORIGIN_STAND'])

# Time-only features for the trips outside the code-1 partition.
features_nan = train_df_nan[['HOUR', 'DAY_OF_WEEK', 'WEEK_OF_YEAR', 'TIME']].copy()

# Same features plus the origin stand (exposed as STAND) for the code-1 partition.
features_origin = (
    train_df_origin[['HOUR', 'DAY_OF_WEEK', 'WEEK_OF_YEAR', 'ORIGIN_STAND', 'TIME']]
    .rename(columns={'ORIGIN_STAND': 'STAND'})
)

print(features_nan)

         HOUR  DAY_OF_WEEK  WEEK_OF_YEAR  TIME
0           0            1            27   330
2           0            1            27   960
3           0            1            27   630
4           0            1            27   420
5           0            1            27   375
...       ...          ...           ...   ...
1710656     4            7            26   240
1710659     7            6            26   795
1710663    11            4             1  2895
1710665    23            1            27   465
1710666    23            1            27   435

[865269 rows x 4 columns]


In [55]:
def _clip_to_percentile(frame, column='TIME', percentile=99):
    """Drop the rows of ``frame`` whose ``column`` exceeds a percentile cutoff.

    Args:
        frame: pandas DataFrame to filter.
        column: name of the numeric column the cutoff is computed on.
        percentile: percentile (0-100) of ``column`` used as the cutoff.

    Returns:
        ``(kept_rows, threshold)`` where ``kept_rows`` contains the rows with
        ``column <= threshold`` and ``threshold`` is the computed cutoff.
    """
    threshold = np.percentile(frame[column], percentile)
    return frame[frame[column] <= threshold], threshold


# Remove the top 1% longest trips from each subset. The cutoff is computed
# separately per subset, and the thresholds stay available for inspection.
features_nan, top_percentile_nan = _clip_to_percentile(features_nan)
features_origin, top_percentile_origin = _clip_to_percentile(features_origin)

In [56]:
# Upload both pandas feature tables to the H2O cluster (rebinding the same names).
features_nan, features_origin = (
    h2o.H2OFrame(pandas_frame) for pandas_frame in (features_nan, features_origin)
)

# ratios=[0.2, 0.2] yields three pieces; only the first two are kept, as the
# training frame and the validation frame. The remainder is discarded.
train_nan, test_nan, _ = features_nan.split_frame(seed=1, ratios=[0.2, 0.2])
train_origin, test_origin, _ = features_origin.split_frame(seed=1, ratios=[0.2, 0.2])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [57]:
# AutoML run for the subset without STAND: TIME is the target and every other
# column of train_nan is a predictor. nfolds=0 is passed alongside an explicit
# validation frame (test_nan).
aml_nan = H2OAutoML(nfolds=0, max_models=25, seed=1)
aml_nan.train(y='TIME', training_frame=train_nan, validation_frame=test_nan)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,35.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2023-06-06 01:04:04,0.010 sec,0.0,871.5342658,732.9167741,759571.9764714,867.1822305,729.8872258,752005.0209648
,2023-06-06 01:04:05,0.194 sec,5.0,480.4084015,323.1857697,230792.2322455,476.672179,321.7500235,227216.366237
,2023-06-06 01:04:05,0.389 sec,10.0,462.3968465,325.6209533,213810.8436582,459.6376333,324.974258,211266.7539531
,2023-06-06 01:04:05,0.581 sec,15.0,461.3861279,328.1268418,212877.1590498,458.9762772,327.7281005,210659.2230172
,2023-06-06 01:04:05,0.792 sec,20.0,461.0164992,328.241695,212536.2125292,458.7850375,327.9583906,210483.7106771
,2023-06-06 01:04:05,1.013 sec,25.0,460.675228,328.1296933,212221.665675,458.7703779,328.0984733,210470.2596596
,2023-06-06 01:04:06,1.550 sec,30.0,460.5349835,328.0470198,212092.4710532,458.7727235,328.1199036,210472.4118427
,2023-06-06 01:04:06,1.814 sec,35.0,460.5018557,328.1151198,212061.9590884,458.795169,328.2282508,210493.007089

variable,relative_importance,scaled_importance,percentage
HOUR,1466476032.0,1.0,0.5828124
WEEK_OF_YEAR,560155392.0,0.3819738,0.2226191
DAY_OF_WEEK,489574432.0,0.3338441,0.1945685


In [58]:
# Show the first ten leaderboard rows of the time-only AutoML run.
leaderboard = aml_nan.leaderboard
print(leaderboard.head(rows=10))

model_id                                           rmse     mse      mae     rmsle    mean_residual_deviance
XGBoost_grid_1_AutoML_7_20230606_10344_model_1  458.795  210493  328.228  0.738222                    210493
XGBoost_grid_1_AutoML_7_20230606_10344_model_3  458.824  210519  328.006  0.737747                    210519
XGBoost_3_AutoML_7_20230606_10344               458.923  210610  328.159  0.738404                    210610
GBM_4_AutoML_7_20230606_10344                   458.932  210619  328.2    0.738663                    210619
GBM_3_AutoML_7_20230606_10344                   458.997  210678  328.323  0.738942                    210678
XGBoost_2_AutoML_7_20230606_10344               459.02   210699  328.31   0.738255                    210699
GBM_1_AutoML_7_20230606_10344                   459.025  210704  328.35   0.738727                    210704
XGBoost_grid_1_AutoML_7_20230606_10344_model_2  459.05   210727  328.322  0.73793                     210727
GBM_2_AutoML_7_2023

In [59]:
# AutoML run for the subset that includes STAND: TIME is the target and every
# other column of train_origin is a predictor. nfolds=0 is passed alongside an
# explicit validation frame (test_origin).
aml_origin = H2OAutoML(nfolds=0, max_models=25, seed=1)
aml_origin.train(y='TIME', training_frame=train_origin, validation_frame=test_origin)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,55.0,55.0,384916.0,10.0,10.0,8.727273,1.0,837.0,553.4545

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2023-06-06 01:26:34,0.014 sec,0.0,332.4280892,254.5835176,110508.4344907,332.7260649,254.8786913,110706.6342659
,2023-06-06 01:26:35,0.146 sec,5.0,322.9408072,246.2460521,104290.7649357,324.7866361,247.7934482,105486.3589702
,2023-06-06 01:26:35,0.312 sec,10.0,318.1092173,241.9733669,101193.4741283,321.1919333,244.5317196,103164.2580047
,2023-06-06 01:26:35,0.479 sec,15.0,315.1567617,239.2904632,99323.7844404,319.3618413,242.7427724,101991.9856617
,2023-06-06 01:26:35,0.650 sec,20.0,312.8490017,237.2734434,97874.4978606,318.2182453,241.720362,101262.8516293
,2023-06-06 01:26:35,0.820 sec,25.0,311.1367172,235.771105,96806.056794,317.5397987,241.0402925,100831.5237307
,2023-06-06 01:26:35,1.017 sec,30.0,309.9205272,234.7193517,96050.7331889,317.1599738,240.6614582,100590.4489691
,2023-06-06 01:26:36,1.209 sec,35.0,308.7051515,233.6074999,95298.870565,316.8150178,240.2508637,100371.7555122
,2023-06-06 01:26:36,1.396 sec,40.0,307.9676122,233.0260555,94844.0501358,316.748532,240.1996327,100329.6324989
,2023-06-06 01:26:36,1.535 sec,45.0,307.3667741,232.5056351,94474.3338421,316.6823821,240.1135779,100287.7311463

variable,relative_importance,scaled_importance,percentage
HOUR,4483958272.0,1.0,0.3604948
STAND,3722455808.0,0.8301718,0.2992726
WEEK_OF_YEAR,2687619584.0,0.5993855,0.2160754
DAY_OF_WEEK,1544309248.0,0.3444076,0.1241572


In [60]:
# Encode CALL_TYPE with the mapping already fitted on the training data.
# fit_transform would refit the encoder on the test labels, so a class missing
# from the test file would shift the integer codes and break the `== 1` check.
test_df['CALL_TYPE'] = encoder.transform(test_df['CALL_TYPE'])

# Normalize timestamp: same derived calendar features as the training set.
test_df['TIMESTAMP'] = pd.to_datetime(test_df['TIMESTAMP'], unit='s')
test_df['HOUR'] = test_df['TIMESTAMP'].dt.hour
test_df['DAY_OF_WEEK'] = test_df['TIMESTAMP'].dt.dayofweek + 1  # Monday == 1 ... Sunday == 7
test_df['WEEK_OF_YEAR'] = test_df['TIMESTAMP'].dt.isocalendar().week.astype(int)

# Split the test data on CALL_TYPE code 1 (expected to correspond to 'B', i.e.
# an origin stand is present), mirroring the split applied to the training data.
has_origin = test_df['CALL_TYPE'] == 1
test_df_origin = test_df[has_origin]
test_df_nan = test_df[~has_origin]
# Code-1 rows without ORIGIN_STAND are dropped, so they get no row in features_origin.
test_df_origin = test_df_origin.dropna(subset=['ORIGIN_STAND'])

# Time-only predictors for the rows outside the code-1 partition.
features_nan = pd.DataFrame({
    'HOUR': test_df_nan['HOUR'],
    'DAY_OF_WEEK': test_df_nan['DAY_OF_WEEK'],
    'WEEK_OF_YEAR': test_df_nan['WEEK_OF_YEAR']
})

# Same predictors plus the origin stand for the code-1 partition.
features_origin = pd.DataFrame({
    'HOUR': test_df_origin['HOUR'],
    'DAY_OF_WEEK': test_df_origin['DAY_OF_WEEK'],
    'WEEK_OF_YEAR': test_df_origin['WEEK_OF_YEAR'],
    'STAND': test_df_origin['ORIGIN_STAND']
})

# Upload to the H2O cluster (rebinding the same names) for scoring.
features_nan = h2o.H2OFrame(features_nan)
features_origin = h2o.H2OFrame(features_origin)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


## WIP

In [61]:
# Score each test subset with the AutoML run trained on the matching subset.
pred_nan = h2o.as_list(aml_nan.predict(features_nan))
pred_origin = h2o.as_list(aml_origin.predict(features_origin))

print(pred_nan['predict'].mean(), pred_origin['predict'].mean())

# Key every prediction by the test_df row it was computed for, then put the
# predictions back in test_df order. This replaces positional counters and a
# hard-coded row count, which misalign (or raise IndexError) whenever a
# CALL_TYPE == 1 row was dropped for lacking ORIGIN_STAND.
travel_time = pd.concat([
    pd.Series(pred_nan['predict'].to_numpy(), index=test_df_nan.index),
    pd.Series(pred_origin['predict'].to_numpy(), index=test_df_origin.index),
]).reindex(test_df.index)

# Rows that were dropped before scoring have no prediction yet. They have no
# stand to feed the stand-aware model, so score them with the time-only model.
unscored = travel_time.isna()
if unscored.any():
    fallback_frame = h2o.H2OFrame(
        test_df.loc[unscored, ['HOUR', 'DAY_OF_WEEK', 'WEEK_OF_YEAR']]
    )
    fallback_pred = h2o.as_list(aml_nan.predict(fallback_frame))
    travel_time.loc[unscored] = fallback_pred['predict'].to_numpy()

# One submission row per test row; the trip id is taken from the first column
# of test_df and written under the TRIP_ID header.
submission = pd.DataFrame({
    'TRIP_ID': test_df.iloc[:, 0].to_numpy(),
    'TRAVEL_TIME': travel_time.to_numpy(),
})
submission.to_csv("submission.csv", index=False)

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
775.1161009502895 707.8473828314478


In [None]:
# Stop the H2O cluster this notebook is connected to; all frames and models
# held on the cluster are lost. The module-level h2o.shutdown() is a deprecated
# alias for this call.
h2o.cluster().shutdown()