In [1]:
# Notebook setup: garbage collection, warning policy, core libraries,
# display options and the global random seed.
import gc
gc.enable()

# Silence all warnings; installed before numpy/pandas are imported so that
# import-time warnings are hidden as well.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# Use the fully qualified option key. The bare 'precision' pattern is ambiguous
# in newer pandas releases (it also matches 'styler.format.precision') and
# raises OptionError there; 'display.precision' works on old and new versions.
pd.set_option('display.precision', 4)

# Seed passed to H2O AutoML further down.
SEED = 2311

# Data

In [2]:
# Locations of the raw competition CSVs; both files sit in the same GitHub
# directory, so the shared prefix is written once.
train_url, test_url = [
    'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/'
    'analytics_vidhya/jobathon_feb22/data/raw/' + filename
    for filename in ('train.csv', 'test.csv')
]

In [3]:
# Download and parse both CSVs straight from their URLs into pandas frames.
train = pd.read_csv(filepath_or_buffer=train_url)
test = pd.read_csv(filepath_or_buffer=test_url)

In [4]:
# Summarise the training frame: column dtypes, non-null counts and memory use.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89197 entries, 0 to 89196
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   row_id            89197 non-null  int64  
 1   user_id           89197 non-null  int64  
 2   category_id       89197 non-null  int64  
 3   video_id          89197 non-null  int64  
 4   age               89197 non-null  int64  
 5   gender            89197 non-null  object 
 6   profession        89197 non-null  object 
 7   followers         89197 non-null  int64  
 8   views             89197 non-null  int64  
 9   engagement_score  89197 non-null  float64
dtypes: float64(1), int64(7), object(2)
memory usage: 6.8+ MB


In [5]:
# Same summary for the test frame (it has no `engagement_score` target column).
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11121 entries, 0 to 11120
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   row_id       11121 non-null  int64 
 1   user_id      11121 non-null  int64 
 2   category_id  11121 non-null  int64 
 3   video_id     11121 non-null  int64 
 4   age          11121 non-null  int64 
 5   gender       11121 non-null  object
 6   profession   11121 non-null  object
 7   followers    11121 non-null  int64 
 8   views        11121 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 782.1+ KB


# H2O AutoML

In [6]:
%%capture
!pip install --upgrade h2o

In [7]:
import h2o
from h2o.automl import H2OAutoML

In [8]:
# Connect to an H2O cluster. Per the log below, none was running on
# localhost:54321, so a local single-node server was started.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.13" 2021-10-19; OpenJDK Runtime Environment (build 11.0.13+8-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.13+8-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpcqhnsiof
  JVM stdout: /tmp/tmpcqhnsiof/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpcqhnsiof/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.2
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_unknownUser_bw9k52
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.172 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [9]:
# Upload the pandas frames to the H2O cluster as H2OFrames.
h2o_train = h2o.H2OFrame(python_obj=train)
h2o_test = h2o.H2OFrame(python_obj=test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [10]:
# Predictor columns: every training column except the row identifier and the target.
features = [
    col
    for col in train.columns
    if col != 'row_id' and col != 'engagement_score'
]

In [11]:
# Configure AutoML with a 30-minute (1800 s) time budget and the notebook seed.
# NOTE: per the H2O AutoML documentation, a time-based budget is resource
# dependent, so the seed alone does not guarantee identical models across runs;
# `max_models` is the documented route to reproducible runs.
model = H2OAutoML(
    max_runtime_secs=1800,
    stopping_metric='AUTO',
    seed=SEED,
)

# Fit on the H2O training frame, predicting `engagement_score` from `features`.
model.train(
    training_frame=h2o_train,
    x=features,
    y='engagement_score',
)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_3_AutoML_1_20220213_162247

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.3374821394085636
RMSE: 0.5809321297781382
MAE: 0.43406638391143265
RMSLE: 0.16131258518812094
R^2: 0.5423241097341807
Mean Residual Deviance: 0.3374821394085636
Null degrees of freedom: 9965
Residual degrees of freedom: 9959
Null deviance: 7348.786101862985
Residual deviance: 3363.3470013457445
AIC: 17472.7881918991

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.45786358723096243
RMSE: 0.6766561809596972
MAE: 0.506747936688585
RMSLE: 0.1850213286076423
R^2: 0.3859294578945449
Mean Residual Deviance: 0.45786358723096243
Null degrees of freedom: 89196
Residual degrees of freedom: 89190
Null deviance: 66509.98588



In [12]:
# Show the AutoML leaderboard: one row per trained model with its regression
# metrics (the top row matches the leader's cross-validation figures above).
model.leaderboard

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_3_AutoML_1_20220213_162247,0.457864,0.676656,0.457864,0.506748,0.185021
StackedEnsemble_AllModels_1_AutoML_1_20220213_162247,0.461427,0.679284,0.461427,0.509394,0.185736
StackedEnsemble_AllModels_2_AutoML_1_20220213_162247,0.461481,0.679324,0.461481,0.509433,0.185744
StackedEnsemble_BestOfFamily_1_AutoML_1_20220213_162247,0.463509,0.680815,0.463509,0.511199,0.1862
StackedEnsemble_BestOfFamily_2_AutoML_1_20220213_162247,0.46459,0.681608,0.46459,0.511408,0.186302
StackedEnsemble_BestOfFamily_4_AutoML_1_20220213_162247,0.464624,0.681633,0.464624,0.511219,0.186344
StackedEnsemble_BestOfFamily_3_AutoML_1_20220213_162247,0.465738,0.68245,0.465738,0.512231,0.186525
GBM_4_AutoML_1_20220213_162247,0.467548,0.683774,0.467548,0.513677,0.186923
XGBoost_grid_1_AutoML_1_20220213_162247_model_6,0.468437,0.684425,0.468437,0.513576,0.187002
XGBoost_grid_1_AutoML_1_20220213_162247_model_5,0.469257,0.685023,0.469257,0.513332,0.186735




In [21]:
# Score the test frame with the leaderboard's top model, pull the result back
# into pandas and keep only the prediction column.
preds = (
    model.leader
    .predict(h2o_test)
    .as_data_frame()['predict']
)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [23]:
# Assemble the submission table (row id + predicted score) and write it to disk
# without the pandas index column.
sub1 = pd.DataFrame(
    data={
        'row_id': test['row_id'],
        'engagement_score': preds,
    }
)
sub1.to_csv(path_or_buf='sub1.csv', index=False)

In [24]:
# Peek at the first lines of the written submission file to sanity-check its format.
!head sub1.csv

row_id,engagement_score
89198,4.163207503286123
89199,3.623942329665717
89200,2.7387019959561285
89201,3.8980790227755766
89202,1.968328387143284
89203,3.737067592684384
89204,3.917459300815463
89205,3.8146229809021
89206,2.6807972723468616
