In [None]:
ML Foundations: Hands on, Feature Engineering

In [1]:
import h2o
from h2o.estimators import H2OTargetEncoderEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.261-b12, mixed mode, sharing)
  Starting server from C:\Users\Renzo\AppData\Local\Programs\Python\Python38-32\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Renzo\AppData\Local\Temp\tmpyckm9snx
  JVM stdout: C:\Users\Renzo\AppData\Local\Temp\tmpyckm9snx\h2o_Renzo_started_from_python.out
  JVM stderr: C:\Users\Renzo\AppData\Local\Temp\tmpyckm9snx\h2o_Renzo_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,America/Denver
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.2
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_Renzo_l5kycy
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,247.5 Mb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
#Load data into cluster
filename = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
df = h2o.import_file(filename)
df.head()

Parse progress: |█████████████████████████████████████████████████████████| 100%


pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1,1,Allen Miss. Elisabeth Walton,female,29.0,0,0,24160.0,211.338,B5,S,2.0,,St Louis MO
1,1,Allison Master. Hudson Trevor,male,0.9167,1,2,113781.0,151.55,C22 C26,S,11.0,,Montreal PQ / Chesterville ON
1,0,Allison Miss. Helen Loraine,female,2.0,1,2,113781.0,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON
1,0,Allison Mr. Hudson Joshua Creighton,male,30.0,1,2,113781.0,151.55,C22 C26,S,,135.0,Montreal PQ / Chesterville ON
1,0,Allison Mrs. Hudson J C (Bessie Waldo Daniels),female,25.0,1,2,113781.0,151.55,C22 C26,S,,,Montreal PQ / Chesterville ON
1,1,Anderson Mr. Harry,male,48.0,0,0,19952.0,26.55,E12,S,3.0,,New York NY
1,1,Andrews Miss. Kornelia Theodosia,female,63.0,1,0,13502.0,77.9583,D7,S,10.0,,Hudson NY
1,0,Andrews Mr. Thomas Jr,male,39.0,0,0,112050.0,0.0,A36,S,,,Belfast NI
1,1,Appleton Mrs. Edward Dale (Charlotte Lamson),female,53.0,2,0,11769.0,51.4792,C101,S,,,Bayside Queens NY
1,0,Artagaveytia Mr. Ramon,male,71.0,0,0,,49.5042,,C,,22.0,Montevideo Uruguay




In [3]:
#Response column
df["survived"] = df["survived"].asfactor()
response = "survived"

# Split the data set
train, test = df.split_frame(ratios=[0.8], seed = 1234)
#train = splits[0]
#test = splits[1]


In [6]:
# Encoded columns
encoded = ["home.dest", "cabin", "embarked"]

# Target encoding
blended_avg = True
inflection_point = 3
smoothing = 10
noise = 0.15

# To avoid data leakage, using k_fild strategy
data_leakage_handling = "k_fold"
fold_column = "kfold_column"
n_folds = 5
train[fold_column] = train.kfold_column(n_folds, seed = 3456)




In [7]:
# Training a target encoded model

titanic_te = H2OTargetEncoderEstimator(fold_column = fold_column,
                                       data_leakage_handling = data_leakage_handling,
                                       blending = blended_avg,
                                       k = inflection_point,
                                       f = smoothing)

titanic_te.train(x = encoded, y = response, training_frame = train)



targetencoder Model Build progress: |█████████████████████████████████████| 100%


In [8]:
# New encoded train and test sets
train_te = titanic_te.transform(frame = train, data_leakage_handling = data_leakage_handling, seed = 1234, noise = noise)

test_te = titanic_te.transform(frame = test, data_leakage_handling = data_leakage_handling, seed = 1234, noise = 0)




In [9]:
# Define a gradient boosting machine model with target encoding

gbm_with_te = H2OGradientBoostingEstimator(max_depth = 6,
                                           min_rows = 1,
                                           fold_column = fold_column,
                                           score_tree_interval = 5,
                                           ntrees = 10000,
                                           sample_rate = 0.8,
                                           col_sample_rate = 0.8,
                                           seed = 1234,
                                           stopping_rounds = 5,
                                           stopping_metric = "auto",
                                           stopping_tolerance = 0.001,
                                           model_id = "gbm_with_te")


In [11]:
#Training

x_with_te = ["pclass", "sex", "age", "sibsp", "parch", "fare", "cabin", "embarked", "home.dest"]
gbm_with_te.train(x = x_with_te, y = response, training_frame = train_te)


gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [13]:
#test unseen data

my_gbm_metrics = gbm_with_te.model_performance(test_te)
auc_with_te = my_gbm_metrics.auc()
auc_with_te


0.8555900621118012

In [14]:
# Create a baseline GBM estimator

gbm_baseline = H2OGradientBoostingEstimator(max_depth = 6,
                                            min_rows = 1,
                                            fold_column = fold_column,
                                            score_tree_interval = 5,
                                            ntrees = 10000,
                                            sample_rate = 0.8,
                                            col_sample_rate = 0.8,
                                            seed = 1234,
                                            stopping_rounds = 5,
                                            stopping_metric = "auto",
                                            stopping_tolerance = 0.001,
                                            model_id = "gbm_baseline")

x_baseline = ["pclass", "sex", "age", "sibsp", "parch", "fare", "cabin", "embarked", "home.dest"]
gbm_baseline.train(x=x_baseline, y=response, training_frame=train)
gbm_baseline_metrics = gbm_baseline.model_performance(test)
auc_baseline = gbm_baseline_metrics.auc()
auc_baseline



gbm Model Build progress: |███████████████████████████████████████████████| 100%


0.8417470596008987

In [5]:
h2o.cluster().shutdown(prompt = True)