In [1]:
import importlib

import numpy as np

from pipeline import \
    data_collect as collect, \
    data_clean as clean, \
    data_enhance as enhance, \
    model_training as model

import pandas as pd

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_321"; Java(TM) SE Runtime Environment (build 1.8.0_321-b07); Java HotSpot(TM) 64-Bit Server VM (build 25.321-b07, mixed mode)
  Starting server from /Users/e/opt/anaconda3/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/sx/rf0mky_55k347433qghmqlmm0000gn/T/tmpso4ig6ua
  JVM stdout: /var/folders/sx/rf0mky_55k347433qghmqlmm0000gn/T/tmpso4ig6ua/h2o_e_started_from_python.out
  JVM stderr: /var/folders/sx/rf0mky_55k347433qghmqlmm0000gn/T/tmpso4ig6ua/h2o_e_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Amsterdam
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 8 days
H2O_cluster_name:,H2O_from_python_e_jaaxgm
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8.89 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [29]:
# Re-import the local pipeline modules so that edits made to their source files
# take effect in this kernel without a restart.
# NOTE(review): the captured output below shows the H2O connection check running
# again on reload - presumably one of these modules connects to H2O at import
# time; confirm in the pipeline package.
importlib.reload(collect)
importlib.reload(clean)
importlib.reload(enhance)
# Last expression of the cell: the reloaded module object is what gets displayed.
importlib.reload(model)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,22 mins 37 secs
H2O_cluster_timezone:,Europe/Amsterdam
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 8 days
H2O_cluster_name:,H2O_from_python_e_jaaxgm
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8.56 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


<module 'pipeline.model_training' from '/Users/e/uva/BD/project/IMDB---Big-Data-2022/pipeline/model_training.py'>

## Variables

In [2]:
# --- Inputs: files that were handed to us -----------------------------------
# Folder holding the raw training files, plus the hidden validation/test CSVs
# and the JSON files with writing / directing information.
training_folder = './source_imdb/'
validation_path = 'source_imdb/validation_hidden.csv'
test_path = 'source_imdb/test_hidden.csv'
writer_path = 'source_imdb/writing.json'
director_path = 'source_imdb/directing.json'

# --- Outputs: files produced by this notebook -------------------------------
training_agr_path = 'output/train_aggregated.csv'
writer_director_path = 'output/writers_directors.json'
omdb_path = 'output/omdb.json'

---
# Gathering data

In [3]:
# construct information stores
# Both helpers are called only for their side effects (their return values are
# discarded). Each receives its input location(s) first and an output path last.
# presumably they write training_agr_path and writer_director_path, which later
# cells read back - verify in pipeline.data_collect.
collect.merge_training_files(training_folder, training_agr_path, verbose = False)
collect.construct_writers_directors(writer_path, director_path, writer_director_path)

In [4]:
# Load the aggregated training set and the hidden validation / test sets with
# pandas; the first CSV column is used as the index of each frame.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (training_agr_path, validation_path, test_path)
)

In [5]:
# Gather the tconst identifiers of all three sets (train, then test, then
# validation) into one plain list and hand it to the OMDb retrieval helper,
# together with the path of the OMDb JSON store.
tconst_list = list(
    pd.concat([frame.tconst for frame in (train_df, test_df, valid_df)])
)
collect.automate_omdb_retrieval(tconst_list, omdb_path)

total tconsts:  10000
current missing tconsts:  0
finished


---
# Bringing the processing together

In [6]:
# Shapes (rows, columns) of the raw train / validation / test frames.
print(*(frame.shape for frame in (train_df, valid_df, test_df)))

(7959, 8) (955, 7) (1086, 7)


In [7]:
def pipe(dataframe):
    """
    all processing steps to go from dirty data to cleaned+enhanced data

    The steps run on a copy of the input, so the frame passed in by the caller
    is never modified. Previously the first step was applied directly to the
    caller's object (its return value is discarded, so it is presumably an
    in-place operation) before ``match_omdb`` rebound the local name to a new
    frame, which could leave the caller's original frame half-processed.

    :param dataframe: dirty dataframe
    :return: cleaned+enhanced dataframe (a new object, independent of the input)
    """
    # Protect the caller's frame from the in-place helpers below.
    dataframe = dataframe.copy()

    # Enhancement from the writer/director store; return value is not used.
    enhance.match_writer_director(dataframe, writer_director_path)
    # match_omdb hands back a new frame (it uses concat), so the name is rebound.
    dataframe = enhance.match_omdb(dataframe, omdb_path)
    clean.fix_dtypes(dataframe)
    # Collapse column pairs into a single column ("year", "title"); the last
    # argument names the kind of data ("num" / "text") for the helper.
    clean.col_a_if_a_else_b(dataframe, "startYear", "endYear", "year", "num")
    clean.col_a_if_a_else_b(dataframe, "primaryTitle", "originalTitle", "title", "text")
    clean.clean_text_df(dataframe, ["title"])
    enhance.years_until_today(dataframe, "year")
    clean.fill_missing_numerics(dataframe, ["numVotes", "runtimeMinutes"])

    return dataframe

In [8]:
# massage the three sets
train_df = pipe(train_df)
valid_df = pipe(valid_df)
test_df = pipe(test_df)

---
# Model training

In [9]:
# Shapes (rows, columns) of the three frames after processing.
print(*(frame.shape for frame in (train_df, valid_df, test_df)))

(7959, 38) (955, 37) (1086, 37)


In [27]:
import h2o
from h2o.estimators import H2OXGBoostEstimator

In [30]:
# turn dataframes into h2o frames
train_hf = h2o.H2OFrame(train_df)
valid_hf = h2o.H2OFrame(valid_df)
test_hf  = h2o.H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [31]:
# Pick the target column ("label") and the predictor columns. Columns listed in
# `exclude` are passed to the helper to be left out of the predictors.
# (Any conversion of the target, e.g. to a factor, would happen inside the
# helper - it is not done in this cell.)
# "primaryTitle" used to be listed twice; the duplicate entry was removed.
exclude = ["writer", "director", "primaryTitle", "tconst", "originalTitle", "imdb_score"]
target, predictors = model.get_target_and_predictors(train_hf, "label", exclude=exclude)

# Split the labelled training frame into a train part and a held-out part
# (ratio 0.7 for the first part; the seed is fixed for reproducibility):
train_train, train_valid = train_hf.split_frame(ratios=[.7], seed=1234)

In [32]:
%%time

xgb_params = {
    'learn_rate': [0.01, 0.1],
    'max_depth': [3, 5, 9],
    'sample_rate': [0.8, 1.0],
    'col_sample_rate': [0.2, 0.5, 1.0],
    'skip_drop': [0, 0.5, 1.0],
    'ntrees': [30, 50, 100, 200, 300],
}
grid, perf = model.grid_search_xgb(predictors, target, train_train, train_valid, xgb_params)
perf

xgboost Grid Build progress: |███████████████████████████████████████████████████| (done) 100%
CPU times: user 29.9 s, sys: 3.69 s, total: 33.6 s
Wall time: 44min 21s
        col_sample_rate  learn_rate  max_depth  ntrees  sample_rate  \
0                   0.5        0.10        5.0   200.0          0.8   
1                   0.5        0.10        5.0   200.0          0.8   
2                   0.5        0.10        5.0   200.0          0.8   
3                   1.0        0.10        3.0   300.0          0.8   
4                   1.0        0.10        3.0   300.0          0.8   
..  ..              ...         ...        ...     ...          ...   
535                 0.2        0.01        3.0    30.0          1.0   
536                 0.2        0.01        3.0    30.0          1.0   
537                 0.2        0.01        3.0    30.0          0.8   
538                 0.2        0.01        3.0    30.0          0.8   
539                 0.2        0.01        3.0    30



In [33]:
# Build and train the final model with the hyper-parameters of the top row of
# the grid-search table above.
#
# `skip_drop=0.5` was removed from the constructor: it only applies to the DART
# booster, and no booster is selected here (the default is the tree booster),
# so it had no influence on the fitted model.
imdb_xgb = H2OXGBoostEstimator(
    seed=10,
    learn_rate=0.1,
    max_depth=5,
    sample_rate=0.8,
    col_sample_rate=0.5,
    ntrees=200,
)
# Fit on the complete labelled training frame. No validation frame is passed,
# so the metrics displayed for this model are computed on the training data.
imdb_xgb.train(x=predictors,
               y=target,
               training_frame=train_hf)

xgboost Model Build progress: |



██████████████████████████████████████████████████| (done) 100%
Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_model_python_1648211921548_61848


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,200.0




ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.048977760157752945
RMSE: 0.2213091958273604
LogLoss: 0.16903798097496678
Mean Per-Class Error: 0.06334221797880946
AUC: 0.9854590179151582
AUCPR: 0.9860811611795959
Gini: 0.9709180358303164

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.49193911467279705: 


Unnamed: 0,Unnamed: 1,False,True,Error,Rate
0,False,3691.0,278.0,0.07,(278.0/3969.0)
1,True,226.0,3764.0,0.0566,(226.0/3990.0)
2,Total,3917.0,4042.0,0.0633,(504.0/7959.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.491939,0.937251,205.0
1,max f2,0.289418,0.956594,263.0
2,max f0point5,0.64507,0.947833,158.0
3,max accuracy,0.491939,0.936675,205.0
4,max precision,0.999777,1.0,0.0
5,max recall,0.017412,1.0,380.0
6,max specificity,0.999777,1.0,0.0
7,max absolute_mcc,0.491939,0.87342,205.0
8,max min_per_class_accuracy,0.518867,0.93584,197.0
9,max mean_per_class_accuracy,0.491939,0.936658,205.0



Gains/Lift Table: Avg response rate: 50.13 %, avg score: 50.09 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010052,0.99991,1.994737,1.994737,1.0,0.99995,1.0,0.99995,0.02005,0.02005,99.473684,99.473684,0.02005
1,2,0.020103,0.999796,1.994737,1.994737,1.0,0.999857,1.0,0.999903,0.02005,0.0401,99.473684,99.473684,0.0401
2,3,0.030029,0.999659,1.994737,1.994737,1.0,0.999739,1.0,0.999849,0.019799,0.0599,99.473684,99.473684,0.0599
3,4,0.04008,0.999485,1.994737,1.994737,1.0,0.999563,1.0,0.999777,0.02005,0.07995,99.473684,99.473684,0.07995
4,5,0.050006,0.999269,1.994737,1.994737,1.0,0.999388,1.0,0.9997,0.019799,0.099749,99.473684,99.473684,0.099749
5,6,0.100013,0.997226,1.994737,1.994737,1.0,0.998343,1.0,0.999021,0.099749,0.199499,99.473684,99.473684,0.199499
6,7,0.150019,0.993124,1.994737,1.994737,1.0,0.995515,1.0,0.997852,0.099749,0.299248,99.473684,99.473684,0.299248
7,8,0.200025,0.984169,1.994737,1.994737,1.0,0.989178,1.0,0.995684,0.099749,0.398997,99.473684,99.473684,0.398997
8,9,0.300038,0.932783,1.984713,1.991396,0.994975,0.96312,0.998325,0.984829,0.198496,0.597494,98.471304,99.139557,0.596486
9,10,0.40005,0.804962,1.891993,1.966545,0.948492,0.878925,0.985867,0.958353,0.189223,0.786717,89.199286,96.65449,0.775379




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2022-03-25 15:30:26,0.186 sec,0.0,0.5,0.693147,0.5,0.501319,1.0,0.498681
1,,2022-03-25 15:30:26,0.365 sec,1.0,0.47706,0.648213,0.890433,0.897377,1.958698,0.190979
2,,2022-03-25 15:30:26,0.458 sec,2.0,0.457745,0.61159,0.907608,0.915217,1.98543,0.164845
3,,2022-03-25 15:30:26,0.532 sec,3.0,0.440142,0.578972,0.92214,0.927577,1.994737,0.166478
4,,2022-03-25 15:30:27,0.604 sec,4.0,0.423768,0.548995,0.930836,0.935223,1.994737,0.158186
5,,2022-03-25 15:30:27,0.675 sec,5.0,0.409273,0.522805,0.936615,0.939849,1.994737,0.149516
6,,2022-03-25 15:30:27,0.739 sec,6.0,0.398171,0.502454,0.937953,0.941153,1.994737,0.15115
7,,2022-03-25 15:30:27,0.809 sec,7.0,0.387571,0.483127,0.938602,0.941954,1.994737,0.14625
8,,2022-03-25 15:30:27,0.880 sec,8.0,0.377102,0.464049,0.941436,0.944723,1.994737,0.138962
9,,2022-03-25 15:30:27,0.945 sec,9.0,0.368229,0.447366,0.942609,0.945521,1.994737,0.138208



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,rotten_score,9172.374023,1.0,0.24475
1,runtimeMinutes,4571.787598,0.49843,0.121991
2,meta_score,3811.72998,0.415566,0.10171
3,year,3156.798828,0.344164,0.084234
4,numVotes,2619.933838,0.285633,0.069909
5,yearsFromRelease,2275.590576,0.248092,0.06072
6,Horror.True,1704.892456,0.185873,0.045492
7,Horror.False,1327.255981,0.144701,0.035416
8,Documentary.False,1270.192749,0.13848,0.033893
9,awards.False,1199.269897,0.130748,0.032001



See the whole table with table.as_data_frame()




In [None]:
# Extract feature interactions from the trained XGBoost model.
# NOTE(review): this cell has no execution count, so it was not run in the saved
# session; nothing later in the notebook reads `feature_interactions`.
feature_interactions = imdb_xgb.feature_interaction()
# Uncomment the next line to display the result as the cell output.
# feature_interactions

In [34]:
# Score the validation frame first, then the test frame, with the trained model.
p_valid, p_test = (imdb_xgb.predict(frame) for frame in (valid_hf, test_hf))

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


---
# Make predictions

In [35]:
# Destination files for the two prediction sets.
output_path_valid = 'output/prediction_validation.txt'
output_path_test = 'output/prediction_test.txt'

# Write the validation predictions first, then the test predictions.
model.write_prediction_to_file(p_valid, output_path_valid)
model.write_prediction_to_file(p_test, output_path_test)

 T/T+F Ratio: 0.49214659685863876
 T/T+F Ratio: 0.5184162062615101
