# Exploration
the goal of this notebook is to explore the data that we are working with

In [24]:
import pandas as pd
import os
import re
import plotly.express as px
from unidecode import unidecode
import json

---
# Concatenating Traning Files

In [25]:
def merge_training_files(training_folder, training_agr_path, verbose = False):
    # importing the csv files
    # The path for listing items
    path = training_folder

    # All files in the source folder
    files = os.listdir(path)

    # Filter out non training data
    r = re.compile("train-*")
    files = list(filter(r.match, files))

    # Aggregate training files
    valid_columns = ["tconst", "primaryTitle", "originalTitle", "startYear", "endYear", "runtimeMinutes", "numVotes", "label"]
    data = pd.DataFrame(columns=valid_columns)
    size_counter = 0
    for filename in files:
        filepath = path+filename
        _data = pd.read_csv(path+filename, index_col=0)
        data = pd.concat([data, _data], ignore_index=True)

        size_counter += _data.shape[0]
        if verbose: print(f'file: {filename} - {_data.shape[0]} | total {size_counter}')
    if verbose: print(f' >> data shape: {data.shape}')

    data.to_csv(training_agr_path)

In [26]:
# training_folder = './source_imdb/'
# data = merge_training_files(training_folder)

# Creating directors and writers files

In [None]:
def construct_writers_directors(writer_path, director_path, output_path):
    with open(writer_path, 'r') as w:
        _ = json.load(w)
        writers = {}
        for dict in _:
            writers[dict["movie"]] = dict["writer"]

    with open(director_path, 'r') as d:
        _ = json.load(d)
        directors = {}
        for key, movie in _['movie'].items():
            directors[movie] = _["director"][key]

    # print(f'writer count: {len(writers)}, director count: {len(directors)}')

    d = {k: {'writer': w, 'director': directors[k]} for k,w in writers.items()}

    del _, writers, directors

    with open(output_path, "w") as f:
        json.dump(d, f)

In [28]:
# writer_path   = 'source_imdb/writing.json'
# director_path = 'source_imdb/directing.json'
# writer_director_path   = 'output/writers_directors.json'
#
# construct_writers_directors(writer_path, director_path, writer_director_path)

# Matching writers and directors to movies

In [None]:
def writer_director_lookup(tconst, writer_director_dict):
    '''matches given tconst to writer and directs, pergorms a similarity check'''
    writer = writer_director_dict[tconst]['writer']
    director = writer_director_dict[tconst]['director']
    same = (writer == director)
    return writer, director, same

def match_writer_director(dataframe, writer_director_path):
    with open(writer_director_path, 'r') as w_d_json:
        w_d_dict = json.load(w_d_json)
        dataframe['writer'], dataframe['director'], dataframe['same_writer_director'] = \
            zip(*[writer_director_lookup(t, w_d_dict) for t in dataframe['tconst']])
    return dataframe

In [30]:
# data = match_writer_director(data, writer_director_path)
# data.head()

# fixing d-types

In [31]:
def fix_dtypes(dataframe):
    integer_columns  = ["startYear", "endYear", "runtimeMinutes", "numVotes"]
    bool_columns  = ["label", "same_writer_director"]
    string_columns  = ["tconst", "primaryTitle", "originalTitle", "writer", "director"]

    for c in integer_columns:
        dataframe[c] = pd.to_numeric(dataframe[c], errors="coerce")
    for c in string_columns:
        dataframe[c] = dataframe[c].astype('string')
    for c in bool_columns:
        if c in dataframe.columns:
            dataframe[c] = dataframe[c].astype('boolean')

    # print(dataframe.dtypes)

    return dataframe

# Cleaning text

In [98]:
# fix lexical bullshit from title column
def clean_text(text: str):
    text = re.sub(r'[^\w\s]','',text)
    text = text.lower()
    text = unidecode(text)
    return text

---
# Brining the processing together

In [33]:
def clean_data(raw_data_path):
    """Join all the data preprocessing steps here"""
    dataframe = pd.read_csv(raw_data_path, index_col=0)
    dataframe = match_writer_director(dataframe, writer_director_path)
    dataframe = fix_dtypes(dataframe)
    return dataframe

def feature_engineer_data(dataframe):
    """All feature engineering steps go here"""
    return dataframe

In [43]:
# prexisting data
training_folder         = './source_imdb/'
validation_path         = 'source_imdb/validation_hidden.csv'
test_path               = 'source_imdb/test_hidden.csv'
writer_path             = 'source_imdb/writing.json'
director_path           = 'source_imdb/directing.json'

# our data
training_agr_path       = 'output/train_aggregated.csv'
writer_director_path    = 'output/writers_directors.json'

# construct information stores
merge_training_files(training_folder, training_agr_path, verbose = False)
construct_writers_directors(writer_path, director_path, writer_director_path)

# clean the three sets
train_clean = clean_data(training_agr_path)
valid_clean  = clean_data(validation_path)
test_clean  = clean_data(test_path)

# apply feature engineering steps
train_df = feature_engineer_data(train_clean)
valid_df  = feature_engineer_data(valid_clean)
test_df  = feature_engineer_data(test_clean)

---
# Model training

In [55]:
import h2o
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid.grid_search import H2OGridSearch

In [36]:
h2o.init(max_mem_size_GB=10)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 9 mins
H2O_cluster_timezone:,Europe/Amsterdam
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 6 days
H2O_cluster_name:,H2O_from_python_e_pg7917
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8.87 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [51]:
# Set the predictors and response; set the response as a factor:
train_hf = h2o.H2OFrame(train_df)

y = "label"
X = train_hf.columns
X = list(filter(lambda x: x not in [y], X))
print(f'y: {y}')
print(f'X: {X}')

# Split the dataset into a train and valid set:
train, valid = train_hf.split_frame(ratios=[.8], seed=1234)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
y: label
X: ['tconst', 'primaryTitle', 'originalTitle', 'startYear', 'endYear', 'runtimeMinutes', 'numVotes', 'writer', 'director', 'same_writer_director']


In [58]:
xgb_params = {
        # 'learn_rate': [0.01, 0.1],
        'max_depth': [3, 5, 9],
        'sample_rate': [0.8, 1.0],
        'col_sample_rate': [0.2, 0.5, 1.0],
        # 'skip_drop': [0, 0.5, 1.0],
        'ntrees': [30, 50, 100, 200, 300],
}

# Train and validate a cartesian grid of GBMs
xgb_grid = H2OGridSearch(model=H2OXGBoostEstimator,
                                grid_id='xgb_grid_ntrees',
                                hyper_params=xgb_params)
xgb_grid.train(
    x=X, y=y,
    training_frame=train,
    validation_frame=valid,
    seed=35
)

# Get the grid results, sorted by validation RMSE
xgb_grid_ntrees_perf = xgb_grid.get_grid(sort_by='RMSE', decreasing=False)
xgb_grid_ntrees_perf

xgboost Grid Build progress: |█ (cancelled)

Hyper-parameter: col_sample_rate, 0.5
Hyper-parameter: learn_rate, 0.3
Hyper-parameter: max_depth, 3
Hyper-parameter: ntrees, 50
Hyper-parameter: sample_rate, 0.8
Hyper-parameter: skip_drop, 0.0
failure_details: Job Canceled
failure_stack_traces: java.lang.RuntimeException: Error while training XGBoost model
	at hex.tree.xgboost.XGBoost$XGBoostDriver.buildModelImpl(XGBoost.java:443)
	at hex.tree.xgboost.XGBoost$XGBoostDriver.buildModel(XGBoost.java:395)
	at hex.tree.xgboost.XGBoost$XGBoostDriver.computeImpl(XGBoost.java:381)
	at hex.ModelBuilder$Driver.compute2(ModelBuilder.java:247)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1658)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:976)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1479)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104

H2OJobCancelled: Job<$03017f00000132d4ffffffff$_9f5a01bd9f5b7032af24a62992a14202> was cancelled by the user.

In [61]:
# Build and train the model:
imdb_xgb = H2OXGBoostEstimator(seed=10,
                               ntrees=200,
                               max_depth=6)
imdb_xgb.train(x=X,
               y=y,
               training_frame=train,
               validation_frame=valid)

xgboost Model Build progress: |



██████████████████████████████████████████████████| (done) 100%
Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_model_python_1647970082100_4183


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,200.0




ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.0871641245075922
RMSE: 0.29523571008194827
LogLoss: 0.29623792719762754
Mean Per-Class Error: 0.10271989103857493
AUC: 0.9631350157612215
AUCPR: 0.968267514213273
Gini: 0.926270031522443

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.44603567251137327: 


Unnamed: 0,Unnamed: 1,False,True,Error,Rate
0,False,2873.0,295.0,0.0931,(295.0/3168.0)
1,True,361.0,2853.0,0.1123,(361.0/3214.0)
2,Total,3234.0,3148.0,0.1028,(656.0/6382.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.446036,0.896888,208.0
1,max f2,0.285174,0.91986,269.0
2,max f0point5,0.577778,0.925011,164.0
3,max accuracy,0.446036,0.897211,208.0
4,max precision,0.999483,1.0,0.0
5,max recall,0.038624,1.0,388.0
6,max specificity,0.999483,1.0,0.0
7,max absolute_mcc,0.487176,0.796188,193.0
8,max min_per_class_accuracy,0.433802,0.895202,212.0
9,max mean_per_class_accuracy,0.446036,0.89728,208.0



Gains/Lift Table: Avg response rate: 50.36 %, avg score: 50.35 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010028,0.999457,1.985688,1.985688,1.0,0.99974,1.0,0.99974,0.019913,0.019913,98.568762,98.568762,0.019913
1,2,0.020056,0.998973,1.985688,1.985688,1.0,0.999224,1.0,0.999482,0.019913,0.039826,98.568762,98.568762,0.039826
2,3,0.030085,0.998191,1.985688,1.985688,1.0,0.998581,1.0,0.999182,0.019913,0.059739,98.568762,98.568762,0.059739
3,4,0.040113,0.997255,1.985688,1.985688,1.0,0.997752,1.0,0.998824,0.019913,0.079652,98.568762,98.568762,0.079652
4,5,0.050141,0.996107,1.985688,1.985688,1.0,0.99677,1.0,0.998414,0.019913,0.099564,98.568762,98.568762,0.099564
5,6,0.100125,0.985883,1.985688,1.985688,1.0,0.9918,1.0,0.995112,0.099253,0.198818,98.568762,98.568762,0.198818
6,7,0.15011,0.96394,1.985688,1.985688,1.0,0.975957,1.0,0.988734,0.099253,0.298071,98.568762,98.568762,0.298071
7,8,0.200094,0.922081,1.985688,1.985688,1.0,0.944008,1.0,0.977561,0.099253,0.397324,98.568762,98.568762,0.397324
8,9,0.300063,0.778458,1.970126,1.980503,0.992163,0.856165,0.997389,0.937117,0.196951,0.594275,97.01258,98.050305,0.592697
9,10,0.400031,0.598424,1.767822,1.927354,0.890282,0.693148,0.970623,0.876148,0.176727,0.771002,76.78222,92.735367,0.747328




ModelMetricsBinomial: xgboost
** Reported on validation data. **

MSE: 0.17826101103308145
RMSE: 0.42220967662179587
LogLoss: 0.5478112410975715
Mean Per-Class Error: 0.27922410131665315
AUC: 0.8114108331080995
AUCPR: 0.8310235340556952
Gini: 0.6228216662161989

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3364734426140785: 


Unnamed: 0,Unnamed: 1,False,True,Error,Rate
0,False,492.0,309.0,0.3858,(309.0/801.0)
1,True,134.0,642.0,0.1727,(134.0/776.0)
2,Total,626.0,951.0,0.2809,(443.0/1577.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.336473,0.743486,249.0
1,max f2,0.019776,0.829587,395.0
2,max f0point5,0.692911,0.773481,119.0
3,max accuracy,0.55741,0.741281,164.0
4,max precision,0.999518,1.0,0.0
5,max recall,0.002476,1.0,399.0
6,max specificity,0.999518,1.0,0.0
7,max absolute_mcc,0.687148,0.502958,121.0
8,max min_per_class_accuracy,0.464632,0.730337,199.0
9,max mean_per_class_accuracy,0.536727,0.740351,171.0



Gains/Lift Table: Avg response rate: 49.21 %, avg score: 50.95 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010146,0.999481,2.032216,2.032216,1.0,0.999747,1.0,0.999747,0.020619,0.020619,103.221649,103.221649,0.020619
1,2,0.020292,0.999268,2.032216,2.032216,1.0,0.999363,1.0,0.999555,0.020619,0.041237,103.221649,103.221649,0.041237
2,3,0.030438,0.998915,2.032216,2.032216,1.0,0.999097,1.0,0.999402,0.020619,0.061856,103.221649,103.221649,0.061856
3,4,0.040583,0.998195,2.032216,2.032216,1.0,0.998625,1.0,0.999208,0.020619,0.082474,103.221649,103.221649,0.082474
4,5,0.050095,0.997538,2.032216,2.032216,1.0,0.997867,1.0,0.998953,0.01933,0.101804,103.221649,103.221649,0.101804
5,6,0.10019,0.988227,1.877871,1.955044,0.924051,0.994059,0.962025,0.996506,0.094072,0.195876,87.787094,95.504372,0.188386
6,7,0.150285,0.96766,1.852147,1.920745,0.911392,0.979062,0.945148,0.990691,0.092784,0.28866,85.214668,92.07447,0.27243
7,8,0.20038,0.922564,1.826422,1.897164,0.898734,0.949629,0.933544,0.980426,0.091495,0.380155,82.642242,89.716413,0.353937
8,9,0.299937,0.748797,1.50151,1.765837,0.738854,0.839072,0.868922,0.933507,0.149485,0.529639,50.151028,76.583717,0.452236
9,10,0.400127,0.584169,1.093281,1.597432,0.537975,0.667876,0.786054,0.866994,0.109536,0.639175,9.328103,59.743167,0.470636




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2022-03-22 20:23:33,0.033 sec,0.0,0.5,0.693147,0.5,0.503604,1.0,0.496396,0.5,0.693147,0.5,0.492074,1.0,0.507926
1,,2022-03-22 20:23:33,0.128 sec,1.0,0.458871,0.613126,0.816924,0.84275,1.950728,0.271702,0.461035,0.617182,0.801606,0.80371,1.874155,0.278377
2,,2022-03-22 20:23:33,0.161 sec,2.0,0.437072,0.570702,0.824606,0.851408,1.985688,0.266688,0.441028,0.578069,0.808967,0.815938,1.964476,0.250476
3,,2022-03-22 20:23:33,0.197 sec,3.0,0.423267,0.542367,0.831758,0.857694,1.985688,0.261987,0.428343,0.552006,0.815697,0.82344,1.964476,0.269499
4,,2022-03-22 20:23:33,0.236 sec,4.0,0.414175,0.522453,0.835454,0.862137,1.985688,0.241147,0.420257,0.534384,0.819917,0.827634,1.974153,0.25111
5,,2022-03-22 20:23:33,0.284 sec,5.0,0.408541,0.509155,0.838765,0.866357,1.985688,0.254309,0.415518,0.523574,0.822105,0.83538,2.032216,0.248573
6,,2022-03-22 20:23:33,0.332 sec,6.0,0.403433,0.497334,0.843419,0.870357,1.985688,0.244751,0.412782,0.517076,0.823215,0.835405,2.032216,0.246037
7,,2022-03-22 20:23:33,0.382 sec,7.0,0.400397,0.489632,0.845793,0.873069,1.985688,0.24099,0.410858,0.512454,0.825168,0.837589,2.032216,0.250476
8,,2022-03-22 20:23:33,0.430 sec,8.0,0.397654,0.48272,0.848743,0.875827,1.985688,0.243184,0.409683,0.509238,0.826579,0.83956,2.032216,0.25111
9,,2022-03-22 20:23:33,0.480 sec,9.0,0.39523,0.476712,0.851408,0.878321,1.985688,0.243341,0.408832,0.507049,0.827322,0.838909,2.032216,0.255549



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,runtimeMinutes,3040.630371,1.0,0.314347
1,numVotes,2943.498047,0.968055,0.304305
2,startYear,2407.962646,0.791929,0.24894
3,writer.\N,474.970215,0.156208,0.049103
4,same_writer_director.False,269.822388,0.088739,0.027895
5,originalTitle.<NA>,157.572235,0.051822,0.01629
6,endYear.2017.0,28.185413,0.00927,0.002914
7,endYear.2010.0,19.56333,0.006434,0.002022
8,writer.nm1347153,19.013681,0.006253,0.001966
9,endYear.2019.0,18.436451,0.006063,0.001906



See the whole table with table.as_data_frame()




In [62]:
# Eval performance:
perf = imdb_xgb.model_performance()

# Generate predictions on a test set (if necessary):
pred = imdb_xgb.predict(valid)

# Extract feature interactions:
feature_interactions = imdb_xgb.feature_interaction()

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


---
# Make predictions

In [65]:
valid_hf = h2o.H2OFrame(valid_df)
test_hf = h2o.H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [66]:
pred_valid = imdb_xgb.predict(valid_hf)
pred_test = imdb_xgb.predict(test_hf)

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |



███████████████████████████████████████████████████| (done) 100%




In [79]:
pred_test_list = h2o.as_list(pred_test)['predict'].tolist()
pred_test_list

with open('output/prediction_test.txt', 'w') as f:
    for item in pred_test_list:
        f.write("%s\n" % item)

pred_valid_list = h2o.as_list(pred_valid)['predict'].tolist()
pred_valid_list

with open('output/prediction_validation.txt', 'w') as f:
    for item in pred_valid_list:
        f.write("%s\n" % item)

In [96]:
# check F/F+T
print(f' test {pred_test_list.count(False)/(pred_test_list.count(False) + pred_test_list.count(True))}')

 test 0.427255985267035


In [97]:
# check F/F+T
print(f' valid {pred_test_list.count(False)/(pred_valid_list.count(False) + pred_valid_list.count(True))}')

 valid 0.48586387434554973
