In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import pickle
from tqdm import tqdm

import h2o
from h2o.automl import H2OAutoML


I trained 206 individual models with H2O AutoML, whose model family also includes StackedEnsemble. Given a sufficiently large `max_runtime_secs`, AutoML optimizes over a fixed model and hyperparameter space with k-fold CV and picks the best model on its own **"Leaderboard"**.

Later on, I stored the log loss of the best individual model for each label to see which labels have predictions that increase the mean log loss over all labels, which is the competition metric.

Below you can compare the positive value count of each label with the log loss of that label's model. It seems that some labels require a separate effort, because even their "optimized" model losses are very high compared to the competition leaderboard.

In [None]:
# Load the scored training targets. Later cells treat every column except
# `sig_id` as a label column.
label_df = pd.read_csv(
    filepath_or_buffer='../input/lish-moa/train_targets_scored.csv',
)

In [None]:
# Every column other than `sig_id` is treated as a label.
# Note: Index.difference returns the remaining names sorted (when they are
# sortable), so `labels` is not necessarily in the CSV's column order.
labels = label_df.columns.difference(other=['sig_id'])

In [None]:
# Count the positive (== 1) entries of every label column.
# Keys have the form "<position>-<label>", where <position> is the label's
# index within `labels`.
RARE_LABEL_THRESHOLD = 10  # labels with fewer positives than this are printed

pos_counts = {}
for i, col in enumerate(labels):
    # Comparing against 1 and summing yields 0 for a label that has no
    # positives at all; `value_counts()[1]` would raise a KeyError there.
    pos_count = (label_df[col] == 1).sum()
    pos_counts[f"{i}-{col}"] = pos_count
    if pos_count < RARE_LABEL_THRESHOLD:
        print(col, ':', pos_count)

In [None]:
# Tabulate the per-label positive counts (one row per "<position>-<label>" key),
# then build a second frame ordered from most to fewest positives for plotting.
pos_df = pd.DataFrame(
    {
        'label': list(pos_counts),
        'pos_counts': list(pos_counts.values()),
    }
)
pos_df_sorted = pos_df.sort_values(by='pos_counts', ascending=False)

In [None]:
# Horizontal bar chart of positive counts, one bar per label, most frequent
# label at the top. The tall figure keeps every label name readable.
fig, ax = plt.subplots(figsize=(20, 30))
sns.barplot(data=pos_df_sorted, x='pos_counts', y='label', ax=ax)
# Title and axis labels so the figure can be read on its own; the trailing
# semicolon suppresses the text repr of the returned object in the cell output.
ax.set(
    title='Positive value count per label',
    xlabel='Positive count',
    ylabel='Label',
);

In [None]:
# Load the stored AutoML leaderboards. Later cells use `lbs` as a mapping whose
# values are either a string or an object with a 'logloss' column.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file if its source is trusted.
with open('../input/h2oleaderboards/LBs.pkl', mode='rb') as lb_file:
    lbs = pickle.load(lb_file)

In [None]:
# Collect one log loss per entry of `lbs`, in the mapping's iteration order.
scores = []
# Iterate over items() so the key is available directly instead of rebuilding
# list(lbs.keys()) on every iteration, and size the progress bar from the data
# rather than a hard-coded 206.
for i, (name, val) in tqdm(enumerate(lbs.items()), total=len(lbs)):
    if isinstance(val, str):
        # A string entry means there is no leaderboard table for this key:
        # report it and record NaN so that positions stay aligned.
        print(val, f"{i}-{name}")
        score = np.nan
    else:
        # First row of the 'logloss' column.
        # NOTE(review): presumably the leaderboard is sorted best-first, making
        # this the best model's log loss - confirm.
        score = val['logloss'].values[0]
    scores.append(score)
    

In [None]:
# Attach the collected log losses to `pos_df` by position: the i-th score goes
# to the i-th row. This mutates `pos_df` in place.
# NOTE(review): this assumes `lbs` iterates in the same order as `labels`
# (which built the rows of `pos_df`) - confirm, otherwise scores are attached
# to the wrong labels.
pos_df['logloss'] = scores 

In [None]:
# Order the labels from highest to lowest log loss for plotting. With pandas'
# default `na_position='last'`, rows whose score is NaN end up at the bottom.
pos_df_score_sorted = pos_df.sort_values(by='logloss', ascending=False)

In [None]:
# Horizontal bar chart of per-label log loss, highest loss at the top.
# The tall figure keeps every label name readable.
fig, ax = plt.subplots(figsize=(20, 30))
sns.barplot(data=pos_df_score_sorted, x='logloss', y='label', ax=ax)
# Title and axis labels so the figure can be read on its own; the trailing
# semicolon suppresses the text repr of the returned object in the cell output.
ax.set(
    title="Log loss of each label's model",
    xlabel='Log loss',
    ylabel='Label',
);