In [None]:
import os
import numpy as np
import pandas as pd
import gc

def cust_blend(dt, W = [1], top_n = 12):
    """Blend one row of per-model ranked predictions into a single ranked string.

    Each model's prediction is a whitespace-separated list of item ids, best
    first. An item at 1-based rank ``r`` in model ``m`` contributes
    ``W[m % len(W)] / r`` to that item's score; scores are summed across models
    and the highest-scoring items are returned.

    Parameters
    ----------
    dt : mapping-like (e.g. a pandas row Series or a dict)
        Must expose the model outputs under the keys ``prediction0``,
        ``prediction1``, ... (consecutive). Any other keys, such as
        ``customer_id`` or an already-blended ``prediction``, are ignored.
    W : sequence of numbers
        Per-model weights. If there are fewer weights than models the weights
        are reused cyclically; an empty sequence means equal weighting.
    top_n : int
        Maximum number of items to return (12 by default).

    Returns
    -------
    str
        Up to ``top_n`` item ids joined by single spaces, highest score first.
        Ties keep the order in which the items were first encountered.
    """
    # An empty weight list would make `m % len(W)` divide by zero.
    weights = W if len(W) else [1]

    # Gather prediction0, prediction1, ... for as long as they exist. Probing by
    # name (instead of assuming "every column but one is a model") keeps the
    # function working when the row carries extra columns.
    rankings = []
    model_idx = 0
    while f'prediction{model_idx}' in dt:
        cell = dt[f'prediction{model_idx}']
        # Missing predictions (NaN / None) count as an empty recommendation list.
        rankings.append(cell.split() if isinstance(cell, str) else [])
        model_idx += 1

    # Accumulate rank-discounted, model-weighted scores per item.
    scores = {}
    for m, items in enumerate(rankings):
        model_weight = weights[m % len(weights)]
        for rank, item in enumerate(items, start=1):
            scores[item] = scores.get(item, 0) + model_weight / rank

    # sorted() is stable even with reverse=True, so tied items keep insertion order.
    ranked = sorted(scores, key=scores.get, reverse=True)

    return ' '.join(ranked[:top_n])

def prep_subs(submissions):
    """Combine several submission frames into one wide frame, one column per model.

    The first frame is modified IN PLACE and returned: if it has exactly two
    columns they are renamed to ``customer_id`` / ``prediction0``. For every
    further frame ``i`` its ``prediction`` column is attached as
    ``prediction{i}``.

    Columns are attached by index alignment, so callers are expected to pass
    frames that share the same row order and index (the notebook sorts by
    ``customer_id`` and resets the index before calling this).

    Parameters
    ----------
    submissions : list of pandas.DataFrame
        Frame 0 supplies the customer ids; frames 1.. must have a
        ``prediction`` column.

    Returns
    -------
    pandas.DataFrame
        ``submissions[0]`` (same object) with the added ``prediction{i}``
        columns, all of them NaN-free strings.
    """
    sub0 = submissions[0]
    if len(sub0.columns) == 2:
        sub0.columns = ['customer_id', 'prediction0']
    # The first model needs the same NaN clean-up as the others, otherwise the
    # later `.split()` in the blend fails on a float NaN.
    if 'prediction0' in sub0.columns:
        sub0['prediction0'] = sub0['prediction0'].fillna('').astype(str)

    # Iterate over the argument, not over a notebook-level global.
    for i in range(1, len(submissions)):
        col = f'prediction{i}'
        sub0[col] = submissions[i]['prediction']
        # Fill AFTER the index-aligned assignment so rows missing from
        # submission i become '' rather than the literal string 'nan'.
        sub0[col] = sub0[col].fillna('').astype(str)

    gc.collect()
    return sub0

def clean_sub(submission, weights):
    """Remove the per-model ``prediction{i}`` columns from ``submission``.

    One column is deleted for each entry in ``weights`` (indices
    ``0 .. len(weights) - 1``). The frame is modified in place and also
    returned. A missing column raises ``KeyError``.
    """
    model_columns = [f'prediction{idx}' for idx, _ in enumerate(weights)]
    for column in model_columns:
        del submission[column]
    # Release the memory held by the dropped columns.
    gc.collect()
    return submission

# To ensemble I used submissions from 9 public notebooks:
* LB: 0.0225 - https://www.kaggle.com/lunapandachan/h-m-trending-products-weekly-add-test/notebook
* LB: 0.0217 - https://www.kaggle.com/tarique7/hnm-exponential-decay-with-alternate-items/notebook
* LB: 0.0221 - https://www.kaggle.com/astrung/lstm-sequential-modelwith-item-features-tutorial
* LB: 0.0224 - https://www.kaggle.com/code/hirotakanogami/h-m-eda-customer-clustering-by-kmeans
* LB: 0.0220 - https://www.kaggle.com/code/hengzheng/time-is-our-best-friend-v2/notebook
* LB: 0.0227 - https://www.kaggle.com/code/hechtjp/h-m-eda-rule-base-by-customer-age
* LB: 0.0231 - https://www.kaggle.com/code/ebn7amdi/trending/notebook?scriptVersionId=90980162
* LB: 0.0225 - https://www.kaggle.com/code/mayukh18/svd-model-reranking-implicit-to-explicit-feedback

# Our own models:
* LB: 0.0226 - LightSAN recbole model
* LB: 0.0211 - LGBM
* LB: ? - items together ONLY

In [None]:
%%time
subs = []
subs.append(pd.read_csv('../input/handmbestperforming/h-m-trending-products-weekly-add-test.csv').sort_values('customer_id').reset_index(drop=True))
subs.append(pd.read_csv('../input/handmbestperforming/hnm-exponential-decay-with-alternate-items.csv').sort_values('customer_id').reset_index(drop=True))
subs.append(pd.read_csv('../input/handmbestperforming/lstm-sequential-modelwith-item-features-tutorial.csv').sort_values('customer_id').reset_index(drop=True))
subs.append(pd.read_csv('../input/hm-00224-solution/submission.csv').sort_values('customer_id').reset_index(drop=True))
subs.append(pd.read_csv('../input/handmbestperforming/time-is-our-best-friend-v2.csv').sort_values('customer_id').reset_index(drop=True))
subs.append(pd.read_csv('../input/handmbestperforming/rule-based-by-customer-age.csv').sort_values('customer_id').reset_index(drop=True))
subs.append(pd.read_csv('../input/h-m-faster-trending-products-weekly/submission.csv').sort_values('customer_id').reset_index(drop=True))
#subs.append(pd.read_csv('../input/hm-00231-solution/submission.csv').sort_values('customer_id').reset_index(drop=True))
#subs.append(pd.read_csv('../input/h-m-framework-for-partitioned-validation/submission.csv').sort_values('customer_id').reset_index(drop=True)) 
subs.append(pd.read_csv('../input/0237-ensemble-submission-handm/0226_lightsan.csv.gzip').sort_values('customer_id').reset_index(drop=True))     # 0.0226
subs.append(pd.read_csv('../input/my-best-submissions-to-ensemble/basic_model_submission.csv').sort_values('customer_id').reset_index(drop=True))                  # 0.0211
subs.append(pd.read_parquet('../input/0237-ensemble-submission-handm/model_subs/items_together_sub.parquet.gzip').sort_values('customer_id').reset_index(drop=True))

In [None]:
# Merge all loaded submissions into one wide frame (prediction0, prediction1, ...).
# prep_subs modifies subs[0] in place and returns it, so sub0 and subs[0] are the same object.
sub0 = prep_subs(subs)

In [None]:
import math
import numpy as np
# Scores of the individual submissions; each weight grows exponentially with
# the distance of a score from the mean: w = e ** ((score - mean) * 500).
#scores = [0.0231, 0.0225, 0.0217, 0.0221, 0.0224, 0.022, 0.0227, 0.0231, 0.0225, 0.0226, 0.0211]
scores = [0.0225, 0.0217, 0.0221, 0.0224, 0.022, 0.0227, 0.0231, 0.0225, 0.0226, 0.0211]
avg_score = np.mean(scores)
print(f'{len(scores)} ensembles being weighted')
print(f'avg score: {avg_score}')

# Only a suggestion: it is printed here for reference.
suggested = [math.e ** (500 * (score - avg_score)) for score in scores]
weights = np.array(suggested)

print(f'suggested weights: {weights}')

### First ensemble

In [None]:
# History of hand-tuned weight vectors that were tried.
# NOTE(review): the trailing number on each line is presumably the resulting leaderboard score - confirm.
# weights = [1.05,0.78,0.86,0.85,0.68,0.64,0.7,0.24,1.01, 0.8, 0.9] # 0.0239
# weights = [1.05,0.78,0.86,0.85,0.68,0.64,0.7,0.5,1.01, 0.8, 0.9]
# weights = [1.05, 0.78, 0.86, 0.85, 0.68, 0.64, 0.70, 0.24, 1.0, 0.8, 0.7] # 0.0241
# weights = [1.05, 0.78, 0.86, 0.85, 0.68, 0.64, 0.70, 0.24, 1.2, 0.8, 0.7] # 0.0241
# weights = [1.05, 0.78, 0.86, 0.85, 0.68, 0.64, 0.70, 0.24, 1.2, 0.5, 0.6] # 0.0241
# weights = [1.05, 0.78, 0.86, 0.85, 0.68, 0.64, 0.70, 0.24, 1.0, 1.5, 0.5, 0.6] # 0.0240
#weights = [1.05, 0.78, 0.86, 0.85, 0.68, 0.64, 0.70, 0.24, 1.2, 0.5] # 0.0242
#weights = [1.05, 0.78, 0.86, 0.85, 0.68, 0.64, 0.70, 1.2, 0.24, 0.5] # 0.0240

# Weights actually used, one per prediction{i} column in the same order.
# cust_blend indexes them modulo len(weights), so a shorter list is silently reused cyclically.
weights = [1.05, 0.78, 0.86, 0.85, 0.68, 0.64, 0.70, 0.24, 1.2, 0.5] # 0.0242
print(f'using weights {weights} for custom blend')
# Row-wise blend: each row is passed to cust_blend and the result stored in 'prediction'.
sub0['prediction'] = sub0.apply(cust_blend, W = weights, axis=1)
sub0.head(3)

In [None]:
# Drop prediction0 .. prediction{len(weights)-1}; the blended 'prediction' column is not touched.
sub0 = clean_sub(sub0, weights)

In [None]:
# Preview the first rows of the blended frame.
sub0.head(2)

In [None]:
# Persist the first-stage blend without the row index.
# NOTE: '.gzip' is only part of the file name; no compression= argument is passed,
# so to_parquet uses its default codec.
sub0.to_parquet('init_blend.parquet.gzip', index=False)

### Second ensemble

In [None]:
# Reload the first-stage blend written earlier in this notebook.
sub0 = pd.read_parquet('init_blend.parquet.gzip')
sub0.head(2)

In [None]:
%%time

# Inputs for the second blend. Order matters: prep_subs turns the first frame
# into 'prediction0' and the following ones into 'prediction1', 'prediction2'.
subs = []
# init mix from first ensemble
subs.append(sub0)#pd.read_parquet('../input/0237-ensemble-submission-handm/init_lsan_lgbm_it_ensemble.parquet.gzip'))

# stuff for 2nd ensemble
# Sorted by customer_id with a fresh index, like the first-stage inputs.
# NOTE(review): prep_subs attaches columns by index, so this assumes sub0 has the
# same customers in the same order - confirm.
subs.append(pd.read_csv('../input/h-m-framework-for-partitioned-validation/submission.csv').sort_values('customer_id').reset_index(drop=True)) 
subs.append(pd.read_csv('../input/hm-00231-solution/submission.csv').sort_values('customer_id').reset_index(drop=True))

In [None]:
# Build the wide frame for the second blend; subs[0] is modified in place and returned.
sub0 = prep_subs(subs)
sub0.head(2)

In [None]:
# History of weight vectors tried for the second blend.
# NOTE(review): the trailing numbers are presumably leaderboard scores - confirm.
# weights = [1.20, 0.85, 0.8]
# weights = [1.20, 0.85] # .0241
# weights = [1.20, 0.90] # .0241
# weights = [1.20, 1.00] # .0239
# weights = [1.30, 0.85] # .0240
# weights = [1.20, 0.85, 0.1] # .0241
# weights = [1.20, 0.85, 0.3] # .0241
# weights = [1.20, 1.1, 0.9] # 0.0241
# weights = [1.20, 0.85, 0.9] # 0.0242
# weights = [1.20, 0.9, 0.85] # 0.0242

# Weights used for the final blend, in the order the frames were appended to subs.
weights = [1.20, 0.85, 0.75] # (no score recorded)
# Row-wise blend: each row is passed to cust_blend and the result stored in 'prediction'.
sub0['prediction'] = sub0.apply(cust_blend, W = weights, axis=1)
sub0.head(3)

In [None]:
# Drop prediction0 .. prediction{len(weights)-1}; the blended 'prediction' column is not touched.
sub0 = clean_sub(sub0, weights)
sub0.head(2)

In [None]:
# Log the weights that produced the final blend.
print(f'2nd ensemble weights: {weights}')

# Make a submission

In [None]:
# Write the final submission without the row index.
# With to_csv's default compression='infer', the '.gz' suffix makes pandas gzip the output.
sub0.to_csv('the69toRuleThemAllv3.csv.gz', index=False)