In [1]:
import polars as pl

# Submission files to blend, each paired with the score noted next to it
# (the file names suggest these are leaderboard scores), listed weakest first.
_scored_submissions = (
    ('i2i_submission_already_likes.csv', 0.521),
    ('baseline_submission.csv', 0.563),
    ('candidate-rerank-model-lb-0-575.csv', 0.575),
    ('otto-pipeline2-lb-0-576.csv', 0.576),
    ('otto-tuning-candidate-rerank-model-lb-0-577.csv', 0.577),
)
# Only the file names are used downstream; the scores are kept for reference.
paths = [file_name for file_name, _score in _scored_submissions]

In [2]:
def read_sub(path, weight=1):
    '''Load one submission CSV and reshape it to one row per predicted aid.

    Parameters
    ----------
    path
        CSV file with a `labels` column holding space-separated aids.
    weight
        Vote attached to every prediction coming from this file. The default
        of 1 for each submission amounts to a standard vote ensemble.
        NOTE(review): the vote is cast to UInt8 below, so non-integer or
        out-of-range weights will not survive the cast -- confirm before
        switching to weighted votes.

    Returns
    -------
    A frame in long format: the CSV's other columns, `aid` (UInt32) and
    `vote` (UInt8).
    '''
    frame = pl.read_csv(path)

    # Split the label string into a list and tag every row with this file's vote.
    frame = frame.with_column(pl.col('labels').str.split(by=' '))
    frame = frame.with_column(pl.lit(weight).alias('vote'))

    # One row per predicted aid.
    frame = frame.explode('labels').rename({'labels': 'aid'})

    # Compact dtypes (`aid` -> UInt32, `vote` -> UInt8): memory management matters
    # here so that we do not run out of resources.
    frame = frame.with_column(pl.col('aid').cast(pl.UInt32))
    frame = frame.with_column(pl.col('vote').cast(pl.UInt8))
    return frame

In [17]:
# Load every submission with the default vote; `subs` is a list holding one
# long-format frame per entry in `paths`, in the same order.
subs = list(map(read_sub, paths))
subs[0].head()  # peek at the first frame

session_type,aid,vote
str,u32,u8
"""12899779_click...",1048547,1
"""12899779_click...",894169,1
"""12899779_click...",631502,1
"""12899779_click...",1854910,1
"""12899779_click...",1566282,1


In [18]:
def _outer_join_votes(frames, keys=('session_type', 'aid')):
    '''Outer-join the per-submission vote frames into one wide frame.

    Parameters
    ----------
    frames
        Non-empty list of frames that each carry the `keys` columns and a
        `vote` column.
    keys
        Columns identifying one candidate (a session/type and an aid).

    Returns
    -------
    One frame with a row per candidate seen in any submission and one vote
    column per submission: `vote`, `vote_right`, `vote_right2`, ...
    A vote is null where that submission did not list the candidate.
    '''
    joined = frames[0]
    for position, frame in enumerate(frames[1:], start=1):
        # The first join uses the '_right' suffix; later joins get a numbered
        # suffix so every submission's vote ends up in its own column.
        suffix = '_right' if position == 1 else f'_right{position}'
        joined = joined.join(frame, how='outer', on=list(keys), suffix=suffix)
    return joined


# `subs` is rebound here from the list of per-submission frames to the single
# joined frame, so the loading cell has to be re-run before re-running this one.
subs = _outer_join_votes(subs)
subs.head()

session_type,aid,vote,vote_right,vote_right2,vote_right3,vote_right4
str,u32,u8,u8,u8,u8,u8
"""12899779_click...",59625,1.0,1,1.0,1.0,1.0
"""12899779_click...",737445,,1,1.0,1.0,1.0
"""12899779_click...",1804863,,1,,,
"""12899779_click...",499621,,1,,,
"""12899779_click...",941596,,1,,,


In [19]:
# Every column whose name starts with 'vote' holds one submission's vote for the
# candidate in that row. Deriving the list from the frame (instead of spelling
# out five names) keeps this cell correct when submissions are added or removed.
vote_cols = [col for col in subs.columns if col.startswith('vote') and col != 'vote_sum']
assert vote_cols, 'no per-submission vote columns found -- re-run the join cell first'

# Build vote + vote_right + vote_right2 + ... as a single expression.
vote_total = pl.col(vote_cols[0])
for vote_col in vote_cols[1:]:
    vote_total = vote_total + pl.col(vote_col)

# NOTE(review): the vote columns are u8, so the total presumably stays u8 --
# confirm it cannot exceed 255 if larger weights are ever used.
subs = (
    subs
    .fill_null(0)  # a null vote means that submission did not list this candidate
    .with_column(vote_total.alias('vote_sum'))
    .drop(vote_cols)
    # ascending sort followed by reverse() => highest vote_sum first
    .sort(by='vote_sum')
    .reverse()
)

In [20]:
%%time
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

preds = preds.with_column(pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst])))

CPU times: user 6min 11s, sys: 4.17 s, total: 6min 15s
Wall time: 4min 48s


In [23]:
# Display the final predictions (one row per session_type, labels as a
# space-separated string of aids) as a sanity check before writing them out.
preds

session_type,labels
str,str
"""14504045_click...","""1587170 208593..."
"""13528660_click...","""272744 1197804..."
"""13474676_carts...","""1431605 123775..."
"""13845309_click...","""930070 1412262..."
"""13291109_carts...","""513932 1556772..."
"""14534031_order...","""7226 809665 10..."
"""14407164_click...","""583140 1320174..."
"""13776094_click...","""1210062 958133..."
"""14150417_carts...","""717527 1500972..."
"""13872862_order...","""1730711 677663..."


In [24]:
# Write the blended predictions to disk as the submission file.
# LB - 0.574
submission_file = 'voting_5subs_submission.csv'
preds.write_csv(submission_file)