In [None]:
import cudf
import gc

# The idea comes from Chris Deotte's notebook, which finds the articles that are frequently purchased together: https://www.kaggle.com/code/cdeotte/customers-who-bought-this-frequently-buy-this

In [None]:
# Load only the columns needed for pair counting. 'price' and
# 'sales_channel_id' are never used below, so they are skipped at read time
# instead of being loaded and deleted afterwards.
train = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
)

# Replace customer_id and t_dat by compact integer codes to save memory and
# speed up the joins/group-bys that follow. The codes are only used as
# grouping keys, so their actual values carry no meaning.
# NOTE(review): assumes fewer than 2**31 distinct customers and fewer than
# 2**15 distinct dates, otherwise the casts below would overflow.
train['customer_id'] = train['customer_id'].factorize()[0].astype('int32')
train['t_dat'] = train['t_dat'].factorize()[0].astype('int16')
gc.collect()

# Show the (rows, columns) shape and a preview of the encoded frame
print(train.shape)
train.head(10)

In [None]:
def calc_pairs(train, top_n=1):
    """Find, for every article, the article(s) most often bought alongside it.

    Two articles count as "bought together" when the same ``customer_id`` has
    a row for each of them on the same ``t_dat``. Every transaction row takes
    part in the pairing, so an article that appears several times in one
    basket contributes once per row.

    Parameters
    ----------
    train : DataFrame
        Transactions with at least the columns ``customer_id``, ``t_dat`` and
        ``article_id``. The frame is not modified.
    top_n : int, default 1
        How many co-purchased articles to keep per ``article_id``.

    Returns
    -------
    DataFrame
        Columns ``article_id``, ``pair`` and ``count``, ordered by
        ``article_id`` descending, then ``count`` descending. Ties on
        ``count`` are broken by the smallest ``pair`` so that repeated runs
        return the same rows.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError('top_n must be >= 1')

    basket_keys = ['customer_id', 't_dat']
    baskets = train[basket_keys + ['article_id']]

    # Self-join on (customer, day): every row is matched with every article of
    # the same basket. This yields the same rows as collecting each basket
    # into a list and exploding it, without the intermediate list column.
    df = baskets.merge(
        baskets.rename(columns={'article_id': 'pair'}),
        on=basket_keys,
        how='inner',
    )
    del baskets
    gc.collect()

    # Only the two article columns are needed from here on
    df = df[['article_id', 'pair']]
    gc.collect()

    # Drop self-pairs (an article matched with itself)
    df = df.loc[df['article_id'] != df['pair']].reset_index(drop=True)
    gc.collect()

    # Count how many times each (article, pair) combination happens
    df = df.groupby(['article_id', 'pair']).size().rename('count').reset_index()
    gc.collect()

    # Most frequent partner first within each article; 'pair' ascending is an
    # explicit tie-breaker so equal counts never depend on the sort's whim.
    df = df.sort_values(
        ['article_id', 'count', 'pair'],
        ascending=[False, False, True],
    ).reset_index(drop=True)
    gc.collect()

    # Keep only the top_n most frequent partners of each article
    df['rank'] = df.groupby('article_id')['pair'].cumcount()
    df = df.loc[df['rank'] < top_n].reset_index(drop=True)
    del df['rank']
    gc.collect()

    return df

In [None]:
%%time

# Build the table of most frequent co-purchased articles from the `train`
# frame prepared earlier; the %%time cell magic reports how long this cell takes.
pairs = calc_pairs(train)
# Bare last expression so the notebook renders the resulting frame
pairs

In [None]:
# Save the pairs table as a Parquet file; the path is relative, so the file
# is written to the current working directory.
pairs.to_parquet('top1-article-pairs.parquet')