In [None]:
import numpy as np
import pandas as pd

from collections import defaultdict
from heapq import nlargest
from tqdm import tqdm
from pathlib import Path

# Directory containing the competition CSV files that this notebook reads.
data_path = Path('/kaggle/input/h-and-m-personalized-fashion-recommendations/')

In [None]:
# article_id has to be parsed as text: left to inference, pandas would read it
# as an integer and strip the leading '0' from every id.
transactions = pd.read_csv(data_path / 'transactions_train.csv', dtype={'article_id': str})

# Sample submission: one row per customer_id with a placeholder prediction.
submission = pd.read_csv(data_path / 'sample_submission.csv')

In [None]:
# Preview the first five transaction rows to sanity-check the load.
transactions.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
submission.head(n=5)

In [None]:
# For each customer_id, count each article_id they've previously purchased,
# then predict each customer's 12 most frequently purchased articles.

counter = defaultdict(dict) # nested dict: customer_id -> {article_id: purchase count}

# Walk the two needed columns directly. DataFrame.iterrows() materialises a
# Series for every row, which is dramatically slower on a table this size and
# carries along columns that are never used here.
for customer_id, article_id in tqdm(
    zip(transactions['customer_id'], transactions['article_id']),
    total=len(transactions),  # lets tqdm show real progress instead of a bare counter
):
    purchase_dict = counter[customer_id]
    purchase_dict[article_id] = purchase_dict.get(article_id, 0) + 1

# Space-separated top 12 articles per customer. nlargest is stable, so articles
# with equal counts stay in the order the customer first bought them.
top_purchases_by_customer = {
    customer_id: ' '.join(nlargest(12, purchase_dict, key=purchase_dict.get))
    for customer_id, purchase_dict in tqdm(counter.items())
}

most_common_benchmark = submission.set_index('customer_id', drop=True)

# Write every prediction in one aligned assignment instead of one .loc write per
# customer. Customers with no purchase history keep the prediction that came
# with the sample submission, and the result has exactly the submission's rows
# (a customer missing from the sample submission is not appended).
most_common_benchmark['prediction'] = (
    pd.Series(top_purchases_by_customer, dtype=object)
    .reindex(most_common_benchmark.index)
    .fillna(most_common_benchmark['prediction'])
)

most_common_benchmark.to_csv('most_common_benchmark.csv')