In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Directory holding the precomputed feature tables read later in the notebook
# (scaled_articles.csv and scaled_customers.csv).
input_eda = '/kaggle/input/eda-analysis/'
# Directory holding the competition data (transactions_train.csv is read from here).
input_trans = '/kaggle/input/h-and-m-personalized-fashion-recommendations/'

In [None]:
# Load the purchase log; `t_dat` is parsed into datetimes, every other column
# keeps pandas' inferred dtype.
# NOTE(review): `article_id` is therefore inferred (most likely int64). If the ids
# carry leading zeros in the CSV they are lost here -- confirm before writing
# the ids back out as text.
transactions_path = f'{input_trans}transactions_train.csv'
transactions = pd.read_csv(filepath_or_buffer=transactions_path, parse_dates=['t_dat'])
transactions.head()

In [None]:
# customer_id -> set of every article_id that customer has bought.
articles_by_customers = (
    transactions
    .groupby('customer_id')['article_id']
    .apply(set)
    .to_dict()
)

# Number of distinct buyers per article, most widely bought first.
articles_popularity = (
    transactions
    .groupby('article_id')['customer_id']
    .nunique()
    .sort_values(ascending=False)
)

# Article ids in descending popularity order; used as the fallback recommendation list.
most_popular = articles_popularity.index

In [None]:
# Article feature table, indexed by article_id.
# NOTE(review): presumably one row of already-scaled numeric features per article,
# produced by the eda-analysis dataset -- verify against that notebook.
art_path = input_eda + 'scaled_articles.csv'
scaled_articles = pd.read_csv(filepath_or_buffer=art_path, index_col='article_id')

# Customer feature table, indexed by customer_id (same assumption as above).
cust_path = input_eda + 'scaled_customers.csv'
scaled_customers = pd.read_csv(filepath_or_buffer=cust_path, index_col='customer_id')

In [None]:
from sklearn.neighbors import NearestNeighbors

# Number of articles recommended per customer; also reused as the number of
# neighbours each k-NN query returns.
NR_PREDS = 12

# Both indexes share the same settings: NR_PREDS neighbours, all CPU cores.
_knn_kwargs = {'n_neighbors': NR_PREDS, 'n_jobs': -1}

# One nearest-neighbour index over the article features, one over the customer features.
model_art = NearestNeighbors(**_knn_kwargs).fit(scaled_articles)
model_cust = NearestNeighbors(**_knn_kwargs).fit(scaled_customers)

In [None]:
# Each result is a (distances, neighbour_positions) pair with one row per query
# point and NR_PREDS columns; positions refer to rows of the fitted table.
# NOTE(review): the query set is the training set itself, so each point is most
# likely returned as its own nearest neighbour (distance 0), leaving
# NR_PREDS - 1 genuinely different neighbours -- confirm this is intended.
preds_art = model_art.kneighbors(X=scaled_articles, return_distance=True)
preds_cust = model_cust.kneighbors(X=scaled_customers, return_distance=True)

In [None]:
# For every article: distances to its nearest neighbours (preds_art[0]) and the
# ids of those neighbours. preds_art[1] holds row positions into scaled_articles,
# so the positions are translated to ids with a single vectorised NumPy lookup
# instead of one scalar Index lookup per neighbour.
article_ids = scaled_articles.index.to_numpy()
dists_articles = pd.DataFrame(preds_art[0], index=scaled_articles.index)
similar_articles = pd.DataFrame(article_ids[preds_art[1]], index=scaled_articles.index)

# Same two tables for customers: neighbour distances and neighbour customer ids.
customer_ids = scaled_customers.index.to_numpy()
dists_users = pd.DataFrame(preds_cust[0], index=scaled_customers.index)
similar_users = pd.DataFrame(customer_ids[preds_cust[1]], index=scaled_customers.index)

For both articles and customers, take the nearest-neighbour results and build two tables per entity: the distances to its closest neighbours and the IDs of those neighbours. The distances are used later to sort candidate articles when choosing what to recommend to each user.

In [None]:
def recommend_from_similar_articles(purchased, neighbour_dists, neighbour_ids, fallback, n_preds):
    """Pick up to ``n_preds`` articles that are close to what a customer bought.

    Parameters
    ----------
    purchased : set
        Article ids the customer has already bought.
    neighbour_dists : dict
        article id -> list of distances to that article's nearest neighbours.
    neighbour_ids : dict
        article id -> list of neighbour article ids, aligned with ``neighbour_dists``.
    fallback : list
        Article ids used to fill remaining slots, in priority order.
    n_preds : int
        Number of recommendations wanted.

    Returns
    -------
    list
        Recommended article ids, best first. Shorter than ``n_preds`` only if
        the neighbours and ``fallback`` together do not supply enough ids.
    """
    # Smallest distance at which each candidate appears next to any purchase.
    best_distance = {}
    for bought in purchased:
        # A purchased article without a feature row has no neighbours to offer;
        # skip it instead of failing the whole run with a KeyError.
        if bought not in neighbour_ids or bought not in neighbour_dists:
            continue
        for dist, neighbour in zip(neighbour_dists[bought], neighbour_ids[bought]):
            if neighbour not in best_distance or dist < best_distance[neighbour]:
                best_distance[neighbour] = dist

    # Closest first; ties are broken by article id so the output is reproducible.
    ranked = sorted(best_distance, key=lambda article: (best_distance[article], article))

    # 1) Similar articles the customer has not bought yet.
    picks = [article for article in ranked if article not in purchased]
    # 2) Still short: allow already-bought articles, keeping the distance order.
    if len(picks) < n_preds:
        picks.extend(article for article in ranked if article in purchased)
    picks = picks[:n_preds]
    # 3) Still short: top up from the fallback list (only duplicates are
    #    excluded here, not previously bought articles).
    if len(picks) < n_preds:
        taken = set(picks)
        for article in fallback:
            if article not in taken:
                picks.append(article)
                taken.add(article)
                if len(picks) == n_preds:
                    break
    return picks


# Width of an article id in the submission file.
# NOTE(review): the competition's ids are 10 characters with a leading zero that
# is lost when the id column is parsed as an integer; padding restores it and is
# a no-op for ids that are already 10-character strings -- confirm against
# sample_submission.csv.
ARTICLE_ID_WIDTH = 10

# Built once, outside the customer loop: plain dict lookups replace two
# DataFrame.loc calls per purchased article.
article_neighbour_dists = dict(zip(dists_articles.index, dists_articles.to_numpy().tolist()))
article_neighbour_ids = dict(zip(similar_articles.index, similar_articles.to_numpy().tolist()))
popular_articles = list(most_popular)

choices = {}
for customer in scaled_customers.index:
    purchased = articles_by_customers.get(customer)
    if purchased is None:
        # No purchase history: recommend the overall best sellers.
        choices[customer] = popular_articles[:NR_PREDS]
    else:
        choices[customer] = recommend_from_similar_articles(
            purchased, article_neighbour_dists, article_neighbour_ids, popular_articles, NR_PREDS
        )

pd.DataFrame({
    'customer_id': list(choices),
    'prediction': [
        ' '.join(str(article).zfill(ARTICLE_ID_WIDTH) for article in picks)
        for picks in choices.values()
    ],
}).to_csv('submission_art.csv', index=False)

Article-based selection: for each customer, we take the articles they bought and look up the articles most similar to each of them. We sort these candidates by their distance to the bought article and keep the 12 closest ones that the customer hasn't bought yet. If fewer than 12 are available, we top up with already-bought articles, and if that is still not enough, with articles from the `most_popular` list. Customers with no purchase history simply get the 12 most popular articles.

In [None]:
def recommend_from_similar_customers(own_articles, neighbours, purchases, fallback, n_preds):
    """Pick up to ``n_preds`` articles bought by a customer's nearest neighbours.

    Parameters
    ----------
    own_articles : set or frozenset
        Article ids the customer has already bought (empty if none).
    neighbours : iterable
        Ids of the most similar customers, closest first.
    purchases : dict
        customer id -> set of article ids bought by that customer.
    fallback : list
        Article ids used to fill remaining slots, in priority order.
    n_preds : int
        Number of recommendations wanted.

    Returns
    -------
    list
        Recommended article ids in the order they were collected. Shorter than
        ``n_preds`` only if neighbours and ``fallback`` together run out of ids.
    """
    picks = []
    taken = set()  # mirrors `picks` for constant-time duplicate checks
    for neighbour in neighbours:
        if len(picks) >= n_preds:
            break
        # Neighbours without any recorded purchase contribute nothing.
        for article in purchases.get(neighbour, ()):
            if article in taken or article in own_articles:
                continue
            picks.append(article)
            taken.add(article)
            if len(picks) == n_preds:
                break
    # Not enough neighbour purchases: top up from the fallback list (only
    # duplicates are excluded here, not previously bought articles).
    if len(picks) < n_preds:
        for article in fallback:
            if article not in taken:
                picks.append(article)
                taken.add(article)
                if len(picks) == n_preds:
                    break
    return picks


# Width of an article id in the submission file.
# NOTE(review): the competition's ids are 10 characters with a leading zero that
# is lost when the id column is parsed as an integer; padding restores it and is
# a no-op for ids that are already 10-character strings -- confirm against
# sample_submission.csv.
ARTICLE_ID_WIDTH = 10

popular_articles = list(most_popular)
no_purchases = frozenset()  # shared empty history for customers without transactions

choices = {}
# similar_users is indexed by customer id with one row of neighbour ids each;
# walking the rows directly avoids a .loc lookup per customer.
for customer, neighbours in zip(similar_users.index, similar_users.to_numpy()):
    choices[customer] = recommend_from_similar_customers(
        articles_by_customers.get(customer, no_purchases),
        neighbours,
        articles_by_customers,
        popular_articles,
        NR_PREDS,
    )


pd.DataFrame({
    'customer_id': list(choices),
    'prediction': [
        ' '.join(str(article).zfill(ARTICLE_ID_WIDTH) for article in picks)
        for picks in choices.values()
    ],
}).to_csv('submission_cust.csv', index=False)

Customer-based selection: for each customer, we take the most similar customers and collect the articles they bought that our customer hasn't. If we can't find 12 such articles, we fill the remaining slots from the list of the most popular ones.