In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torchvision.models as models
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms

from sklearn.neighbors import NearestNeighbors

In [None]:
all_embeddings = torch.load('all_embeddings.pt')
all_embeddings = torch.tensor(all_embeddings)

In [None]:
nbrs = NearestNeighbors(n_neighbors=12, algorithm='ball_tree').fit(all_embeddings)

As for now, we have identified the embeddings for each article, and created the nn classifier. The next step is to find the neighbors for each of the articles

In [None]:
# NOTICE: Time consuming, it would take around 105 * 80s, do not run this unless you are sure

import time

result = []

for i in range(0, 105):
    print(f"Start processing batch {i}")
    start = time.time()
    sub_group = all_embeddings[i*1000:(i+1)*1000]
    _, neighbors = nbrs.kneighbors(sub_group)
    result.append(neighbors)
    end = time.time()
    print(f"Finish processing batch {i}, time consumed {end - start}")

For the next step, we are going to retrieve the recommendations based on customer's purchase history in the past 7 days

In [None]:
all_item_ids = []

for dirname, _, filenames in os.walk('h-and-m-personalized-fashion-recommendations/images/'):
    for filename in filenames:
        tokens = filename.split('.')
        all_item_ids.append(int(tokens[0]))

In [None]:
all_transactions = pd.read_csv('h-and-m-personalized-fashion-recommendations/transactions_train.csv')

In [None]:
purchase_list = all_transactions.groupby('customer_id')['article_id'].apply(list).reset_index(name='new')

In [None]:
embedding_neighbors = torch.load('embedding_neighbors.pt')

article_id_to_embedding_neighbors_dict = {
    article_id: neig for article_id, neig in zip(all_item_ids, embedding_neighbors)
}

In [None]:
from collections import Counter

customer_id_to_recommendations = {}

for idx, row in purchase_list.iterrows():
    if idx % 10000 == 0:
        print(f"Processing {idx} customer ...")
    
    customer_id = row['customer_id']
    purchased_articles = row[1]
    
    recommendation = {}
    for article in purchased_articles:
        if article in article_id_to_embedding_neighbors_dict:
            for rec in article_id_to_embedding_neighbors_dict[article]:
                if rec in purchased_articles:
                    continue
                if rec not in recommendation:
                    recommendation[rec] = 0
                recommendation[rec] += 1
    
    if idx % 10000 == 0:
        print(f"Find {len(recommendation)} new items for {customer_id}")
    
    li = sorted(list(recommendation.items()), key=lambda x: x[1], reverse=True)
    customer_id_to_recommendations[customer_id] = li[:12]


In [61]:
default = [(0, 0)] * 12

In [62]:
for customer_id, rec in customer_id_to_recommendations.items():
    if len(rec) == 0:
        customer_id_to_recommendations[customer_id] = default

In [77]:
customers = pd.read_csv('h-and-m-personalized-fashion-recommendations/customers.csv')
customers = customers['customer_id']

In [80]:
customers = customers.to_list()

In [90]:
submission = {'customer_id': [], 'prediction': []}
for customer in customers:
    if customer in customer_id_to_recommendations:
        rec = customer_id_to_recommendations[customer]
        submission['customer_id'].append(customer)
        prediction = [str(elem[0]) if type(elem) == tuple else str(elem) for elem in rec]
        submission['prediction'].append(" ".join(prediction))
    else:
        submission['customer_id'].append(customer)
        submission['prediction'].append(" ".join(["0"] * 12))

In [91]:
first_try = pd.DataFrame(submission)

In [92]:
first_try

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,90388 87488 103826 101665 80061 92114 65185 71...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,63941 63604 43184 27208 50441 51046 38374 8582...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,53575 69399 78835 85927 25016 51833 76994 9107...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,56520 43352 39263 47324 78015 46737 3085 76201...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,47816 8134 35324 59500 73519 104713 10712 3719...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,14549 48492 31273 71747 43042 12480 36711 5898...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,74331 88547 12622 22619 22881 20166 20078 7808...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,3003 32365 5195 450 17702 44232 20802 408 3145...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,72063 79680 101934 99991 55519 76199 66658 102...


In [93]:
first_try.to_csv('first_submission.csv', index=False)