In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Root folder of the competition data. File names are appended to it by plain string
# concatenation further down, so the trailing slash is required.
input_dir: str = '/kaggle/input/h-and-m-personalized-fashion-recommendations/'

In [None]:
# Article metadata table, indexed by article_id.
articles_path = input_dir + 'articles.csv'
articles = pd.read_csv(
    articles_path,
    index_col='article_id',
)
articles.head(5)

In [None]:
# Missing values per article column.
articles.isna().sum()

In [None]:
# Another look at the first rows before choosing feature columns.
articles.head(5)

In [None]:
# Article columns used as features for the article-similarity model.
article_features = [
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]

In [None]:
# Take an explicit copy of the selected feature columns: this frame is modified later
# (the 'index_code' column is re-encoded), and writing into a bare column selection of
# `articles` raises SettingWithCopyWarning and may not behave consistently.
train_articles = articles[article_features].copy()

In [None]:
# Feature columns still stored with the generic 'object' dtype.
object_cols = [name for name, dtype in train_articles.dtypes.items() if dtype == 'object']
object_cols

In [None]:
# Frequency of each distinct 'index_code' value.
train_articles.loc[:, 'index_code'].value_counts()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Replace the 'index_code' labels with integer codes. The encoder is always fitted on the
# untouched `articles` column, and the result is attached with `assign`, which builds a new
# frame instead of writing into `train_articles` in place. This avoids SettingWithCopyWarning
# and makes the cell safe to re-run.
encoder = OrdinalEncoder()
index_code_ids = encoder.fit_transform(articles[['index_code']]).astype(int).ravel()
train_articles = train_articles.assign(index_code=index_code_ids)
train_articles.head()

In [None]:
# Customer metadata table (default integer index; customer_id stays a regular column).
customers_path = input_dir + 'customers.csv'
customers = pd.read_csv(
    customers_path,
)
customers.head(5)

In [None]:
# Missing values per customer column.
customers.isna().sum()

In [None]:
# Distinct club membership states and how often each occurs.
customers.loc[:, 'club_member_status'].value_counts()

In [None]:
# Distinct newsletter frequencies and how often each occurs.
customers.loc[:, 'fashion_news_frequency'].value_counts()

In [None]:
# Customer columns kept for the customer-similarity model.
features = [
    'customer_id',
    'club_member_status',
    'fashion_news_frequency',
    'age',
]

In [None]:
from sklearn.impute import SimpleImputer

# Ordinal codes for the two categorical customer columns. A label missing from these
# mappings becomes NaN under `.map`, and the following `.astype(int)` then raises.
club_status_codes = {'LEFT CLUB': 0, 'PRE-CREATE': 1, 'ACTIVE': 2}
news_frequency_codes = {'NONE': 0, 'None': 0, 'Monthly': 1, 'Regularly': 2}

# Explicit copy: the column assignments below would otherwise write into a selection of
# `customers`, which raises SettingWithCopyWarning.
train_customers = customers[features].copy()

# Missing membership status is treated as 'PRE-CREATE'; missing newsletter frequency as 'NONE'.
train_customers['club_member_status'] = (
    customers['club_member_status'].fillna('PRE-CREATE').map(club_status_codes).astype(int)
)
train_customers['fashion_news_frequency'] = (
    customers['fashion_news_frequency'].fillna('NONE').map(news_frequency_codes).astype(int)
)

# Fill missing ages with the mean age.
imputer = SimpleImputer(strategy='mean').fit(customers[['age']])
train_customers[['age']] = imputer.transform(train_customers[['age']])

# customer_id becomes the index so that only the model features remain as columns.
train_customers = train_customers.set_index('customer_id')
train_customers.head()

In [None]:
# Example submission file shipped with the data.
sample_path = input_dir + 'sample_submission.csv'
samples = pd.read_csv(
    sample_path,
)
samples.head(5)

In [None]:
# Purchase history; the 't_dat' column is parsed as datetimes on load.
transactions_path = input_dir + 'transactions_train.csv'
transactions = pd.read_csv(
    transactions_path,
    parse_dates=['t_dat'],
)
transactions.head(5)

In [None]:
# Missing values per transaction column.
transactions.isna().sum()

In [None]:
# customer_id -> set of article_ids found in that customer's transactions.
purchases_per_customer = transactions.groupby('customer_id')['article_id']
articles_by_customers = purchases_per_customer.apply(set).to_dict()

In [None]:
# Popularity of an article = number of distinct customers who bought it.
articles_popularity = transactions.groupby('article_id')['customer_id'].nunique()

In [None]:
# Article ids ordered from the most to the fewest distinct buyers.
popularity_desc = articles_popularity.sort_values(ascending=False)
most_popular = popularity_desc.index

In [None]:
# Display the popularity-ordered article ids (pandas abbreviates the repr of a long Index).
most_popular

In [None]:
# Number of neighbours requested from the nearest-neighbour models, and the number of
# articles recommended per customer.
NR_PREDS: int = 12

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every customer feature, then restore the index and column labels that
# scikit-learn drops when it returns a plain array.
customer_feature_matrix = StandardScaler().fit_transform(train_customers)
scaled_customers = pd.DataFrame(
    customer_feature_matrix,
    index=train_customers.index,
    columns=train_customers.columns,
)
scaled_customers.head()

In [None]:
# Same standardisation for the article features, keeping article_id as the index.
article_feature_matrix = StandardScaler().fit_transform(train_articles)
scaled_articles = pd.DataFrame(
    article_feature_matrix,
    index=train_articles.index,
    columns=train_articles.columns,
)
scaled_articles.head()

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the standardised article features, using all CPU cores.
article_knn_params = dict(n_neighbors=NR_PREDS, n_jobs=-1)
model = NearestNeighbors(**article_knn_params).fit(scaled_articles)

In [None]:
# (distances, positional indices) of the NR_PREDS nearest articles for every article.
preds = model.kneighbors(X=scaled_articles, return_distance=True)

In [None]:
# preds[0] holds neighbour distances and preds[1] the neighbours' row positions.
dists = pd.DataFrame(preds[0], index=articles.index)

# Translate row positions into article ids with one vectorised NumPy lookup. This replaces
# a Python-level double loop that called Index.__getitem__ once per (article, neighbour)
# pair; the resulting table is the same.
article_ids = articles.index.to_numpy()
similar_articles = pd.DataFrame(article_ids[preds[1]], index=articles.index)

In [None]:
# First rows of the neighbour-distance table.
dists.head(5)

In [None]:
# First rows of the neighbour-id table.
similar_articles.head(5)

In [None]:
#choices = {}
#for customer in customers.customer_id:
#    if customer not in articles_by_customers:
#        choices[customer] = most_popular[:NR_PREDS]
#    else:
#        similar_to_bought = set()
#        for article_id in articles_by_customers[customer]:
#            similar_to_bought.update(list(zip(dists.loc[article_id].to_list(), similar_articles.loc[article_id].to_list())))
#        similar_to_bought = sorted(list(similar_to_bought), key=lambda x: x[0])
#        customer_choices = []
#        for _, article in similar_to_bought:
#            if article not in customer_choices and article not in articles_by_customers[customer]:
#                customer_choices.append(article)
#                if len(customer_choices) == NR_PREDS:
#                    break
#        if len(customer_choices) < NR_PREDS:
#            for _, article in similar_to_bought:
#                if article not in customer_choices:
#                    customer_choices.append(article)
#                    if len(customer_choices) == NR_PREDS:
#                        break
#        if len(customer_choices) < NR_PREDS:
#            for article in most_popular:
#                if article not in customer_choices:
#                    customer_choices.append(article)
#                    if len(customer_choices) == NR_PREDS:
#                        break
#        choices[customer] = customer_choices

In [None]:
# pd.DataFrame({'customer_id': choices.keys(), 'prediction': list(' '.join(map(str, x)) for x in choices.values())}).to_csv('submission_art.csv', index=False)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Rebind `model` to a nearest-neighbour index over the standardised customer features.
customer_knn_params = dict(n_neighbors=NR_PREDS, n_jobs=-1)
model = NearestNeighbors(**customer_knn_params).fit(scaled_customers)

In [None]:
# Query the fitted customer data without passing X. scikit-learn then does not count a
# point as its own neighbour, so every returned neighbour is a *different* customer.
# Passing the training matrix explicitly returned each customer (or an exact duplicate)
# at distance 0 and spent one neighbour slot on a user who contributes no new articles.
# The result is still a (distances, indices) pair with n_neighbors columns each.
preds = model.kneighbors()

In [None]:
# preds[0] holds neighbour distances and preds[1] the neighbours' row positions.
dists = pd.DataFrame(preds[0], index=train_customers.index)

# Translate row positions into customer ids with one vectorised NumPy lookup. This replaces
# a Python-level double loop that did one Index lookup per (customer, neighbour) pair; the
# resulting table is the same.
customer_ids = train_customers.index.to_numpy()
similar_users = pd.DataFrame(customer_ids[preds[1]], index=train_customers.index)

In [None]:
def _recommend_for_customer(customer, neighbours, purchases, popular, n_preds):
    """Build the recommendation list for one customer.

    Articles bought by the given neighbours are collected in neighbour order, skipping
    duplicates and anything the customer already bought. If that yields fewer than
    ``n_preds`` articles, the list is topped up from ``popular`` (duplicates skipped;
    already-bought articles are allowed in this fallback, as in the original loop).

    Parameters
    ----------
    customer : hashable
        Id of the customer to recommend for.
    neighbours : iterable
        Ids of similar customers, in the order they should be consulted.
    purchases : dict
        Maps customer id -> set of article ids bought by that customer.
    popular : iterable
        Article ids in fallback order.
    n_preds : int
        Number of articles to return.

    Returns
    -------
    list
        Up to ``n_preds`` article ids (fewer only if ``popular`` runs out).
    """
    # Looked up once per customer instead of once per candidate article.
    already_bought = purchases.get(customer, set())
    picks = []

    for neighbour in neighbours:
        for article in purchases.get(neighbour, ()):
            if article not in picks and article not in already_bought:
                picks.append(article)
                if len(picks) >= n_preds:
                    return picks

    for article in popular:
        if article not in picks:
            picks.append(article)
            if len(picks) >= n_preds:
                break
    return picks


# customer_id -> list of recommended article ids, in the order of customers.customer_id.
choices = {}
for customer in customers.customer_id:
    choices[customer] = _recommend_for_customer(
        customer,
        similar_users.loc[customer],
        articles_by_customers,
        most_popular,
        NR_PREDS,
    )

In [None]:
# Write one row per customer: the id plus a space-separated list of recommended article ids.
# NOTE(review): the article ids were parsed as integers by read_csv, which drops any leading
# zero. The ids are padded back to 10 characters here on the assumption that the competition
# uses 10-character zero-padded article ids -- confirm against sample_submission.csv.
# `str(...).zfill(10)` leaves ids that are already 10-character strings untouched.
submission = pd.DataFrame({
    'customer_id': list(choices.keys()),
    'prediction': [
        ' '.join(str(article).zfill(10) for article in picks)
        for picks in choices.values()
    ],
})
submission.to_csv('submission_cust.csv', index=False)