In [1]:
#This notebook takes inspiration from https://www.kaggle.com/code/andradaolteanu/h-m-eda-rapids-and-similarity-recommenders

# Libraries
import os
import gc
import pandas as pd
import numpy as np
from numpy import dot, sqrt

!pip install turicreate --user

import turicreate as tc
import cudf
import cuml
import cupy

from cuml.model_selection import train_test_split

### Useful Functions

In [2]:
def adjust_id(x):
    """Return ``x`` as a string, prefixed with "0" when it is exactly 9 characters long.

    Any value whose string form has a different length is returned unchanged
    (apart from the ``str`` conversion).
    Presumably this restores the leading zero lost when article ids are parsed
    as integers -- TODO confirm against the data.
    """
    text = str(x)
    return "0" + text if len(text) == 9 else text

# Dataset

**Three data files have been provided. We will also load the sample submission file.**

In [3]:
# Read the CSV files into pandas DataFrames (paths are relative to the
# notebook's working directory).
articles_df = pd.read_csv("./articles.csv")
# NOTE(review): customers_df and transactions_df are not referenced by any later
# cell visible in this notebook (the transactions used downstream are read from
# a parquet file) -- confirm these loads are still needed.
customers_df = pd.read_csv("./customers.csv")
transactions_df = pd.read_csv("./transactions_train.csv")
# NOTE(review): ss_df is reassigned from a different CSV later in the notebook,
# so the frame loaded here is discarded at that point.
ss_df = pd.read_csv("./sample_submission.csv")

### More Helper Functions

In [4]:
def create_predictions_format(data):
    """Collapse per-row recommendations into one prediction string per customer.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``customer_id`` and ``article_id``.
        The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` (sorted by the groupby) with a ``preds``
        column holding that customer's distinct article ids, in order of first
        appearance, joined by single spaces. Each id is first normalised with
        ``adjust_id``.
    """
    # Work on a derived Series so the caller's frame keeps its original
    # ``article_id`` values and does not gain a ``preds`` column.
    article_ids = data["article_id"].map(adjust_id)

    # A single aggregation replaces the earlier "map the group result back onto
    # every row, then group again and take the first" round trip.
    preds = (
        article_ids.groupby(data["customer_id"])
        .agg(lambda ids: " ".join(ids.unique()))
        .rename("preds")
    )

    return preds.reset_index()


def get_frequent_purchases(transactions, n=50):
    """Return, per customer, the articles that make up at least ``n`` percent of their transactions.

    ``transactions`` needs the columns ``customer_id``, ``article_id`` and
    ``t_dat``; its grouped result must support ``.to_pandas()``.
    The selected articles are formatted by ``create_predictions_format`` and
    handed back wrapped in a ``cudf.DataFrame``.
    """
    # Transactions per (customer, article) pair
    pair_counts = (
        transactions.groupby(["customer_id", "article_id"])["t_dat"]
        .count()
        .reset_index()
        .rename(columns={"t_dat": "count"})
    )

    # Transactions per customer overall
    customer_totals = (
        transactions.groupby(["customer_id"])["t_dat"]
        .count()
        .reset_index()
        .rename(columns={"t_dat": "full_count"})
    )

    # Share (in percent) of each article within its customer's transactions
    shares = pair_counts.merge(customer_totals, on="customer_id", how="left")
    shares["perc"] = (shares["count"] / shares["full_count"]) * 100

    # Keep the pairs that reach the threshold and move them to pandas for formatting
    frequent = shares[shares["perc"] >= n].reset_index(drop=True).to_pandas()

    return cudf.DataFrame(create_predictions_format(frequent))

In [5]:
# Read the transactions parquet file into a cuDF (RAPIDS) DataFrame.
# This `transactions` frame is the one every later cell filters and aggregates.
transactions = cudf.read_parquet("../input/hm-fashion-recommender-dataset/transactions.pqt")

### Data Preprocessing

In [6]:
# Keep only the last 16 characters of customer_id, parse them as hexadecimal
# and store the result as int64.
# NOTE(review): after this runs customer_id is no longer a string column, so
# re-running the cell on the same frame presumably fails -- reload first.
transactions['customer_id'] = transactions['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# Parse the transaction date column into datetimes.
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])
# Keep only the three columns the later cells read.
transactions = transactions[['t_dat','customer_id','article_id']]

In [7]:
# How many customers to keep: the ones with the most transactions are selected
# with .head(TOP_CUSTOMERS) further down.
TOP_CUSTOMERS = 400000
# How many article ids to keep: the most frequently purchased ones are selected
# with .head(TOP_N) further down.
TOP_N = 10000

In [8]:
# Count how often each article id occurs in the transactions
# (one row per article id, with explicit column names).
most_frequent_articles = transactions["article_id"].value_counts().reset_index()
most_frequent_articles.columns = ["article_id", "count"]
print("Total Unique IDs in Transactions:", len(most_frequent_articles))
# NOTE(review): this prints the configured TOP_N, not the number of ids actually kept.
print("Total Unique IDs that are selected:", TOP_N)
# Take the article ids of the first TOP_N rows as a CuPy array
# (value_counts() orders by descending count by default).
most_frequent_articles = cupy.asarray(most_frequent_articles.head(TOP_N)["article_id"])
# Keep only the transactions whose article is among the selected ids.
transactions = transactions[transactions["article_id"].isin(most_frequent_articles)].reset_index(drop=True)

In [9]:
# Keep only the TOP_CUSTOMERS customers with the most remaining transactions.
# value_counts() orders by descending frequency, so the leading entries of its
# index are the most active customers. The ids are read from that index rather
# than from reset_index()["index"], because the column name reset_index()
# assigns differs between pandas/cuDF versions.
customers_top_trans = list(
    transactions["customer_id"].value_counts().head(TOP_CUSTOMERS).index.to_pandas()
)
transactions = transactions[transactions["customer_id"].isin(customers_top_trans)].reset_index(drop=True)

print("Total unique users to recommend:", transactions["customer_id"].nunique())

In [10]:
# Convert the CuPy array of selected article ids into a plain Python list so it
# can be used with the pandas article metadata.
most_frequent_articles = most_frequent_articles.tolist()
# Restrict the article metadata to the selected articles.
articles_df = articles_df[articles_df['article_id'].isin(most_frequent_articles)]
# Display (rows, columns) of the filtered metadata.
articles_df.shape

In [11]:
# Both id collections have served their purpose (the frames are already
# filtered); drop the names and run a garbage-collection pass.
del most_frequent_articles, customers_top_trans
gc.collect()

# Creating Dummy Dataset

In [12]:
# Count, for each (customer, article) pair, how many transaction rows exist.
train = transactions.groupby(["customer_id","article_id"])["t_dat"].count().reset_index()
train.columns = ["customer_id","article_id", "purchase_count"]
# The "dummy" table is a copy of the counts with an extra constant column:
# purchase_dummy is 1 for every pair that appears at all.
dummy_train = train.copy()
dummy_train['purchase_dummy'] = 1
dummy_train.head()

# Data Validation

In [13]:
def split_data(data, test_size=0.1, random_state=42):
    """Split ``data`` into train and test parts and convert both to Turi Create SFrames.

    Parameters
    ----------
    data : cudf.DataFrame
        Table to split; passed directly to ``cuml``'s ``train_test_split``.
    test_size : float, optional
        Fraction of rows placed in the test part. Defaults to ``0.1``, the
        value that was previously hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` so that re-running the notebook
        reproduces the same split. Pass ``None`` for an unseeded split.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as ``tc.SFrame`` objects.
    """
    train, test = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # SFrame is built from pandas, so each part is moved off the GPU first.
    train_data = tc.SFrame(train.to_pandas())
    test_data = tc.SFrame(test.to_pandas())

    return train_data, test_data

In [14]:
# Split the purchase-count table and the binary purchase_dummy table into
# train/test SFrames; each call performs its own split.
train_data, test_data = split_data(train)
train_data_dummy, test_data_dummy = split_data(dummy_train)

# Collaborative Model

In [15]:
# Remove the listed code/number columns and the free-text description from the
# article metadata that is later passed to the recommender as item data.
# The drop is not done in place: articles_df is a filtered slice of the loaded
# frame at this point, and in-place edits on a slice can emit pandas'
# SettingWithCopyWarning. errors="ignore" lets this cell be re-run after the
# columns are already gone.
articles_df = articles_df.drop(
    columns=[
        'product_code',
        'product_type_no',
        'graphical_appearance_no',
        'perceived_colour_master_id',
        'department_no',
        'colour_group_code',
        'index_code',
        'index_group_no',
        'section_no',
        'garment_group_no',
        'detail_desc',
        'perceived_colour_value_id',
    ],
    errors="ignore",
)
articles_df.head()

In [16]:
# Column names handed to the recommender as the user and item identifiers.
user_id = 'customer_id'
item_id = 'article_id'
# Every distinct customer in the (unsplit) train table, as a Python list;
# these are the users recommendations are requested for.
users_to_recommend = list(train["customer_id"].unique().to_pandas())

In [17]:
def train_model(train_data, name, user_id, item_id, target, users_to_recommend):
    '''
    Trains an item-similarity recommender and produces recommendations.

    Parameters
    ----------
    train_data : tc.SFrame
        Observations to fit the model on.
    name : str
        Similarity measure; either 'cosine' or 'pearson'.
    user_id, item_id, target : str
        Column names of the user identifier, the item identifier and the
        target value in ``train_data``.
    users_to_recommend : list
        Users for which recommendations are generated.

    Returns
    -------
    tuple
        ``(model, recom)``: the fitted model and its recommendations
        (at most 12 items per user).

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported similarity measures. Without
        this check an unknown name left ``model`` unassigned and failed later
        with an UnboundLocalError.
    '''
    supported = ('cosine', 'pearson')
    if name not in supported:
        raise ValueError(
            "Unsupported similarity name %r; expected one of %s" % (name, ", ".join(supported))
        )

    # Both measures use the same call; only similarity_type differs.
    model = tc.item_similarity_recommender.create(train_data,
                                                  user_id=user_id,
                                                  item_id=item_id,
                                                  target=target,
                                                  similarity_type=name,
                                                  verbose=False)

    # k is set to 12 => maximum items to recommend for one customer
    recom = model.recommend(users=users_to_recommend, k=12, verbose=False)

    return model, recom

# Prediction

In [18]:
# Similarity measure passed as similarity_type when the final model is created.
METHOD = "cosine"
# Target column of the training SFrame used for the final model.
TARGET = "purchase_dummy"

In [20]:
# Fit the item-similarity recommender on the training part of the dummy table,
# using METHOD as similarity type and TARGET as target column; the filtered
# article metadata is supplied as item side data.
collaborative_model = tc.item_similarity_recommender.create(train_data_dummy, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=TARGET,
                                                    similarity_type=METHOD,
                                                    item_data = tc.SFrame(articles_df),
                                                    verbose=False)

# Ask for up to 12 recommendations per user in users_to_recommend, convert the
# result to a DataFrame and collapse it to one row per customer with a
# space-separated "preds" string.
pred_recom = collaborative_model.recommend(users=users_to_recommend, k=12, verbose=False)
pred_recom_df = pred_recom.to_dataframe()
pred_recom_df = create_predictions_format(pred_recom_df)

# Getting the rest of the predictions from another algorithm

In [21]:
# Load the predictions of the other algorithm; later cells read its
# 'customer_id' and 'prediction' columns.
ss_df = cudf.read_csv("../input/submission_colab_merge.csv")
# Build the same integer key that was derived for the transactions (last 16
# hex characters of customer_id parsed as int64) so the two sources can be joined.
ss_df['customer_id_new'] = ss_df['customer_id'].str[-16:].str.hex_to_int().astype('int64')
# Left-join the collaborative-filtering predictions. They are held in
# pred_recom_df (output of create_predictions_format); the name recom_df used
# here before is not defined anywhere in the notebook and raised a NameError on
# a fresh kernel. Both frames carry a 'customer_id' column, so the merge result
# exposes them as 'customer_id_x' (submission side) and 'customer_id_y'.
ss_df = ss_df.merge(cudf.DataFrame(pred_recom_df[["customer_id", "preds"]]),
              left_on="customer_id_new", right_on="customer_id", how="left")

In [22]:
# Remove the helper join key and the integer customer id that came from the
# right-hand frame of the merge; only the submission-side id is kept.
del ss_df['customer_id_new']
del ss_df['customer_id_y']

In [24]:
# NOTE(review): this import sits mid-notebook rather than in the imports cell at the top.
import numba
# Stops CUDA profiling through numba.
# NOTE(review): no matching profile_start() is visible in this notebook -- confirm why this call is needed.
numba.cuda.profile_stop()

def _split_prediction(value):
    """Return the whitespace-separated ids in ``value``, or ``[]`` when it is not a string."""
    return value.split() if isinstance(value, str) else []


def merge_preds(x, weights=(1.05, 1), top_k=12):
    """Blend two ranked prediction strings of one row into a single ranking.

    Parameters
    ----------
    x : row-like
        Object exposing ``prediction`` and ``preds`` attributes (for example a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)``). Each is a
        space-separated string of article ids ordered best-first, or a missing
        value (``None`` / ``NaN``).
    weights : sequence of two numbers, optional
        Weight of ``prediction`` and of ``preds`` respectively. Defaults to the
        previously hard-coded ``(1.05, 1)``.
    top_k : int, optional
        Maximum number of ids in a blended result. Defaults to 12.

    Returns
    -------
    str
        Space-separated article ids. When both inputs are strings, every id
        scores ``weight / rank`` per list it appears in (rank starts at 1),
        scores are summed and the ``top_k`` highest-scoring ids are returned;
        ties keep first-seen order. When only one input is a string, that list
        is returned untruncated (whitespace-normalised), preferring
        ``prediction``. When neither is a string the result is ``""``.
    """
    primary, secondary = x.prediction, x.preds

    # Missing values arrive as None or as float NaN; neither has .split(), so
    # anything that is not a string is treated as "no predictions".
    if not (isinstance(primary, str) and isinstance(secondary, str)):
        return ' '.join(_split_prediction(primary) or _split_prediction(secondary))

    scores = {}
    for weight, ranked in zip(weights, (primary.split(), secondary.split())):
        for rank, article in enumerate(ranked, start=1):
            scores[article] = scores.get(article, 0) + weight / rank

    # sorted() is stable, so equal scores keep the order in which ids were first seen.
    best_first = sorted(scores, key=lambda article: -scores[article])
    return ' '.join(best_first[:top_k])

# Move the merged frame to pandas so merge_preds can be applied row by row.
ss_pandas = ss_df.to_pandas()
ss_pandas['final_preds'] = ss_pandas.apply(merge_preds, axis=1)
# The two source prediction columns are no longer needed once they are blended.
del ss_pandas['prediction']
del ss_pandas['preds']
ss_pandas.head()

# Generating Submission File

In [25]:
# Rename the remaining columns to 'customer_id' and 'prediction' (in place).
ss_pandas.rename({'customer_id_x': 'customer_id', 'final_preds': 'prediction'}, axis=1, inplace=True) 

In [26]:
# Write the final frame to submissions.csv in the working directory, without the index column.
ss_pandas.to_csv('submissions.csv', index=False)