In [None]:
# !pip install cupy

In [None]:
# import cupy as np
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Read the raw competition tables: the full purchase history and the
# sample submission (used later for the list of customers to predict for).
transactions_df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
)
submission_df = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)

In [None]:
def get_most_bought_articles(data, num_articles=5):
    """Return the ids of the articles bought at least ``num_articles`` times.

    Purchases are counted as the number of non-null ``t_dat`` entries per
    ``article_id`` in ``data``. Despite its name, ``num_articles`` is the
    minimum purchase count an article needs in order to be kept.

    Returns:
        Array of article ids, ordered from most to least bought.
    """
    purchase_counts = (
        data[['article_id', 't_dat']]
        .groupby('article_id')
        .count()
        .reset_index()
        .rename(columns={'t_dat': 'count'})
        .sort_values(by='count', ascending=False)
    )

    is_popular = purchase_counts['count'] >= num_articles
    return purchase_counts.loc[is_popular, 'article_id'].values

In [None]:
# Create training dataset with positive examples.
# The training data will contain all transactions starting from 01/07/2020.
# Only items that have been bought at least 10 times will be kept. Also, we
# are only going to compute the information for the customers that appear
# in these transactions.
start_date = pd.to_datetime('2020-07-01')

# Parse the dates once and keep only the recent rows. Boolean indexing plus
# assign() builds a new frame, so the raw transactions_df stays untouched
# without first duplicating the whole table in memory.
parsed_dates = pd.to_datetime(transactions_df['t_dat'])
is_recent = parsed_dates >= start_date
filtered_transactions_df = transactions_df.loc[is_recent].assign(t_dat=parsed_dates.loc[is_recent])

# Articles bought at least 10 times in the recent window, in ascending id
# order so that an article's position in the array is its model index.
most_bought_articles = get_most_bought_articles(filtered_transactions_df, num_articles=10)
most_bought_articles = np.sort(most_bought_articles)

# Keep only the transactions of those articles (single boolean filter).
filtered_transactions_df = filtered_transactions_df.loc[
    filtered_transactions_df['article_id'].isin(most_bought_articles)
]

# Customers with at least one remaining transaction, in ascending id order.
recent_customers = np.sort(filtered_transactions_df['customer_id'].unique())

num_articles = len(most_bought_articles)
num_customers = len(recent_customers)

# Create dictionaries with mapping keys
articles_id_to_idx = dict(zip(most_bought_articles, range(num_articles)))
customers_id_to_idx = dict(zip(recent_customers, range(num_customers)))

# Positive examples: one row per remaining transaction, with ids replaced by
# their model indices. Building a fresh frame avoids assigning columns onto a
# slice of another frame; the original row index is preserved.
train_df = pd.DataFrame(
    {
        'customer_id': filtered_transactions_df['customer_id'].map(customers_id_to_idx),
        'article_id': filtered_transactions_df['article_id'].map(articles_id_to_idx),
        'bought': 1.0,
    }
)

train_df

In [None]:
# Generate negative examples
np.random.seed(47)

# Start from the positive rows only, so that re-running this cell does not
# stack another batch of negatives on top of an already augmented train_df.
# On the first run every row has bought == 1, so nothing is filtered out.
positive_data = train_df.loc[train_df['bought'] == 1]
num_transactions = positive_data.shape[0]

# One random (article, customer) pair per positive example.
# NOTE: the pairs are drawn uniformly and are not checked against the real
# purchases, so a sampled "negative" pair may coincide with a positive one.
negative_data = pd.DataFrame(
    {
        'article_id': np.random.choice(num_articles, num_transactions),
        'customer_id': np.random.choice(num_customers, num_transactions),
        'bought': np.zeros(num_transactions)
    }
)

# Merge positives and negatives and shuffle the rows.
train_df = pd.concat([positive_data, negative_data])
train_df = train_df.sample(frac=1).reset_index(drop=True)

train_df

In [None]:
def compute_rmse(y_pred, y_true):
    """Return the root mean squared error between predictions and targets."""
    residuals = y_pred - y_true
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)


def evaluate(predict_f,data_train,data_test):
    """RMSE-based predictive performance evaluation with pandas.

    Args:
        predict_f: callable taking ``(customer, article)`` and returning the
            predicted value for that pair.
        data_train: not used by the computation; kept so existing callers
            that pass it keep working.
        data_test: frame with ``customer_id``, ``article_id`` and ``bought``
            columns; one prediction is made per row.

    Returns:
        The RMSE between the predictions and ``data_test.bought``.
    """
    estimated = np.array([
        predict_f(customer, article)
        for customer, article in zip(data_test.customer_id, data_test.article_id)
    ])
    real = data_test.bought.values
    return compute_rmse(estimated, real)


# Recommender
class ItemBasedRecommender:
    """Item-based recommender built on top of a matrix factorization.

    Customer and article latent factors are learned with stochastic gradient
    descent on ``(customer, article, bought)`` samples. Recommendations are
    derived from the cosine similarity between article factors: a customer is
    recommended the articles most similar to the one they bought last.
    """

    def __init__(self, df, num_articles, num_customers, num_components=10):
        """Store the training samples and the model dimensions.

        Args:
            df: frame with ``customer_id`` and ``article_id`` columns holding
                row indices into the factor matrices, and a ``bought`` target.
            num_articles: number of rows of the article factor matrix.
            num_customers: number of rows of the customer factor matrix.
            num_components: number of latent factors per customer/article.
        """
        self.num_components = num_components
        self.num_articles = num_articles
        self.num_customers = num_customers

        self.train = df

        # Plain arrays are much cheaper to index inside the SGD loop.
        self.articles = self.train.article_id.values
        self.customers = self.train.customer_id.values
        self.bought = self.train.bought.values

    def __sdg__(self):
        """Run one SGD pass over ``self.training_indices``.

        The method name (a misspelling of "SGD") is kept unchanged so that
        existing callers keep working.
        """
        for idx in tqdm(self.training_indices):
            customer_idx = self.customers[idx]
            article_idx = self.articles[idx]
            real_bought = self.bought[idx]

            prediction = self.__predict_train(customer_idx, article_idx)
            error = real_bought - prediction

            # Snapshot both factor vectors so that the two updates below are
            # computed from the same values the error was computed from
            # (the article update must not see the updated customer factors).
            customer_factors = self.customers_lat_mat[customer_idx].copy()
            article_factors = self.articles_lat_mat[article_idx].copy()

            # Update latent factors
            self.customers_lat_mat[customer_idx] += self.learning_rate * (
                error * article_factors - self.lmbda * customer_factors
            )
            self.articles_lat_mat[article_idx] += self.learning_rate * (
                error * customer_factors - self.lmbda * article_factors
            )

    def fit(self, n_epochs=10, learning_rate=0.001, lmbda=0.1):
        """Learn the customer and article latent factors with SGD.

        Args:
            n_epochs: number of passes over the training samples.
            learning_rate: SGD step size.
            lmbda: L2 regularization strength.

        After training, the customer factor matrix is deleted: only the
        article factors are needed by ``predict``.
        """
        self.learning_rate = learning_rate
        self.lmbda = lmbda
        self.n_samples = self.train.shape[0]

        # Kept as attributes for compatibility; no per-epoch evaluation is
        # currently run, so both lists stay empty.
        self.train_rmse = []
        self.test_rmse = []

        # Initialize latent matrices
        self.customers_lat_mat = np.random.normal(scale=1., size=(self.num_customers, self.num_components))
        self.articles_lat_mat = np.random.normal(scale=1., size=(self.num_articles, self.num_components))

        for epoch in range(n_epochs):
            print('Epoch: {}'.format(epoch))

            # Visit the samples in a fresh random order every epoch.
            self.training_indices = np.random.permutation(self.n_samples)
            self.__sdg__()

        # Free memory: predictions only rely on the article factors.
        del self.customers_lat_mat

    def __predict_train(self, customer_idx, article_idx):
        """Predict ``bought`` for a single (customer, article) index pair, clipped to [0, 1]."""
        prediction = np.dot(self.customers_lat_mat[customer_idx], self.articles_lat_mat[article_idx])
        prediction = np.clip(prediction, 0, 1)

        return prediction

    @staticmethod
    def _format_articles(article_ids):
        """Join article ids with spaces, prefixing every id with '0'."""
        return ' '.join('0' + str(article) for article in article_ids)

    def predict(self, transactions_df, customers, most_bought_articles, articles_id_to_idx,
                customers_id_to_idx, num_recommendations=12):
        """Build the recommendation string of every customer.

        Args:
            transactions_df: transactions with ``customer_id``, ``article_id``
                and ``t_dat`` columns; used to find each customer's latest
                purchase and the overall best sellers.
            customers: iterable of customer ids to produce predictions for.
            most_bought_articles: article ids, positioned by model index.
            articles_id_to_idx: mapping article id -> model index.
            customers_id_to_idx: mapping customer id -> model index; customers
                missing from it receive the default recommendation.
            num_recommendations: number of articles recommended per customer.

        Returns:
            DataFrame with ``customer_id`` and ``prediction`` columns. Customers
            that are unknown, have no transaction, or whose last article is not
            in ``articles_id_to_idx`` get the best-selling articles instead.
        """
        most_bought_articles = np.asarray(most_bought_articles)

        # Cosine similarity between article factors (normalized in place to
        # avoid allocating extra article-by-article matrices).
        similarity_matrix = np.dot(self.articles_lat_mat, self.articles_lat_mat.T)
        norms = np.sqrt(np.sum(self.articles_lat_mat ** 2, axis=1)).reshape(-1, 1)
        similarity_matrix /= norms
        similarity_matrix /= norms.T

        # Indices of the most similar articles of each article, best first.
        most_similar_idx = np.argsort(similarity_matrix, axis=1)[:, ::-1][:, :num_recommendations]
        del similarity_matrix

        # Fallback recommendation: the overall best sellers.
        purchase_counts = (
            transactions_df[['article_id', 't_dat']]
            .groupby('article_id')
            .count()
            .reset_index()
            .rename(columns={'t_dat': 'count'})
            .sort_values(by='count', ascending=False)
        )
        top_sold_articles = purchase_counts['article_id'].values[:num_recommendations]
        default_recommendation = self._format_articles(top_sold_articles)

        # Latest purchased article of every customer, keyed by customer id so
        # the lookup does not depend on any positional alignment between the
        # groupby order and customers_id_to_idx.
        last_rows = transactions_df.loc[transactions_df.groupby('customer_id')['t_dat'].idxmax()]
        last_article_by_customer = dict(zip(last_rows['customer_id'].values, last_rows['article_id'].values))

        # Many customers share the same last article: format each list once.
        formatted_by_article_idx = {}
        recommendations = []

        for customer in tqdm(customers):
            article_idx = None
            if customer in customers_id_to_idx and customer in last_article_by_customer:
                article_idx = articles_id_to_idx.get(last_article_by_customer[customer])

            if article_idx is None:
                recommendation = default_recommendation
            else:
                recommendation = formatted_by_article_idx.get(article_idx)
                if recommendation is None:
                    recommended_articles = most_bought_articles[most_similar_idx[article_idx]]
                    recommendation = self._format_articles(recommended_articles)
                    formatted_by_article_idx[article_idx] = recommendation

            recommendations.append(recommendation)

        predictions_df = pd.DataFrame({'customer_id': customers, 'prediction': recommendations})

        return predictions_df
                
    

In [None]:
# Train the factorization on the positive + negative samples.
recommender = ItemBasedRecommender(
    train_df,
    num_articles,
    num_customers,
    num_components=1000,
)
recommender.fit(n_epochs=15)

In [None]:
# Produce one recommendation string per customer listed in the sample submission.
customers = submission_df['customer_id'].values
submission = recommender.predict(
    filtered_transactions_df,
    customers,
    most_bought_articles,
    articles_id_to_idx,
    customers_id_to_idx,
)
submission

In [None]:
# Write the predictions to disk without the frame's row index.
submission.to_csv(path_or_buf='submission.csv', index=False)