In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from collections import defaultdict  
import math

In [None]:
# Load the transaction log, keeping only the three columns used in this notebook.
# Both ID columns are forced to str so they are kept exactly as written in the
# CSV (presumably to preserve leading zeros in article_id -- confirm against the data).
# t_dat is not parsed as a date here; it stays whatever read_csv infers.
df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', 
                 dtype = {'customer_id': str, 'article_id': str},
                 usecols=['t_dat', 'customer_id','article_id'])

In [None]:
# Size check: (n_rows, n_cols) of the transactions frame.
df.shape

In [None]:
# Customer table (all columns). In the visible cells only its 'customer_id'
# column is used, inside get_predict, to enumerate the customers to predict for.
customers = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')

In [None]:
# Size check: (n_rows, n_cols) of the customer table.
customers.shape

In [None]:
def get_sim_item(df, user_col, item_col, use_iif=False):
    """Build an item-to-item co-occurrence similarity table.

    Two items co-occur when the same user has both of them in ``df``. The raw
    co-occurrence weight of a pair is then normalised by
    ``sqrt(count(i) * count(j))``, where ``count(x)`` is the number of distinct
    users that have item ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log with at least the columns ``user_col`` and ``item_col``.
    user_col, item_col : str
        Names of the user and item columns.
    use_iif : bool, default False
        If False every co-occurrence adds 1. If True each co-occurrence adds
        ``1 / log(1 + n_items_of_user)``, so users with many items contribute
        less per pair.

    Returns
    -------
    sim_item_corr : dict[item, dict[item, float]]
        ``sim_item_corr[i][j]`` is the normalised similarity of ``j`` to ``i``.
        Every item in ``df`` is a key; an item that never co-occurs with
        another one maps to an empty dict.
    user_item_dict : dict[user, set[item]]
        The set of distinct items per user.
    """
    user_item_ = df.groupby(user_col)[item_col].agg(set).reset_index()
    user_item_dict = dict(zip(user_item_[user_col], user_item_[item_col]))

    sim_item = {}
    item_cnt = defaultdict(int)
    for items in tqdm(user_item_dict.values()):
        # The weight depends only on the size of this user's item set, so it is
        # computed once per user rather than once per item pair.
        weight = 1 / math.log(1 + len(items)) if use_iif else 1
        for i in items:
            item_cnt[i] += 1
            related = sim_item.setdefault(i, {})
            for relate_item in items:
                if i == relate_item:
                    continue
                related[relate_item] = related.get(relate_item, 0) + weight

    # Build the normalised table as a new structure. A shallow dict.copy()
    # would share the inner dicts with ``sim_item``, so "copy then overwrite"
    # silently rewrites the raw weights while they are being iterated.
    sim_item_corr = {}
    for i, related_items in tqdm(sim_item.items()):
        sim_item_corr[i] = {
            j: cij / math.sqrt(item_cnt[i] * item_cnt[j])
            for j, cij in related_items.items()
        }

    return sim_item_corr, user_item_dict

In [None]:
# def recommend(sim_item_corr, user_item_dict, user_id, top_k, item_num):  
#     rank = {}
#     interacted_items = user_item_dict[user_id]  
#     for i in interacted_items:
#         for j, wij in sorted(sim_item_corr[i].items(), key=lambda d: d[1], reverse=True)[0:top_k]:  
#             if j not in interacted_items:  
#                 rank.setdefault(j, 0)  
#                 rank[j] += wij  
#     return sorted(rank.items(), key=lambda d: d[1], reverse=True)[:item_num]  

In [None]:
def recommend(sim_item_corr, user_item_dict, user_id, top_k, item_num):
    """Rank candidate items for one user from an item-similarity table.

    For every item in the user's history, the ``top_k`` most similar items are
    taken from ``sim_item_corr`` and their similarities are summed per
    candidate. Items the user already has are not excluded from the result.

    Parameters
    ----------
    sim_item_corr : dict[item, dict[item, float]]
        Item-to-item similarities, as returned by ``get_sim_item``.
    user_item_dict : dict[user, iterable[item]]
        Items each user interacted with.
    user_id : hashable
        User to recommend for.
    top_k : int
        Number of nearest neighbours considered per history item.
    item_num : int
        Maximum number of recommendations returned.

    Returns
    -------
    list[tuple[item, float]]
        Up to ``item_num`` ``(item, score)`` pairs, highest score first.
        Empty when the user has no history.
    """
    rank = {}
    # Users without history and items without neighbours contribute nothing,
    # instead of aborting the whole scoring loop with a KeyError.
    interacted_items = user_item_dict.get(user_id, ())
    for i in interacted_items:
        neighbours = sim_item_corr.get(i, {})
        for j, wij in sorted(neighbours.items(), key=lambda d: d[1], reverse=True)[0:top_k]:
            rank[j] = rank.get(j, 0) + wij
    return sorted(rank.items(), key=lambda d: d[1], reverse=True)[:item_num]

In [None]:
# Same size check as earlier; no visible cell in between reassigns df.
df.shape

In [None]:
# Peek at the first rows (columns: t_dat, customer_id, article_id).
df.head()

In [None]:
# Number of distinct customers that appear in the transactions.
df['customer_id'].nunique()

In [None]:
# Order rows by t_dat so that a later per-customer tail() picks the latest rows.
# t_dat was not parsed as a date when loading, so the ordering is that of the
# raw column values -- presumably ISO 'YYYY-MM-DD' strings; confirm.
# NOTE(review): default sort is not stable, so rows sharing a t_dat may be reordered.
df = df.sort_values(by = 't_dat')

In [None]:
# First rows after sorting, i.e. the smallest t_dat values.
df.head()

In [None]:
# Last rows after sorting, i.e. the largest t_dat values.
df.tail()

In [None]:
# Keep only the last two rows of each customer in df's current row order
# (the two largest-t_dat rows once df has been sorted by t_dat).
# Customers with fewer than two rows keep all of them.
temp = df.groupby(['customer_id']).tail(2)

In [None]:
# Size of the reduced frame: at most two rows per customer.
temp.shape

In [None]:
# Repeat of the preceding shape check on temp.
temp.shape

In [None]:
# temp = df.sample(frac=0.1)

In [None]:
# df.loc[~df['customer_id'].isin(temp['customer_id'].unique())].shape

In [None]:
# item_sim_list, user_item = get_sim_item(df, user_col = 'customer_id', item_col = 'article_id', use_iif=True)
# item_sim_list, user_item = get_sim_item(temp.append(df.loc[~df['customer_id'].isin(temp['customer_id'].unique())]),
#                                         user_col = 'customer_id', item_col = 'article_id', use_iif=True)

# Build the item-item similarity table and the per-customer item sets from the
# reduced frame `temp` only (not the full df), with IIF weighting enabled.
item_sim_list, user_item = get_sim_item(temp, user_col = 'customer_id', item_col = 'article_id', use_iif=True)

In [None]:
# Score every customer that appears in the transactions: up to 12 recommendations
# each, using the 12 nearest neighbours of every item in the customer's history.
recom_item = []
# tqdm takes the total from len() of the array, so no hard-coded customer
# count is needed (a fixed number goes stale as soon as the data changes).
for user_id in tqdm(df['customer_id'].unique()):
    rank_item = recommend(item_sim_list, user_item, user_id, 12, 12)
    # One [user_id, item_id, score] row per recommended item.
    recom_item.extend([user_id, item_id, score] for item_id, score in rank_item)

In [None]:
# Turn the [user_id, item_id, score] rows into a frame. get_predict relies on
# the exact column names 'user_id' and 'item_id'; 'sim' is passed as pred_col.
recom_item = pd.DataFrame(recom_item, columns = ['user_id', 'item_id', 'sim'])

In [None]:
# Peek at the first recommendation rows.
recom_item.head()

In [None]:
# Popularity fallback: the 12 most frequent article_ids over all transactions,
# most frequent first, joined into a single comma-separated string.
top12_click = ','.join(
    str(article_id)
    for article_id in df['article_id'].value_counts().head(12).index
)
# top12_click

In [None]:
def get_predict(df, pred_col, top_fill, user_ids=None, top_n=12):
    """Turn scored recommendations into one prediction string per customer.

    Every customer gets the ``top_fill`` items appended as a fallback with
    scores -1, -2, ... in the given order. When a customer has the same item
    more than once, only its highest-scoring row is kept. The best ``top_n``
    items per customer are then joined with spaces, highest score first.

    Parameters
    ----------
    df : pandas.DataFrame
        Recommendations with columns ``'user_id'``, ``'item_id'`` and
        ``pred_col``. Not modified.
    pred_col : str
        Name of the score column (higher is better).
    top_fill : str
        Comma-separated fallback item IDs, best first.
    user_ids : iterable, optional
        Customers that must appear in the output. Defaults to the
        ``customer_id`` column of the notebook-level ``customers`` frame.
    top_n : int, default 12
        Maximum number of items per customer.

    Returns
    -------
    pandas.DataFrame
        Columns ``['customer_id', 'prediction']``, one row per customer found
        in ``df`` or ``user_ids``.
    """
    # Keep the fallback IDs as text. Converting them with int() strips leading
    # zeros, which corrupts the IDs and stops drop_duplicates from matching
    # them against the string IDs in ``df``.
    fill_items = [t.strip() for t in top_fill.split(',') if t.strip()]
    fill_scores = [-1.0 * i for i in range(1, len(fill_items) + 1)]

    if user_ids is None:
        # Backward-compatible default: the global customer table.
        user_ids = customers['customer_id'].unique()
    ids = list(dict.fromkeys(user_ids))  # de-duplicate, keep order

    # One block of len(fill_items) rows per customer: the customer ID repeated,
    # with the fallback items and their scores cycling in step.
    fill_df = pd.DataFrame({
        'user_id': np.repeat(np.asarray(ids, dtype=object), len(fill_items)),
        'item_id': fill_items * len(ids),
        pred_col: fill_scores * len(ids),
    })

    recs = df[['user_id', 'item_id', pred_col]].copy()
    # Compare and emit item IDs as text on both sides.
    recs['item_id'] = recs['item_id'].astype(str)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    combined = pd.concat([recs, fill_df], ignore_index=True)
    # A stable sort keeps real recommendations ahead of fallback rows on ties
    # and makes the output deterministic.
    combined = combined.sort_values(pred_col, ascending=False, kind='stable')
    combined = combined.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    # Rows are already ordered best-first, so the first top_n rows of each
    # customer are that customer's top_n items.
    combined = combined.groupby('user_id').head(top_n)

    out = combined.groupby('user_id')['item_id'].agg(' '.join).reset_index()
    out.columns = ['customer_id', 'prediction']
    return out

In [None]:
# Final per-customer prediction strings: recommendations ranked by 'sim',
# padded with the popular articles listed in top12_click.
result = get_predict(recom_item, 'sim', top12_click)

In [None]:
# Write the predictions to submission.csv without the index column.
result.to_csv('submission.csv', index=False)

In [None]:
# Sample submission from the competition data; in the visible cells it is only
# used for comparison against `result`.
sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [None]:
# Compare (rows, cols) of our output with the sample submission --
# presumably these should match; verify before submitting.
result.shape, sub.shape

In [None]:
# Peek at the generated predictions (columns: customer_id, prediction).
result.head()

In [None]:
# Peek at the sample submission for a visual format comparison.
sub.head()