In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

from collections import defaultdict

In [None]:
# Article metadata; ids are read as strings so that leading zeros survive.
# NOTE(review): article_df is not referenced again in the cells shown here.
article_df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
    dtype={'article_id': str},
)


In [None]:
%%time
# Load only the columns needed to build baskets; article ids stay strings so
# that their leading zeros are preserved.
transaction_df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={'article_id': str},
    usecols=['t_dat', 'customer_id', 'article_id'],
)

# One row per basket: every article a customer bought on the same day.
transaction_df = transaction_df.groupby(['customer_id', 't_dat'], as_index=False)[['article_id']].agg(list)
# Optional (disabled): restrict to recent history, e.g. t_dat >= '2019-09-01'.

# De-duplicate every basket exactly once, *before* filtering, so that no column
# is ever assigned on a filtered slice (avoids chained-assignment warnings) and
# the set is not rebuilt a second time per row.
transaction_df['article_id'] = transaction_df['article_id'].apply(lambda basket: list(set(basket)))
transaction_df['num_articles'] = transaction_df['article_id'].apply(len)

# Keep baskets with 2..17 distinct articles: a single item carries no
# co-purchase signal, and the upper bound drops very large baskets.
basket_size_ok = (transaction_df['num_articles'] > 1) & (transaction_df['num_articles'] < 18)
transaction_df = transaction_df[basket_size_ok]

transaction_df.head()

In [None]:
# Summary statistics of the distinct-article count per basket.
transaction_df['num_articles'].describe()

In [None]:
# 99th percentile of the basket size.
transaction_df['num_articles'].quantile(q=0.99)

In [None]:
# Each remaining row of transaction_df is one basket.
print("number of transactions:", transaction_df.shape[0])

In [None]:
# Count in how many baskets each article appears (baskets are de-duplicated,
# so an article contributes at most once per basket).
item_map = defaultdict(int)
for basket in transaction_df['article_id'].values:
    for article in basket:
        item_map[article] += 1

# Tabular view of the same counts: one row per article.
item_df = pd.DataFrame.from_dict({
    'item': list(item_map.keys()),
    'freq': list(item_map.values()),
})
item_df.head()

In [None]:
# Only articles seen in more than 20 baskets are eligible for pair counting.
item_df = item_df.loc[item_df['freq'] > 20]
candidate_items = set(item_df['item'].values)

In [None]:
%%time
# pair_map[a][b] = number of baskets that contain both a and b.
# The mapping is stored symmetrically, i.e. pair_map[a][b] == pair_map[b][a].
pair_map = {}
for it, article_ids in enumerate(transaction_df['article_id'].values):
    if it % 1000000 == 0:
        print(it)

    # Drop non-candidate articles up front; relative order is kept, so pairs are
    # visited in the same order as a nested index loop with skip checks.
    kept = [article for article in article_ids if article in candidate_items]
    for pos, item1 in enumerate(kept):
        for item2 in kept[pos + 1:]:
            partners_of_1 = pair_map.setdefault(item1, {})
            partners_of_2 = pair_map.setdefault(item2, {})
            partners_of_1[item2] = partners_of_1.get(item2, 0) + 1
            partners_of_2[item1] = partners_of_2.get(item1, 0) + 1

In [None]:
# Flatten the nested co-occurrence map into (item1, item2, joint_freq) records,
# keeping only pairs that were bought together in more than 20 baskets.
frequent_pairs = [
    (left, right, count)
    for left, partners in pair_map.items()
    for right, count in partners.items()
    if count > 20
]

pair_df = pd.DataFrame(frequent_pairs, columns=['item1', 'item2', 'joint_freq'])
# Attach each side's own basket count (looked up in item_map).
pair_df['item_freq1'] = pair_df['item1'].map(lambda key: item_map[key])
pair_df['item_freq2'] = pair_df['item2'].map(lambda key: item_map[key])
pair_df.head()

In [None]:
# confidence(item1 -> item2) = joint_freq / item_freq1, i.e. the share of
# item1's baskets that also contain item2. Rows are ordered per item1 from the
# highest to the lowest confidence.
pair_df = (
    pair_df[pair_df['joint_freq'] > 20]
    .assign(confidence=lambda rules: rules['joint_freq'].div(rules['item_freq1']))
    .sort_values(['item1', 'confidence'], ascending=[True, False])
)

# Keep the 10 highest-confidence partners of every item1.
pair_df = pair_df.groupby('item1').head(10)
pair_df.head()

In [None]:
# After the per-item top-10 cut, (a, b) can survive while (b, a) was dropped, so
# the table is no longer symmetric and len(pair_df) // 2 would miscount.
# Count distinct unordered pairs instead.
unordered_pairs = {frozenset(pair) for pair in zip(pair_df['item1'], pair_df['item2'])}
print("number of pairs:", len(unordered_pairs))

In [None]:
# Distribution of the confidence values among the retained rules.
pair_df['confidence'].describe()

In [None]:
# Preview the first rows of the association table.
pair_df.head()

In [None]:
# Force a garbage-collection pass before the prediction stage.
gc.collect()

# Predict based on the item associations

In [None]:
# Collapse the per-day baskets into one row per customer; article_id becomes a
# list of baskets (a list of lists).
# NOTE: not idempotent - re-running this cell nests the lists one level deeper.
transaction_df = (
    transaction_df
    .groupby('customer_id', as_index=False)
    .agg({'article_id': list})
)
transaction_df.head()

In [None]:
# Space-separated fallback prediction, used for customers that end up without
# an association-based recommendation.
# NOTE(review): presumably a precomputed best-seller list - confirm its origin.
popular_articles = " ".join([
    "0706016001",
    "0720125001",
    "0706016002",
    "0372860001",
    "0759871002",
    "0610776002",
    "0751471001",
    "0372860002",
    "0673677002",
    "0706016003",
    "0464297007",
    "0562245046",
])


In [None]:
def get_recommended_items(articles, top_k=12):
    """Recommend articles that are frequently bought together with ``articles``.

    Looks up every rule in the module-level ``pair_df`` whose ``item1`` is one
    of ``articles``, sums ``joint_freq`` per ``item2`` and ranks the partner
    articles by that total, highest first.

    Parameters
    ----------
    articles : iterable of str
        Article ids to look up as ``item1``.
    top_k : int, default 12
        Maximum number of recommended article ids to return.

    Returns
    -------
    str
        Up to ``top_k`` distinct article ids joined by single spaces; an empty
        string when no rule matches.
    """
    matches = pair_df[pair_df.item1.isin(list(articles))]
    if matches.empty:
        return ''

    # The aggregated frame must be kept: without assigning it, the ranking is
    # lost and duplicate item2 values can appear in the output.
    ranked = (
        matches.groupby('item2', as_index=False)[['joint_freq']].sum()
        # mergesort is stable, so ties fall back to the item2 order produced by
        # groupby and the result is deterministic.
        .sort_values('joint_freq', ascending=False, kind='mergesort')
    )
    return ' '.join(ranked['item2'].head(top_k).tolist())

In [None]:
# Articles that still have at least one outgoing rule in pair_df.
final_candidate_items = set(pair_df['item1'].unique())
print(len(final_candidate_items))

In [None]:
# Build one prediction string per customer that bought at least one article
# with an outgoing association rule.
preds = []
customer_baskets = zip(transaction_df['customer_id'], transaction_df['article_id'])
# zip over the two columns avoids the per-row Series that iterrows() creates.
# The enumerate counter matches the row label as long as transaction_df still
# has the default 0..n-1 index from the per-customer groupby.
for it, (customer_id, baskets) in enumerate(customer_baskets):
    # Report progress before any `continue`, so skipped customers still count.
    if it % 100000 == 0:
        print(it)

    # Flatten the customer's baskets into the set of distinct purchased articles.
    purchased = {article for basket in baskets for article in basket}
    cur_articles = final_candidate_items.intersection(purchased)
    if not cur_articles:
        continue

    preds.append({
        'customer_id': customer_id,
        'prediction': get_recommended_items(cur_articles),
    })

# Explicit columns keep the schema stable even when preds is empty; otherwise
# the later merge on customer_id would fail on a column-less frame.
pred_df = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
pred_df.head()

In [None]:
%%time
# Start from the full customer list of the sample submission so that every
# customer receives a row.
sub_df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv",
    usecols=['customer_id'],
)
sub_df = sub_df.merge(pred_df, how='left', on='customer_id')

# Customers without an association-based prediction fall back to the popular
# articles. Assign the result back: an in-place fillna on `sub_df.prediction`
# acts on a temporary column object and leaves sub_df unchanged under pandas
# Copy-on-Write.
sub_df['prediction'] = sub_df['prediction'].fillna(popular_articles)
sub_df.to_csv("submission.csv", index=False)

sub_df.head()