In [2]:
import pandas as pd
import numpy as np
from math import sqrt
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import scikit_learn as skl
import scipy.stats as stats
import scipy.signal as signal
tqdm.pandas()

N = 12
df_trans = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',dtype={'article_id': str})
df_trans['t_dat'] = pd.to_datetime(df_trans['t_dat'])

In [5]:
df = df_trans[['t_dat', 'customer_id', 'article_id']].copy()
last_ts = df['t_dat'].max()

In [9]:
(last_ts - df['t_dat']).dt.floor('7D')

0          728 days
1          728 days
2          728 days
3          728 days
4          728 days
             ...   
31788319     0 days
31788320     0 days
31788321     0 days
31788322     0 days
31788323     0 days
Name: t_dat, Length: 31788324, dtype: timedelta64[ns]

In [10]:
df = df_trans[['t_dat', 'customer_id', 'article_id']].copy()
last_ts = df['t_dat'].max()
df['ldbw'] = df['t_dat'] - (last_ts - df['t_dat']).dt.floor('7D')
weekly_sales = df.drop('customer_id', axis=1).groupby(['ldbw', 'article_id']).count()
weekly_sales = weekly_sales.rename(columns={'t_dat': 'count'})
df = df.join(weekly_sales, on=['ldbw', 'article_id'])
weekly_sales = weekly_sales.reset_index().set_index('article_id')
last_day = last_ts.strftime('%Y-%m-%d')

In [11]:
df = df.join(
    weekly_sales.loc[weekly_sales['ldbw']==last_day, ['count']],
    on='article_id', rsuffix="_targ")

df['count_targ'].fillna(0, inplace=True)
del weekly_sales
df['quotient'] = df['count_targ'] / df['count']

purchase_dict = {}

for i in tqdm(df.index):
    cust_id = df.at[i, 'customer_id']
    art_id = df.at[i, 'article_id']
    t_dat = df.at[i, 't_dat']

    if cust_id not in purchase_dict:
        purchase_dict[cust_id] = {}

    if art_id not in purchase_dict[cust_id]:
        purchase_dict[cust_id][art_id] = 0
    
    x = max(1, (last_ts - t_dat).days)

    a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
    y = a / np.sqrt(x) + b * np.exp(-c*x) - d

    value = df.at[i, 'quotient'] * max(0, y)
    purchase_dict[cust_id][art_id] += value

target_sales = df.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()
general_pred = target_sales.nlargest(N).index.tolist()

100%|██████████| 31788324/31788324 [12:43<00:00, 41645.61it/s]


In [14]:
df['quotient']

0           0.0
1           0.0
2           0.0
3           0.0
4           0.0
           ... 
31788319    1.0
31788320    1.0
31788321    1.0
31788322    1.0
31788323    1.0
Name: quotient, Length: 31788324, dtype: float64

: 

In [12]:
purchase_dict

{'000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318': {'0663713001': 0.0,
  '0541518023': 0.0,
  '0578020002': 0.0,
  '0723529001': 35.62946726369621,
  '0351484002': 38.86850974221406,
  '0727808001': 0.0,
  '0727808007': 0.0,
  '0858883002': 497.60860558625836,
  '0851400006': 0.0,
  '0750424014': 0.0,
  '0870304002': 0.0,
  '0852643001': 331.7390703908389,
  '0852643003': 0.0,
  '0794321007': 6990.562525611022},
 '00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2': {'0505221004': 0.0,
  '0685687003': 0.0,
  '0685687004': 0.0,
  '0685687001': 0.0,
  '0505221001': 0.0,
  '0508184022': 0.0,
  '0522992001': 0.0,
  '0605106001': 0.0,
  '0567618001': 0.0,
  '0528931002': 0.0,
  '0349301001': 0.0,
  '0590414001': 0.0,
  '0590414002': 0.0,
  '0570309005': 0.0,
  '0577992001': 0.0,
  '0552570004': 0.0,
  '0649018001': 0.0,
  '0633150009': 0.0,
  '0581162008': 0.0,
  '0616808001': 0.0,
  '0567618002': 0.0,
  '0622964004': 0.0,
  '0464454004': 0.0,
  '0550718001':

In [None]:
# Step2 & Step3
pairs = np.load('../input/hmitempairs/pairs_cudf.npy',allow_pickle=True).item()
sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

pred_list = []
for cust_id in tqdm(sub['customer_id']):
    if cust_id in purchase_dict:
        series = pd.Series(purchase_dict[cust_id])
        series = series[series > 150]
        l = series.nlargest(N).index.tolist()
        tmp_l = l.copy()
        for elm in tmp_l:
            if len(l) < N and int(elm) in pairs.keys():
                itm = pairs[int(elm)]
                l.append('0' + str(itm))
        if len(l) < N:
            l = l + general_pred[:(N-len(l))]
    else:
        l = general_pred
    pred_list.append(' '.join(l))

sub['prediction'] = pred_list
sub.to_csv(f'submission.csv',index=False)