In [None]:
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from PIL import Image
from tqdm import tqdm

In [None]:
# Transaction log; article_id is read as text instead of an inferred numeric dtype.
train = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype={'article_id': str},
)
print(train.shape)
train.head()

In [None]:
# Summarise the columns and dtypes of the transactions frame.
train.info()

In [None]:
# Article metadata; every column dtype is left to pandas' inference.
article = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv'
)
print(article.shape)
article.head()

In [None]:
# Summarise the columns and dtypes of the article metadata frame.
article.info()

In [None]:
# Customer table, loaded with pandas' default dtype inference.
customers = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv'
)
print(customers.shape)
customers.head()

In [None]:
# Sample submission; its customer_id column later drives the prediction loop.
submission = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'
)
print(submission.shape)
submission.head()

In [None]:
# NOTE(review): train.info() was already called right after loading `train`;
# this repeated summary looks redundant and could be dropped.
train.info()

In [None]:
# Walk the competition input directory once. For every sub-folder record how
# many directory entries it holds, count all folders and files, and collect the
# name (without extension) of every .jpg file.
total_folders = total_files = 0
folder_info = []   # (folder name, number of entries inside it) pairs
images_names = []  # .jpg file names with the extension stripped
for base, dirs, files in tqdm(os.walk('/kaggle/input/h-and-m-personalized-fashion-recommendations/')):
    for directories in dirs:
        folder_info.append((directories, len(os.listdir(os.path.join(base, directories)))))
    total_folders += len(dirs)
    total_files += len(files)
    for _files in files:
        # splitext only inspects the real extension, so a name that merely
        # contains ".jpg" somewhere in the middle is not mistaken for an image.
        stem, extension = os.path.splitext(_files)
        if extension == '.jpg':
            images_names.append(stem)

In [None]:
# Tabulate the (folder, entry count) pairs and show the fullest folders first.
folder_info_df = pd.DataFrame(data=folder_info, columns=['folder', 'files count'])
folder_info_df.sort_values(by='files count', ascending=False)

In [None]:
# Count the distinct product types inside every product group and chart the
# counts as bars, largest group first.
article_group = article.groupby('product_group_name')['product_type_name'].nunique()
df = (
    article_group
    .rename('product types')
    .rename_axis('product group')
    .reset_index()
    .sort_values('product types', ascending=False)  # new frame, no in-place mutation
)
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x='product group', y='product types', data=df, ax=ax)
ax.set_title('Number of product types per each product group')
# Group names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Compare the two sales channels: for each channel, count how many transactions
# occurred at every distinct price and plot the density of the log of those counts.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
for channel_id in (1, 2):
    # Select the price column directly from `train`; the plot only reads data,
    # so duplicating the whole transactions frame is unnecessary.
    price_counts = train.loc[train.sales_channel_id == channel_id, 'price'].value_counts()
    sns.kdeplot(np.log(price_counts), ax=ax, label=f'Sales channel {channel_id}')
ax.set_xlabel('log(number of transactions at a given price)')
ax.set_title('Density of log transaction counts per price, by sales channel')
ax.legend()
plt.show()

In [None]:
# One row per image file; the numeric article_id is parsed from the file name
# after dropping its first character.
image_name_df = pd.DataFrame({'image_name': images_names})
image_name_df['article_id'] = image_name_df['image_name'].map(lambda name: int(name[1:]))
image_name_df.head()

In [None]:
# Attach the image name to each article. The left join keeps every article,
# leaving image_name missing where no image file was found.
article_columns = ['article_id', 'product_code', 'product_group_name', 'product_type_name']
image_article_df = pd.merge(
    article[article_columns],
    image_name_df,
    on='article_id',
    how='left',
)
print(image_article_df.shape)
image_article_df.head()

In [None]:
# Articles for which the merge found no matching image file.
missing_image_mask = image_article_df['image_name'].isna()
article_no_image_df = image_article_df[missing_image_mask]
print(article_no_image_df.shape)
article_no_image_df.head()

In [None]:
def plot_image_samples(image_article_df, product_group_name, cols=1, rows=-1):
    """Show a grid of article images for one product group.

    Parameters
    ----------
    image_article_df : pandas.DataFrame
        Needs ``product_group_name``, ``article_id`` and ``product_type_name``
        columns. When an ``image_name`` column is present, rows where it is
        missing are skipped because there is no image file to open for them.
    product_group_name : str
        Value of ``product_group_name`` whose articles are shown.
    cols : int, default 1
        Number of grid columns; must be at least 1.
    rows : int, default -1
        Number of grid rows. Values below 1 (including the default) are
        treated as a single row.

    Returns
    -------
    None
        The images are drawn on a new matplotlib figure. If the group has no
        article with an image, a message is printed and nothing is drawn.

    Raises
    ------
    ValueError
        If ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError(f'cols must be at least 1, got {cols}')
    if rows < 1:
        # A non-positive row count would give a negative figure height.
        rows = 1

    image_path = '../input/h-and-m-personalized-fashion-recommendations/images/'
    df_ = image_article_df.loc[image_article_df.product_group_name == product_group_name]
    if 'image_name' in df_.columns:
        df_ = df_.loc[df_.image_name.notna()]

    # Never index past the articles that are actually available.
    n_images = min(cols * rows, len(df_))
    if n_images == 0:
        print(f'No images to show for product group {product_group_name!r}')
        return

    article_ids = df_.article_id.values[:n_images]
    article_product = df_.product_type_name.values[:n_images]
    plt.figure(figsize=(2 + 3 * cols, 2 + 4 * rows))
    for i in range(n_images):
        # Image files are named with the 10-character, zero-padded article id
        # and stored in a folder named after its first three characters.
        article_id = str(article_ids[i]).zfill(10)
        product_type_name = article_product[i]
        plt.subplot(rows, cols, i + 1)
        plt.axis('off')
        plt.title(f'{product_group_name} {article_id[:3]}\n{article_id}.jpg\n{product_type_name}')
        # Close the file handle as soon as the image has been handed to matplotlib.
        with Image.open(f'{image_path}{article_id[:3]}/{article_id}.jpg') as image:
            plt.imshow(image)

In [None]:
# Number of rows of image_article_df in each product group.
print(image_article_df.product_group_name.value_counts())

In [None]:
# One row of five sample images from the 'Accessories' group.
plot_image_samples(image_article_df, 'Accessories', cols=5, rows=1)

In [None]:
# One row of five sample images from the 'Furniture' group.
plot_image_samples(image_article_df, 'Furniture', cols=5, rows=1)

In [None]:
# Parse the transaction date column into datetimes so it can be compared with timestamps.
train['t_dat'] = pd.to_datetime(train['t_dat'])

In [None]:
# Keep only transactions dated on or after the cutoff, as an independent copy.
recent_cutoff = pd.Timestamp('2020-09-01')
train_ = train.loc[train['t_dat'] >= recent_cutoff].copy()

In [None]:
# Display the filtered transactions frame.
train_

In [None]:
# Nested tally: purchase_dict[customer_id][article_id] -> number of transaction
# rows in train_ for that customer/article pair.
purchase_dict = {}

for cust_id, art_id in zip(train_['customer_id'], train_['article_id']):
    article_counts = purchase_dict.setdefault(cust_id, {})
    article_counts[art_id] = article_counts.get(art_id, 0) + 1

print(len(purchase_dict))

In [None]:
# Baseline submission: for every customer recommend the articles they bought
# most often in train_, topped up to 12 entries with the overall best sellers.
# Customers without any purchase in train_ get the 12 best sellers.

# Take an explicit copy: a new column is assigned below, and assigning onto a
# slice of `submission` triggers pandas' SettingWithCopyWarning.
not_so_fancy_but_fast_benchmark = submission[['customer_id']].copy()
prediction_list = []
dummy_list = list((train_['article_id'].value_counts()).index)[:12]
dummy_pred = ' '.join(dummy_list)

for cust_id in submission['customer_id'].values:
    if cust_id in purchase_dict:
        # The customer's own articles, most frequently bought first.
        ranked = sorted(purchase_dict[cust_id].items(), key=lambda x: x[1], reverse=True)
        own_articles = [pair[0] for pair in ranked][:12]
        # Pad only with best sellers that are not already listed, so the same
        # article id never occupies two of the 12 slots.
        fillers = [art_id for art_id in dummy_list if art_id not in own_articles]
        s = ' '.join((own_articles + fillers)[:12])
    else:
        s = dummy_pred
    prediction_list.append(s)

not_so_fancy_but_fast_benchmark['prediction'] = prediction_list
print(not_so_fancy_but_fast_benchmark.shape)
not_so_fancy_but_fast_benchmark.head()

In [None]:
# Drop the full transactions frame and run the garbage collector to reclaim its memory.
# (`gc` is imported in the notebook's import cell.)
del train
gc.collect()

In [None]:
# Write the predictions to submission.csv without the DataFrame index column.
not_so_fancy_but_fast_benchmark.to_csv('submission.csv',index=False)

# Reference
https://www.kaggle.com/abhilashawasthi/not-so-fancy-but-fast-benchmark?scriptVersionId=87382298