# Importing necessary libraries

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS

# Data directory overview

In [None]:
# Show the top-level entries of the dataset directory and how many entries
# the images folder contains.
input_root = '/kaggle/input/h-and-m-personalized-fashion-recommendations/'
top_level_entries = os.listdir(input_root)
print(f"folders: {top_level_entries}")
image_subfolders = os.listdir(input_root + "images")
print("subfolders: ", len(image_subfolders))

In [None]:
# Walk the whole dataset directory once, collecting:
#   - folder_info:   (folder name, number of direct entries) for every directory
#   - images_names:  file name without its extension for every .jpg file
#   - total_folders / total_files: overall counts
total_folders = total_files = 0
folder_info = []
images_names = []
for base, dirs, files in tqdm(os.walk('/kaggle/input/h-and-m-personalized-fashion-recommendations/')):
    for directories in dirs:
        folder_info.append((directories, len(os.listdir(os.path.join(base, directories)))))
    total_folders += len(dirs)
    total_files += len(files)
    for _files in files:
        # Test the real extension: splitting on the substring ".jpg" also
        # accepted names such as "x.jpg.bak" and an extension-only ".jpg".
        stem, extension = os.path.splitext(_files)
        if extension == ".jpg":
            images_names.append(stem)

In [None]:
# Report the totals gathered while walking the dataset directory.
print(f"Total number of folders: {total_folders}")
print(f"Total number of files: {total_files}")

# One row per folder with its entry count; display the five largest folders.
folder_info_df = pd.DataFrame(folder_info, columns=["folder", "files count"])
folder_info_df.sort_values(by="files count", ascending=False).iloc[:5]

In [None]:
# Distinct folder names found during the directory walk.
unique_folder_names = folder_info_df["folder"].unique().tolist()
print("folder names: ", unique_folder_names)

# Checking the format of the available files

In [None]:
# Load the three smaller CSV tables from the competition input directory.
csv_root = "/kaggle/input/h-and-m-personalized-fashion-recommendations/"
articles_df = pd.read_csv(csv_root + "articles.csv")
customers_df = pd.read_csv(csv_root + "customers.csv")
sample_submission_df = pd.read_csv(csv_root + "sample_submission.csv")

In [None]:
# Load the training transactions with pandas' default dtype inference.
# NOTE(review): the "Model 1" section re-reads this same file with
# article_id forced to str, so two copies of the table are held in memory.
transactions_train_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
# Preview the first rows of the articles table.
articles_df.head()

In [None]:
# Preview the first rows of the customers table.
customers_df.head()

In [None]:
# Preview the first rows of the sample submission file.
sample_submission_df.head()

In [None]:
# Preview the first rows of the training transactions.
transactions_train_df.head()

In [None]:
# Column names, non-null counts, dtypes and memory usage of the articles table.
articles_df.info()

In [None]:
# Column names, non-null counts, dtypes and memory usage of the customers table.
customers_df.info()

In [None]:
# Column names, non-null counts, dtypes and memory usage of the sample submission.
sample_submission_df.info()

In [None]:
# Column names, dtypes and memory usage of the training transactions.
transactions_train_df.info()

# EDA

There are 3 main tables:
- articles - contains information about each article (product code, name, product group code, name, ...)
- customers - contains information about each customer (fidelity card membership, age, postal code)
- transactions (train) - contains the purchases made by the customers

Transactions have `customer_id` and `article_id`, which are foreign keys to the customers and articles tables.
Besides these, each transaction also contains the date (`t_dat`), the `price` and the `sales_channel_id`.


In [None]:
# Number of distinct product types inside each product group, largest first.
temp = articles_df.groupby("product_group_name")["product_type_name"].nunique()
df = (
    temp.rename_axis("Product Group")
    .reset_index(name="Product Types")
    .sort_values("Product Types", ascending=False)
)
plt.figure(figsize=(8, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Product Group", y="Product Types", data=df)
s.set_title('Number of Product Types per each Product Group')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of distinct articles inside each product group, largest first.
temp = articles_df.groupby("product_group_name")["article_id"].nunique()
df = (
    temp.rename_axis("Product Group")
    .reset_index(name="Articles")
    .sort_values("Articles", ascending=False)
)
plt.figure(figsize=(8, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Product Group", y="Articles", data=df)
s.set_title('Number of Articles per each Product Group')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of distinct articles per product type; only the 50 largest types are
# plotted, while the title reports how many types exist in total.
temp = articles_df.groupby("product_type_name")["article_id"].nunique()
df = temp.rename_axis("Product Type").reset_index(name="Articles")
total_types = df["Product Type"].nunique()
df = df.sort_values("Articles", ascending=False).head(50)
plt.figure(figsize=(16, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Product Type", y="Articles", data=df)
s.set_title(f'Number of Articles per each Product Type (top 50 from total: {total_types})')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of distinct articles per department; only the 50 largest departments
# are plotted, while the title reports how many departments exist in total.
temp = articles_df.groupby("department_name")["article_id"].nunique()
df = temp.rename_axis("Department Name").reset_index(name="Articles")
total_depts = df["Department Name"].nunique()
df = df.sort_values("Articles", ascending=False).head(50)
plt.figure(figsize=(16, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Department Name", y="Articles", data=df)
s.set_title(f'Number of Articles per each Department (top 50 from total: {total_depts})')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of distinct articles inside each index group, largest first.
temp = articles_df.groupby("index_group_name")["article_id"].nunique()
df = (
    temp.rename_axis("Index Group Name")
    .reset_index(name="Articles")
    .sort_values("Articles", ascending=False)
)
plt.figure(figsize=(6, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Index Group Name", y="Articles", data=df)
s.set_title('Number of Articles per each Index Group Name')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of distinct articles for each index name, largest first.
temp = articles_df.groupby("index_name")["article_id"].nunique()
df = (
    temp.rename_axis("Index Name")
    .reset_index(name="Articles")
    .sort_values("Articles", ascending=False)
)
plt.figure(figsize=(8, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Index Name", y="Articles", data=df)
s.set_title('Number of Articles per each Index Name')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of distinct articles inside each garment group, largest first.
temp = articles_df.groupby("garment_group_name")["article_id"].nunique()
df = (
    temp.rename_axis("Garment Group Name")
    .reset_index(name="Articles")
    .sort_values("Articles", ascending=False)
)
plt.figure(figsize=(12, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Garment Group Name", y="Articles", data=df)
s.set_title('Number of Articles per each Garment Group Name')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of distinct articles inside each section, largest first.
temp = articles_df.groupby("section_name")["article_id"].nunique()
df = (
    temp.rename_axis("Section Name")
    .reset_index(name="Articles")
    .sort_values("Articles", ascending=False)
)
plt.figure(figsize=(16, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Section Name", y="Articles", data=df)
s.set_title('Number of Articles per each Section Name')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
stopwords = set(STOPWORDS)

def show_wordcloud(data, title = None):
    """Render a word cloud built from the text in `data`.

    Parameters
    ----------
    data : str or collection of text values
        A single string is used as is. Anything else (e.g. a pandas Series of
        descriptions) is converted to a Series, missing values are dropped and
        the remaining values are joined with spaces.
    title : str, optional
        Figure title; no title is drawn when omitted or empty.
    """
    # str() of a pandas Series is its truncated repr (a few rows plus the
    # "Name: ..., Length: ..., dtype: ..." footer), so the text has to be
    # assembled from the individual values instead.
    if isinstance(data, str):
        text = data
    else:
        text = " ".join(pd.Series(data).dropna().astype(str))

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40,
        scale=5,
        random_state=1
    ).generate(text)

    fig = plt.figure(1, figsize=(10,10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=14)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [None]:
# Word cloud built from the `detail_desc` column of the articles table.
show_wordcloud(articles_df["detail_desc"], "Wordcloud from detailed description of articles")

In [None]:
# Number of customers recorded for each age value.
temp = customers_df.groupby("age")["customer_id"].count()
df = (
    temp.rename_axis("Age")
    .reset_index(name="Customers")
    .sort_values("Age", ascending=False)
)
plt.figure(figsize=(16, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Age", y="Customers", data=df)
s.set_title('Number of Customers per each Age')
# Rotate the tick labels so that neighbouring ages do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of customers for each fashion news frequency value, largest first.
temp = customers_df.groupby("fashion_news_frequency")["customer_id"].count()
df = (
    temp.rename_axis("Fashion News Frequency")
    .reset_index(name="Customers")
    .sort_values("Customers", ascending=False)
)
plt.figure(figsize=(6, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Fashion News Frequency", y="Customers", data=df)
s.set_title('Number of Customers per each Fashion News Frequency')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Number of customers for each club member status, largest first.
temp = customers_df.groupby("club_member_status")["customer_id"].count()
df = (
    temp.rename_axis("Club Member Status")
    .reset_index(name="Customers")
    .sort_values("Customers", ascending=False)
)
plt.figure(figsize=(6, 6))
sns.set_color_codes("pastel")
s = sns.barplot(x="Club Member Status", y="Customers", data=df)
s.set_title('Number of Customers per each Club Member Status')
# Rotate the category labels so that long names do not overlap.
s.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Distinct sales channel ids present in the training transactions.
transactions_train_df.sales_channel_id.unique()

In [None]:
# KDE of log(number of occurrences of each distinct price), drawn separately
# for the two sales channels on a 100k-row sample of the transactions.
# Fixed seed so that the sample, and therefore the figure, is reproducible.
df = transactions_train_df.sample(100_000, random_state=42)
fig, ax = plt.subplots(1, 1, figsize=(7, 7))
for channel_id in (1, 2):
    price_counts = df.loc[df["sales_channel_id"] == channel_id].price.value_counts()
    # Label each curve directly so the legend cannot get out of sync with
    # the plotting order (both entries used to read "Sales channel 1").
    sns.kdeplot(np.log(price_counts), ax=ax, label=f"Sales channel {channel_id}")
ax.legend()
plt.title("Logaritmic distribution of price frequency in transactions, grouped per sales channel (100k sample)")
plt.show()

# Image data

There are 105542 articles and 105100 different images. Let's first check which articles do not have corresponding images.

The `article_id` corresponds to the characters of the image name from the 2nd to the last (the leading `0` is dropped).
The 2nd to 7th characters of the image name correspond to the product code (`product_code`).

In [None]:
# One row per image file found on disk; the article id is the image name
# without its first character, parsed as an integer.
image_name_df = pd.DataFrame(images_names, columns=["image_name"])
image_name_df["article_id"] = image_name_df["image_name"].map(
    lambda image_name: int(image_name[1:])
)

In [None]:
# Preview the image-name / article-id mapping.
image_name_df.head()

In [None]:
# Left-join the image names onto the articles: every article is kept and
# image_name is missing where no image file was found.
article_columns = ["article_id", "product_code", "product_group_name", "product_type_name"]
image_article_df = pd.merge(
    articles_df[article_columns],
    image_name_df,
    how="left",
    on="article_id",
)
print(image_article_df.shape)
image_article_df.head()

Products without images.

In [None]:
# Articles for which the left join found no image file.
has_no_image = image_article_df["image_name"].isna()
article_no_image_df = image_article_df[has_no_image]
print(article_no_image_df.shape)
article_no_image_df.head()

In [None]:
# Summarise the articles that have no image: how many distinct product codes
# they cover and which product groups they belong to.
missing_product_codes = article_no_image_df["product_code"].nunique()
missing_group_names = article_no_image_df["product_group_name"].unique().tolist()
print("Product codes without images: ", missing_product_codes)
print("Product group names without images: ", missing_group_names)

Let's visualize a few images.

In [None]:
def plot_image_samples(image_article_df, product_group_name, cols=1, rows=-1):
    """Plot a grid of article images from one product group.

    Parameters
    ----------
    image_article_df : pandas.DataFrame
        Needs `product_group_name` and `article_id` columns. If an
        `image_name` column is present, rows where it is missing are skipped
        because there is no file to open for them.
    product_group_name : str
        Product group whose articles are shown.
    cols, rows : int
        Grid size; at most `cols * rows` images are drawn. Both must be
        positive, so `rows` has to be passed explicitly.

    Raises
    ------
    ValueError
        If `cols` or `rows` is smaller than 1.
    """
    if cols < 1 or rows < 1:
        raise ValueError(
            f"cols and rows must both be >= 1 (got cols={cols}, rows={rows})"
        )

    image_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"
    _df = image_article_df.loc[image_article_df.product_group_name==product_group_name]
    if "image_name" in _df.columns:
        # Only articles that actually have an image can be displayed.
        _df = _df.loc[_df.image_name.notna()]
    # The slice may hold fewer than cols*rows ids when the group is small.
    article_ids = _df.article_id.values[0:cols*rows]
    plt.figure(figsize=(2 + 3 * cols, 2 + 4 * rows))
    for i, raw_article_id in enumerate(article_ids):
        # Image files are named after the article id left-padded to 10 digits.
        article_id = str(raw_article_id).zfill(10)
        plt.subplot(rows, cols, i + 1)
        plt.axis('off')
        plt.title(f"{product_group_name} {article_id[:3]}\n{article_id}.jpg")
        # Close the file handle as soon as the pixels have been handed over.
        with Image.open(f"{image_path}{article_id[:3]}/{article_id}.jpg") as image:
            plt.imshow(image)

Let's list the available product group names to choose from.

In [None]:
# Distinct product group names available for plotting.
print(image_article_df.product_group_name.unique())

We will display sample images for a given product group name.

In [None]:
# "Stationery" product group on a grid of 4 columns x 1 row.
plot_image_samples(image_article_df, "Stationery", 4, 1)

In [None]:
# "Fun" product group on a grid of 2 columns x 1 row.
plot_image_samples(image_article_df, "Fun", 2, 1)

# Model 1

In [None]:

# Re-read the training transactions for the model, this time keeping
# article_id as a string. `Path` is imported in the notebook's import cell.
data_path = Path('/kaggle/input/h-and-m-personalized-fashion-recommendations/')
df = pd.read_csv(
    data_path / 'transactions_train.csv',
    # set dtype or pandas will drop the leading '0' and convert to int
    dtype={'article_id': str}
)


In [None]:
# Size and first rows of the transactions used by the model.
print(df.shape)
df.head()

In [None]:
# Convert the transaction date column to datetimes so it can be compared
# against timestamp cutoffs.
df['t_dat'] = pd.to_datetime(df['t_dat'])

In [None]:
# Independent copies of the transactions on or after each cutoff date.
# NOTE(review): the cutoffs are 7 and 8 days apart (08-31 -> 09-07 -> 09-15);
# confirm that 2020-09-15 rather than 2020-09-14 is intended.
cutoff_3_week = pd.Timestamp('2020-08-31')
cutoff_2_week = pd.Timestamp('2020-09-07')
cutoff_1_week = pd.Timestamp('2020-09-15')

df_3_week = df.loc[df['t_dat'] >= cutoff_3_week].copy()
df_2_week = df.loc[df['t_dat'] >= cutoff_2_week].copy()
df_1_week = df.loc[df['t_dat'] >= cutoff_1_week].copy()

In [None]:
# Purchase counts for the 3-week window:
# {customer_id: {article_id: number of transaction rows}}.
purchase_dict_3_week = {}

for cust_id, art_id in zip(df_3_week['customer_id'], df_3_week['article_id']):
    article_counts = purchase_dict_3_week.setdefault(cust_id, {})
    article_counts[art_id] = article_counts.get(art_id, 0) + 1

# Number of distinct customers with at least one purchase in the window.
print(len(purchase_dict_3_week))

# The 12 most frequently bought articles of the window, most popular first.
dummy_list_3_week = df_3_week['article_id'].value_counts().index[:12].tolist()

In [None]:
# Purchase counts for the 2-week window:
# {customer_id: {article_id: number of transaction rows}}.
purchase_dict_2_week = {}

for cust_id, art_id in zip(df_2_week['customer_id'], df_2_week['article_id']):
    try:
        per_customer = purchase_dict_2_week[cust_id]
    except KeyError:
        per_customer = purchase_dict_2_week[cust_id] = {}
    per_customer[art_id] = per_customer.get(art_id, 0) + 1

# Number of distinct customers with at least one purchase in the window.
print(len(purchase_dict_2_week))

# The 12 most frequently bought articles of the window, most popular first.
top_articles_2_week = df_2_week['article_id'].value_counts()
dummy_list_2_week = top_articles_2_week.index[:12].tolist()

In [None]:
# Purchase counts for the 1-week window:
# {customer_id: {article_id: number of transaction rows}}.
purchase_dict_1_week = {}

for cust_id, art_id in zip(df_1_week['customer_id'], df_1_week['article_id']):
    if cust_id in purchase_dict_1_week:
        bought = purchase_dict_1_week[cust_id]
    else:
        bought = {}
        purchase_dict_1_week[cust_id] = bought
    bought[art_id] = 1 + bought.get(art_id, 0)

# Number of distinct customers with at least one purchase in the window.
print(len(purchase_dict_1_week))

# The 12 most frequently bought articles of the window, most popular first.
dummy_list_1_week = list(df_1_week['article_id'].value_counts().index[:12])

In [None]:
# Size and first rows of the sample submission, which lists the customers to
# predict for.
print(sample_submission_df.shape)
sample_submission_df.head()

In [None]:
# Build the submission: one space-separated string of 12 article ids per
# customer listed in the sample submission.
N_PREDICTIONS = 12

# .copy() so that adding the 'prediction' column below writes to an
# independent frame instead of a slice of sample_submission_df
# (avoids pandas' SettingWithCopyWarning).
need_improvemnet_model = sample_submission_df[['customer_id']].copy()

# Fallback for customers with no purchase in any window: the best sellers of
# the 2-week window.
dummy_list = list((df_2_week['article_id'].value_counts()).index)[:N_PREDICTIONS]
dummy_pred = ' '.join(dummy_list)

# Windows are tried from most recent to oldest; each is paired with its own
# best-seller list, which is used for padding.
recency_tiers = [
    (purchase_dict_1_week, dummy_list_1_week),
    (purchase_dict_2_week, dummy_list_2_week),
    (purchase_dict_3_week, dummy_list_3_week),
]


def _predict_for_customer(cust_id):
    """Return the space-separated prediction string for one customer.

    The first window in which the customer bought anything supplies their own
    articles, most frequently bought first. Remaining slots are padded with
    that window's best sellers. Customers absent from every window receive
    `dummy_pred`.
    """
    for purchases, popular in recency_tiers:
        if cust_id not in purchases:
            continue
        # The sort is stable, so articles with equal counts keep the order in
        # which they were first bought.
        ranked = sorted(purchases[cust_id].items(), key=lambda item: item[1], reverse=True)
        own = [art_id for art_id, _ in ranked][:N_PREDICTIONS]
        # Pad only with best sellers that are not already in the list, so no
        # slot is spent on a duplicate article.
        already_predicted = set(own)
        padding = [art_id for art_id in popular if art_id not in already_predicted]
        return ' '.join((own + padding)[:N_PREDICTIONS])
    return dummy_pred


prediction_list = [
    _predict_for_customer(cust_id)
    for cust_id in sample_submission_df['customer_id']
]

need_improvemnet_model['prediction'] = prediction_list
print(need_improvemnet_model.shape)
need_improvemnet_model.head()


In [None]:
# Write the predictions to submission.csv without the DataFrame index column.
need_improvemnet_model.to_csv('submission.csv', index=False)

# Model 2