In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from datetime import datetime
from PIL import Image

## Read Data

In [None]:
# Load the articles, customers and sample-submission tables, in that order.
articles_df, customers_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv",
        "/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv",
        "/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv",
    )
)

In [None]:
transactions_train_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

In [None]:
transactions_train_df['t_dat'] = pd.to_datetime(transactions_train_df['t_dat'])

## Inspect Data

In [None]:
last_date = transactions_train_df.t_dat.max()
start_date = transactions_train_df.t_dat.min()
print(last_date)
print(start_date)

In [None]:
print(len(transactions_train_df))

## Lighten transactions data

The data is so big...

My MacBook CPU is on fire.

In [None]:
# Keep a single row per customer_id: the first one encountered in the frame
# (presumably the customer's earliest purchase if rows are date-ordered — TODO confirm).
simple_transictions_df = transactions_train_df.drop_duplicates(
    subset=['customer_id'],
    keep='first',
)

In [None]:
len(simple_transictions_df)

In [None]:
len(customers_df)

In [None]:
articles_df.head()

## Add feature "index_group_name" to the transactions data

In [None]:
easy_articles_df = articles_df[['article_id', 'index_group_name']]

In [None]:
easy_articles_df.head()

In [None]:
# Look up each kept transaction's index_group_name through article_id and
# retain only the customer -> group columns.
group_labeled_df = (
    simple_transictions_df
    .merge(easy_articles_df, on="article_id")
    [["customer_id", "index_group_name"]]
)
group_labeled_df.head()

In [None]:
group_labeled_df["index_group_name"].value_counts()

## Create prediction

In [None]:
# Keep only the last three weeks of transactions, as the name `df_3w` implies.
# The cutoff is anchored to the latest t_dat in the data instead of a fixed
# calendar date ('2019-08-31' before), so the window is three weeks long
# regardless of when the data ends.
recent_cutoff = transactions_train_df['t_dat'].max() - pd.Timedelta(weeks=3)
df_3w = transactions_train_df[transactions_train_df['t_dat'] >= recent_cutoff].copy()

In [None]:
add_group_name_df = df_3w.merge(easy_articles_df, on="article_id")

In [None]:
add_group_name_df.head()

In [None]:
def get_frequent_articles_list_group_name(group_name: str, top_n: int = 12, transactions=None):
    """Return the most frequently occurring article ids for one index group.

    Args:
        group_name: Value of ``index_group_name`` to filter on
            (e.g. "Ladieswear").
        top_n: Maximum number of article ids to return. Defaults to 12.
        transactions: DataFrame with ``article_id`` and ``index_group_name``
            columns. Defaults to the notebook-level ``add_group_name_df``.

    Returns:
        list[str]: Up to ``top_n`` article ids, most frequent first, each
        left-padded with zeros to 10 characters. Empty when no row matches.
    """
    if transactions is None:
        transactions = add_group_name_df
    group_mask = transactions["index_group_name"] == group_name
    article_counts = transactions.loc[group_mask, "article_id"].value_counts()
    # zfill pads ids of any length up to 10 characters; the previous
    # ("0" + id)[-10:] form only produced 10 characters for 9-digit ids.
    return [str(article_id).zfill(10) for article_id in article_counts.index[:top_n]]


In [None]:
ladies_list = get_frequent_articles_list_group_name(group_name="Ladieswear")
print(ladies_list)

In [None]:
Divided_list = get_frequent_articles_list_group_name(group_name="Divided")
print(Divided_list)

In [None]:
Sport_list = get_frequent_articles_list_group_name(group_name="Sport")
print(Sport_list)

In [None]:
Menswear_list = get_frequent_articles_list_group_name(group_name="Menswear")
print(Menswear_list)

In [None]:
baby_list = get_frequent_articles_list_group_name(group_name="Baby/Children")
print(baby_list)

## Show Prediction images

In [None]:
# Collect the base name (without extension) of every .jpg file found under the input folder.
images_names = []
for _, _, files in tqdm(os.walk('/kaggle/input/h-and-m-personalized-fashion-recommendations/')):
    for file_name in files:
        # Check the real extension: splitting on ".jpg" also accepted names
        # that merely contain it once (e.g. "x.jpg.bak").
        stem, extension = os.path.splitext(file_name)
        if extension == ".jpg":
            images_names.append(stem)

In [None]:
image_name_df = pd.DataFrame(images_names, columns = ["image_name"])
image_name_df["article_id"] = image_name_df["image_name"].apply(lambda x: int(x[1:]))

In [None]:
image_article_df = articles_df[["article_id", "product_code", "product_group_name", "product_type_name", "index_group_name", "index_group_no", "colour_group_code"]].merge(image_name_df, on=["article_id"], how="left")
print(image_article_df.shape)
image_article_df.head()

In [None]:
len(articles_df)

In [None]:
from PIL import Image

# Visualize the articles listed in art_list.
def plot_image_art_list(art_list, cols=4, rows=3):
    """Show the image of each article in ``art_list`` on a grid of subplots.

    Args:
        art_list: Iterable of article id strings. The first three characters
            of each id name the image sub-folder, and the full id is the
            file name (``<id[:3]>/<id>.jpg``).
        cols: Number of grid columns.
        rows: Minimum number of grid rows; extra rows are added when
            ``art_list`` holds more than ``rows * cols`` entries.

    A missing image file is reported on stdout and its cell is left blank
    (title only).
    """
    image_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/images/"
    art_list = list(art_list)
    # plt.subplot raises ValueError for an index beyond rows * cols, so grow
    # the grid (ceiling division) until every article has a cell.
    rows = max(rows, -(-len(art_list) // cols))
    plt.figure(figsize=(2 + 3 * cols, 2 + 4 * rows))
    for i, article_id in enumerate(art_list):
        file_path = f"{image_path}{article_id[:3]}/{article_id}.jpg"
        plt.subplot(rows, cols, i + 1)
        plt.axis('off')
        plt.title(f"{article_id[:3]}\n{article_id}.jpg")
        try:
            # The context manager closes the underlying file once the pixels
            # have been handed to matplotlib.
            with Image.open(file_path) as image:
                plt.imshow(image)
        except FileNotFoundError:
            print(f'!!! FileNotFoundError !!! {file_path}')


In [None]:
plot_image_art_list(ladies_list)

In [None]:
plot_image_art_list(Sport_list)

In [None]:
plot_image_art_list(Divided_list)

In [None]:
plot_image_art_list(Menswear_list)

In [None]:
plot_image_art_list(baby_list)

In [None]:
# Fallback recommendation: the 12 most frequent article ids in
# simple_transictions_df, zero-padded to 10 characters. zfill is used so the
# padding is correct for ids of any length, not only 9-digit ones.
dummy_list = [
    str(article_id).zfill(10)
    for article_id in simple_transictions_df['article_id'].value_counts().index[:12]
]

In [None]:
plot_image_art_list(dummy_list)

In [None]:
submission_df = sample_submission_df.merge(group_labeled_df, on=["customer_id"], how="left")

In [None]:
submission_df.head()

In [None]:
def update_prediction(row):
    """Build the space-separated prediction string for one submission row.

    The row's "index_group_name" value selects the matching per-group article
    list; any other value (including a missing one) falls back to
    ``dummy_list``.
    """
    label = row["index_group_name"]
    if label == "Ladieswear":
        chosen_articles = ladies_list
    elif label == "Menswear":
        chosen_articles = Menswear_list
    elif label == "Divided":
        chosen_articles = Divided_list
    elif label == "Sport":
        chosen_articles = Sport_list
    elif label == "Baby/Children":
        chosen_articles = baby_list
    else:
        chosen_articles = dummy_list
    return " ".join(chosen_articles)


In [None]:
submission_df["prediction"] = submission_df.apply(update_prediction, axis=1)

In [None]:
submission_df.head(10)

In [None]:
print("Rows with missing data in submission: ", submission_df.loc[submission_df.prediction.isna()].shape[0])

In [None]:
submission_df = submission_df[["customer_id", "prediction"]]

In [None]:
submission_df.head()

In [None]:
submission_df.to_csv("submission.csv", index=False)