In [None]:
import csv
import pandas as pd
import missingno as msno
import numpy as np
from pandas import Series, DataFrame
import sklearn
from sklearn import preprocessing
from tqdm import tqdm
import gc
%precision 3

In [None]:
# Load the three competition tables.
# The input directory is defined once so it can be repointed (e.g. for a local
# run) without editing every read_csv call; the resulting paths are unchanged.
INPUT_DIR = "../input/h-and-m-personalized-fashion-recommendations"

articles = pd.read_csv(f"{INPUT_DIR}/articles.csv")
customers = pd.read_csv(f"{INPUT_DIR}/customers.csv")
transactions_train = pd.read_csv(f"{INPUT_DIR}/transactions_train.csv")

In [None]:
# Report (rows, columns) for each raw table.
print(f"articles {articles.shape}")
print(f"customers {customers.shape}")
print(f"transactions_train {transactions_train.shape}")

In [None]:
# Check for missing values: draw missingno's matrix plot for the customers table
# (only `customers` is inspected in this cell).
msno.matrix(customers)

In [None]:
# Remove redundant columns to avoid multicollinearity.
# "product_type_no" and "product_type_name" appear to describe the same attribute,
# so only one of them needs to be kept.
# Keep "product_type_no" because a numeric code is easier to handle;
# "product_type_name" is dropped later.
articles_1 = articles.loc[:, ["product_type_no", "product_type_name"]]
articles_1.head()

In [None]:
# Same reasoning as above for "index_code" and "index_name".
# Keep "index_code" because it is easier to handle; "index_name" is dropped later.
articles_2 = articles.loc[:, ["index_code", "index_name"]]
articles_2.head()

In [None]:
# "section_name" is more detailed than "index_group_name",
# so "index_group_name" is dropped later.
articles_3 = articles.loc[:, ["index_group_name", "section_name"]]
articles_3.head()

In [None]:
# Same reasoning as above: compare "garment_group_name", "detail_desc" and "prod_name".
# "detail_desc" and "prod_name" are dropped later.
articles_4 = articles.loc[:, ["garment_group_name", "detail_desc", "prod_name"]]
articles_4.head()

In [None]:
# Same reasoning as above: compare "colour_group_code" with "colour_group_name",
# "perceived_colour_master_name" and "perceived_colour_value_name",
# and "department_name" with "garment_group_name".
# Dropped later: "colour_group_name", "perceived_colour_master_name",
# "perceived_colour_value_name", "department_name" and "garment_group_name".
articles_5 = articles.loc[
    :,
    [
        "colour_group_code",
        "colour_group_name",
        "perceived_colour_master_name",
        "perceived_colour_value_name",
        "department_name",
        "garment_group_name",
    ],
]
articles_5.head()

In [None]:
# Build the reduced articles table by removing the columns judged redundant in
# the cells above: the *_name duplicates of coded columns, the free-text
# "detail_desc" / "prod_name", and "department_name" / "garment_group_name".
articles_new = articles.drop(
    columns=[
        "product_type_name",
        "index_name",
        "index_group_name",
        "detail_desc",
        "prod_name",
        "colour_group_name",
        "perceived_colour_master_name",
        "perceived_colour_value_name",
        "department_name",
        "garment_group_name",
    ]
)

In [None]:
# Customer status fields are assumed not to help predict which product will be
# bought, and the content of the fashion news is unknown, so drop
# "FN", "Active", "club_member_status", "fashion_news_frequency" and "postal_code".
#
# Only a few "age" values are missing, so filling them has little impact;
# they are replaced with the mean age.
customers_new = customers.drop(
    columns=[
        "FN",
        "Active",
        "club_member_status",
        "fashion_news_frequency",
        "postal_code",
    ]
).assign(age=lambda frame: frame["age"].fillna(frame["age"].mean()))

In [None]:
# The binary "sales_channel_id" is treated as unimportant here, because the goal
# is to predict which of a wide variety of products a customer will buy.
transactions_train_new = transactions_train.drop(columns="sales_channel_id")

In [None]:
# Compare each table's shape before and after the column reduction.
print("------articles------")
print(f"Before {articles.shape}")
print(f"After {articles_new.shape}")

print("------customers------")
print(f"Before {customers.shape}")
print(f"After {customers_new.shape}")

print("------transactions_train------")
print(f"Before {transactions_train.shape}")
print(f"After {transactions_train_new.shape}")

In [None]:
# Release the raw tables and the temporary inspection slices; the cells below
# work only with the *_new frames.
del articles, customers, transactions_train
del articles_1, articles_2, articles_3, articles_4, articles_5
gc.collect()

In [None]:
# Text columns are awkward to handle, so encode them as int32 label codes.
# One LabelEncoder is fitted per column and kept in `label_encoders`, so each
# column's codes can be mapped back to the original names with
# `label_encoders[column].inverse_transform(...)`. Re-fitting a single shared
# encoder would retain only the classes of the last column it was fitted on.
label_columns = ["product_group_name", "graphical_appearance_name", "section_name"]
label_encoders = {}

for column in label_columns:
    encoder = preprocessing.LabelEncoder()
    articles_new[column] = encoder.fit_transform(articles_new[column]).astype("int32")
    label_encoders[column] = encoder

# `le` is kept for backward compatibility: as before, it ends up fitted on "section_name".
le = label_encoders["section_name"]

In [None]:
# Attach article attributes to every transaction (inner join on "article_id"),
# then free the two inputs.
merge_data = articles_new.merge(
    transactions_train_new,
    how="inner",
    on="article_id",
    copy=False,
)
del articles_new
del transactions_train_new
gc.collect()

In [None]:
# Extract the purchase month as a zero-padded string ("01".."12") so that
# purchases made around September can be selected below.
merge_data["month"] = merge_data["t_dat"].pipe(pd.to_datetime).dt.strftime("%m")

In [None]:
# The full date is no longer needed once the month has been extracted.
merge_data = merge_data.drop(columns="t_dat")

In [None]:
# Keep only purchases made in months "08", "09" and "10" by removing every
# other month. "month" holds zero-padded strings, so the list uses strings too.
# A single boolean mask replaces the per-month loop, which re-scanned and
# re-copied the whole frame once for each of the nine listed months.
drop_month_list = ["01", "02", "03", "04", "05", "06", "07", "11", "12"]
merge_data = merge_data[~merge_data["month"].isin(drop_month_list)]

In [None]:
# The helper "month" column has served its purpose for filtering; remove it.
merge_data = merge_data.drop(columns="month")

In [None]:
# Attach customer attributes (inner join on "customer_id"); customer columns
# come first because `customers_new` is the left table.
merge_data = customers_new.merge(
    merge_data,
    how="inner",
    on="customer_id",
    copy=False,
)

del customers_new
gc.collect()

In [None]:
# Preview the first 10 rows of the merged customer / article / transaction table.
merge_data.head(10)

In [None]:
# (rows, columns) of the merged table at this point in the notebook.
merge_data.shape

I want to reduce the data further.
The processing below may not be appropriate.
As noted in the competition Discussion, few customers buy the same product again, and the range of products is very large.
So we drop repeat purchases of the same product by the same customer, as well as unpopular products.

In [None]:
# Collapse repeat purchases: keep only the first row for each
# (customer_id, article_id) pair.
merge_data = merge_data.drop_duplicates(
    subset=["customer_id", "article_id"],
    keep="first",
)

In [None]:
# Identify unpopular products: article ids that occur at most once in merge_data.
feature, count = np.unique(merge_data["article_id"].to_numpy(), return_counts=True)
unpopular = feature[count <= 1]

In [None]:
# Remove every row whose article id is listed in `unpopular`.
# One vectorized membership test replaces the original two loops, which scanned
# the whole frame once per unpopular article and then copied the frame once per
# article while dropping; the surviving rows and their index are the same.
merge_data = merge_data[~merge_data["article_id"].isin(unpopular)]

In [None]:
# Report the table size after removing repeat purchases and unpopular products.
print(f"Data shape after this process {merge_data.shape}")