In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import cv2
import matplotlib.pyplot as plt
import seaborn as sns

# Map image names to image paths

In [None]:
# Build a lookup from image name (file name with '.jpg' removed) to the image's path.
# Images live one level down: <images_root>/<img_folder>/<imgname>.
images_root = '../input/h-and-m-personalized-fashion-recommendations/images'
img2path = {
    imgname.replace('.jpg', ''): os.path.join(images_root, img_folder, imgname)
    for img_folder in os.listdir(images_root)
    for imgname in os.listdir(os.path.join(images_root, img_folder))
}

In [None]:
# Load the article catalogue (article_id is read as a string) and keep only
# the identifying / descriptive columns used in this notebook.
article_columns = [
    'article_id', 'prod_name', 'product_type_name',
    'product_group_name', 'department_name',
    'index_name', 'index_group_name', 'section_name',
    'garment_group_name', 'detail_desc',
]
article_df = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/articles.csv',
    dtype={'article_id': str},
)
article_df = article_df[article_columns].copy()

article_df.head()

In [None]:
%%time
# Load the purchase log; article_id is read as a string so it has the same
# dtype as article_df's key.
transaction_df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", 
                             usecols=['t_dat', 'customer_id', 'article_id'],
                             dtype={'article_id': str})
transaction_df['t_dat'] = pd.to_datetime(transaction_df['t_dat'])

# Keep only purchases strictly after 2019-09-01. Filtering BEFORE the join
# keeps the join small.
transaction_df = transaction_df[transaction_df['t_dat'] > '2019-09-01']

# Inner join on the key column only: this drops transactions whose article is
# not present in article_df without copying the article text columns onto every
# transaction row (the groupby below keeps just 'week' and 'article_id').
# merge() returns a fresh frame, so adding 'week' afterwards does not write
# into a filtered slice (no SettingWithCopyWarning).
transaction_df = transaction_df.merge(article_df[['article_id']], on='article_id')

# Week index counted from the earliest remaining purchase date (week 0).
min_date = transaction_df['t_dat'].min()
transaction_df['week'] = (transaction_df['t_dat'] - min_date).dt.days // 7

# Sort chronologically so each customer's lists are in purchase order, then
# collapse to one row per customer holding parallel lists of weeks and articles.
transaction_df = transaction_df.sort_values('t_dat')
transaction_df = transaction_df.groupby('customer_id', as_index=False)[['week', 'article_id']].agg(list)

transaction_df.head(2)

In [None]:
# Per-customer activity features derived from the aggregated purchase lists.
transaction_df['num_purchases'] = transaction_df['article_id'].apply(len)
transaction_df['num_unique_purchases'] = transaction_df['article_id'].apply(lambda x: len(set(x)))
transaction_df['num_weeks'] = transaction_df['week'].apply(lambda x: len(set(x)))
transaction_df['avg_purchases_per_week'] = transaction_df['num_purchases'].div(transaction_df['num_weeks'])

# Drop customers with exactly one purchase, with 25 or more distinct purchase
# weeks, or with 100 or more purchases.
# .copy() makes the filtered frame independent of the unfiltered one, so later
# cells can add columns to it without triggering SettingWithCopyWarning.
transaction_df = transaction_df[(transaction_df.num_purchases != 1) &
                                (transaction_df.num_weeks < 25) &
                                (transaction_df.num_purchases < 100)
                               ].copy()

transaction_df.head()

In [None]:
# Number of customers for each count of distinct purchase weeks.
fig, ax = plt.subplots(figsize=(12, 4))
ax.set_title("distributions of Number of Active Transaction weeks per customer")
sns.countplot(data=transaction_df, x='num_weeks', ax=ax)
plt.show()

In [None]:
# Summary statistics of the per-customer average purchases per active week.
transaction_df['avg_purchases_per_week'].describe()

In [None]:
# 99th percentile of the per-customer average purchases per active week.
transaction_df['avg_purchases_per_week'].quantile(0.99)

In [None]:
# Box plot of avg_purchases_per_week over all remaining customers.
fig, ax = plt.subplots(figsize=(12, 4))
ax.set_title("distributions of Avg number of purchases per weeks per customer")
sns.boxplot(data=transaction_df, x='avg_purchases_per_week', ax=ax)
plt.show()

In [None]:
# Histogram of avg_purchases_per_week, restricted to values below 15.
below_15 = transaction_df[transaction_df['avg_purchases_per_week'] < 15]
fig, ax = plt.subplots(figsize=(12, 4))
ax.set_title("distributions of Avg number of purchases per weeks per customer")
sns.histplot(data=below_15, x='avg_purchases_per_week', ax=ax)
plt.show()

In [None]:
# Preview the first five customers with their derived features.
transaction_df.head(5)

In [None]:
# One row per customer, so the row count is the number of customers.
print("number of customers:", transaction_df.shape[0])

In [None]:
def avg_gap_betweek_weeks(weeks):
    """Return the mean gap, in weeks, between consecutive distinct purchase weeks.

    Duplicate week indices (several purchases in the same week) are collapsed
    first, so same-week purchases do not contribute zero-length gaps and every
    real gap is counted once per occurrence. For example, ``[0, 1, 2, 3, 13]``
    has gaps ``[1, 1, 1, 10]`` and yields ``3.25``.

    Args:
        weeks: sequence of integer week indices for one customer; order does
            not matter because the values are sorted internally.

    Returns:
        The mean difference between consecutive distinct weeks, or ``0.0`` when
        there are fewer than two distinct weeks (no gap exists). Callers can
        therefore still use ``!= 0`` to select customers with more than one
        purchase week.
    """
    # np.unique both de-duplicates and sorts the week indices.
    distinct_weeks = np.unique(weeks)
    if distinct_weeks.size < 2:
        return 0.0
    return np.mean(np.diff(distinct_weeks))

In [None]:
# Apply avg_gap_betweek_weeks to each customer's list of purchase weeks.
transaction_df['avg_week_gap'] = transaction_df['week'].map(avg_gap_betweek_weeks)
transaction_df.head()

In [None]:
# Summary statistics of avg_week_gap, ignoring customers whose gap is exactly 0.
transaction_df.loc[transaction_df['avg_week_gap'] != 0, 'avg_week_gap'].describe()

In [None]:
# 99th percentile of the average week gap over all customers.
transaction_df['avg_week_gap'].quantile(0.99)

In [None]:
# Histogram of avg_week_gap for customers whose gap is non-zero,
# with x ticks every 3 weeks from 0 up to (not including) 50.
nonzero_gap = transaction_df[transaction_df['avg_week_gap'] != 0]
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_xticks(np.arange(0, 50, 3))
ax.set_title("distribution of average gap between weeks.")
sns.histplot(data=nonzero_gap, x='avg_week_gap', ax=ax)
plt.show()

In [None]:
# Scatter of distinct purchase weeks (x) against average week gap (y).
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("scatter plot of Number Of Weeks (vs) Average Week Gap")
ax.set_xticks(np.arange(0, 25, 3))
sns.scatterplot(data=transaction_df, x='num_weeks', y='avg_week_gap', ax=ax)
plt.show()

In [None]:
# Spread (standard deviation) of avg_week_gap within each num_weeks group.
gap_by_num_weeks = transaction_df.groupby('num_weeks')['avg_week_gap']
gap_by_num_weeks.std()

In [None]:
# Customers with more than one purchase week whose average gap exceeds 30 weeks.
long_gap_mask = (transaction_df.num_weeks > 1) & (transaction_df.avg_week_gap > 30)
print("number of customers with avg. week gap > 30:", len(transaction_df[long_gap_mask]))

In [None]:
# How many purchase weeks do the long-gap (> 30) customers have?
long_gap_customers = transaction_df[
    (transaction_df['num_weeks'] > 1) & (transaction_df['avg_week_gap'] > 30)
]
long_gap_customers['num_weeks'].value_counts()

# Observations:

1. The number of weeks in which a customer made a purchase is long-tailed, i.e. the number of customers drops off as the number of active purchase weeks increases.

2. For the weeks in which a customer does make a purchase, the average number of items purchased per week has a 90th percentile of 15 items and a 75th percentile of fewer than 5 items.

3. If we consider the average gap between the weeks in which purchases happen, most users come back within 9 weeks. We can also observe that a few customers have an average gap > 40 weeks --> these are old customers whose behaviour is difficult to estimate, as their preferences could have changed.

4. As can be seen, the variation in the average week gap decreases as the number of purchase weeks increases.

5. There are about 2273 customers with an average week gap > 30 --> these customers come back to the platform after a gap of more than 30 weeks on average. Inference is difficult for them because the user's interests may have changed.

In [None]:
# 80th percentile of the average week gap over all customers.
transaction_df['avg_week_gap'].quantile(0.8)

# 

In [None]:
# Preview the first five customers before visualising one of them.
transaction_df.head(5)

# Visualize images

In [None]:
def read_image(imgpath):
    """Load an image from disk as an RGB array ready for ``plt.imshow``.

    ``cv2.imread`` decodes colour images in BGR channel order, whereas
    matplotlib interprets a 3-channel array as RGB, so the channels are
    reordered here to avoid swapped red/blue colours when plotting.

    Args:
        imgpath: path of the image file to read.

    Returns:
        The decoded image with channels in RGB order.

    Raises:
        FileNotFoundError: if OpenCV could not read the file. ``cv2.imread``
            signals this by returning ``None`` rather than raising.
    """
    img = cv2.imread(imgpath)
    if img is None:
        raise FileNotFoundError(f"could not read image: {imgpath}")
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
# Show every article bought by one example customer (4th row of transaction_df)
# together with the week index in which it was bought.
row = transaction_df.iloc[3]
products = row.article_id
week = row.week

# `week` and `products` are parallel lists built by the same groupby, so they
# can be walked together.
for purchase_week, imgname in zip(week, products):
    print("week:", purchase_week)
    print("product:", imgname)

    # img2path only contains ids for which an image file was found on disk;
    # skip articles without one instead of failing with a KeyError.
    imgpath = img2path.get(imgname)
    if imgpath is None:
        print("no image available for product:", imgname)
        continue

    img = read_image(imgpath)
    plt.imshow(img)
    plt.show()
    
    

Looking at the images, we can see that the user tends to buy closely related products.

In [None]:
# Catalogue details of the articles bought by the example customer.
bought_mask = article_df['article_id'].isin(products)
article_df.loc[bought_mask]