<!-- 

import numpy as np 
import pandas as pd 



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename)) -->



# H&M Personalized Fashion Recommendations

**Analysing the data and developing product recommendations based on data from previous transactions. For each customer_id observed in the training data, we predict up to 12 article_ids: the items that customer is expected to buy in the 7-day period immediately following the training period.**

This notebook demonstrates that recommending items that are frequently purchased together is effective. The strategy is as follows:

* recommend items the customer has previously purchased
* recommend items that are frequently bought together with the customer's previous purchases
* recommend popular items

# Table of Contents

**1. Loading the dataset:** Load the data and import the libraries.

**2. Data Cleaning:**
*   Deleting redundant columns.
*   Renaming the columns.
*   Dropping duplicates.
*   Cleaning individual columns.
*   Remove the NaN values from the dataset

**3. Data Visualization:** Using plots to find relations between the features.
1. Articles Data:
   * Product Types per Product Group
   * Articles count per each Product Group
   * Number of Articles per each Product Type
   * Number of Articles per each index name
   * The garments grouped by index
   * Number of Articles per each Perceived Colour Value Name
   * Number of Articles per each Colour group
2. Customers Data:
   * Age distribution of customers
   * Distribution of Club member status
   * Distribution of Fashion News Frequency
3. Transactions Data:
   * Price Outliers
   * Mean Price of Each Product Group Name

**4. Items recommendations based on data from previous transactions**
*  Each Customer's Last Week of Purchases
*  Recommend Most Often Previously Purchased Items
*  Recommend Items Purchased Together
*  Recommend Last Week's Most Popular Items
*  Write Submission CSV

# 1. Loading the dataset

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the three competition tables (articles, customers, transactions) from
# the Kaggle input directory.
INPUT_DIR = '../input/h-and-m-personalized-fashion-recommendations'
articles, customers, transactions = (
    pd.read_csv(f'{INPUT_DIR}/{table}.csv')
    for table in ('articles', 'customers', 'transactions_train')
)

In [None]:
articles.head()

In [None]:
articles.info()

# 2. Data Cleaning

In [None]:
articles.isnull().sum()

In [None]:
articles.nunique()

In [None]:
# Print the number of distinct values for every column whose name does not
# contain 'no', 'id' or 'code'.
skip_tokens = ('no', 'id', 'code')
described_cols = [
    col for col in articles.columns
    if all(token not in col for token in skip_tokens)
]
for col in described_cols:
    print(f'number of unique {col} is: {articles[col].nunique()}')

# Articles Data : 

**Product Types Per Product Group**

In [None]:
# Count the distinct product types inside each product group and plot the
# groups from most to least varied.
plt.figure(figsize=(12, 5))
types_per_group = (
    articles.groupby('product_group_name')['product_type_name']
    .nunique()
    .rename_axis('Product Group')
    .reset_index(name='Product Types')
    .sort_values('Product Types', ascending=False)
)
sns.barplot(y='Product Group', x='Product Types', data=types_per_group)
# Title previously read 'NUmber ...' (typo).
plt.title('Number of Product Types per Product group')
# Render explicitly, as the sibling plotting cells do, instead of leaving the
# Text repr of plt.title as the cell output.
plt.show()

**Next, the product group / product type structure. Accessories are very varied; the most numerous are bags, earrings and hats. However, trousers prevail.**

In [None]:
# Show the complete product-group / product-type breakdown.
# The row limit is lifted only for this one display: setting
# pd.options.display.max_rows = None globally would make every later cell
# print its output untruncated as well.
group_type_counts = articles.groupby(by=['product_group_name', 'product_type_name'])['article_id'].count()
with pd.option_context('display.max_rows', None):
    display(group_type_counts)

**Articles count per each product group**

In [None]:
# Number of articles in each product group, largest group first.
plt.figure(figsize=(10, 5))
articles_per_group = (
    articles.groupby(by='product_group_name')['article_id']
    .count()
    .rename_axis('Product Group')
    .reset_index(name='Articles')
    .sort_values('Articles', ascending=False)
)
ax = sns.barplot(x='Product Group', y='Articles', data=articles_per_group)
# Rotate the labels through tick_params instead of
# set_xticklabels(get_xticklabels()): re-assigning the label texts freezes
# them and can emit a FixedLocator UserWarning on recent matplotlib/seaborn.
ax.tick_params(axis='x', labelrotation=90)
plt.title('Number of Articles per each Product Group')
plt.show()

**Number of Articles per each Product Type**

In [None]:
# Number of articles for the 50 largest product types.
plt.figure(figsize=(15, 5))
articles_per_type = (
    articles.groupby(by='product_type_name')['article_id']
    .count()
    .rename_axis('Product Type')
    .reset_index(name='Articles')
    .sort_values('Articles', ascending=False)
    .head(50)  # keep the plot readable: only the 50 most common types
)
ax = sns.barplot(x='Product Type', y='Articles', data=articles_per_type)
# Rotate the labels through tick_params instead of
# set_xticklabels(get_xticklabels()): re-assigning the label texts freezes
# them and can emit a FixedLocator UserWarning on recent matplotlib/seaborn.
ax.tick_params(axis='x', labelrotation=90)
plt.title('Number of Articles per each Product Type ')
plt.show()

In [None]:
articles.groupby(by=['index_group_name','index_name'])['article_id'].count()

**Number of articles per each index name**

In [None]:
# Horizontal count plot: how many articles belong to each index name.
ax = sns.countplot(data=articles, y='index_name')
ax.set_xlabel('Count by index name')
ax.set_ylabel('Index Name')
ax.set_title('Number of articles per each index name')


**The garments grouped by index: Jersey Fancy is the most frequent garment group, especially for women and children. The next most frequent is Accessories, which covers a wide variety of low-priced items.**

In [None]:
# Stacked horizontal histogram of garment groups, split by index name.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(
    data=articles,
    y='garment_group_name',
    hue='index_name',
    multiple="stack",
    ax=ax,
)
ax.set_xlabel('Count by Garment Group Name')
ax.set_ylabel('Garment Group Name')

**Number of Articles per each Perceived Colour Value Name**

In [None]:
# Distinct articles per perceived colour value, most common value first.
colour_value_counts = (
    articles.groupby(["perceived_colour_value_name"])["article_id"]
    .nunique()
    .rename_axis('Perceived Colour Value Name')
    .reset_index(name='Articles')
    .sort_values(['Articles'], ascending=False)
)
plt.figure(figsize=(8, 6))
plt.title('Number of Articles per each Perceived Colour Value Name')
sns.set_color_codes("pastel")
ax = sns.barplot(x='Perceived Colour Value Name', y="Articles", data=colour_value_counts)
# tick_params rotates the labels without re-assigning their texts, which
# set_xticklabels(get_xticklabels()) did (and which can trigger a
# FixedLocator UserWarning). The unused `locs, labels = plt.xticks()` lookup
# was dropped.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

**Number of Articles per each Perceived Colour Master Name**

In [None]:
# Distinct articles per perceived colour master name, most common first.
colour_master_counts = (
    articles.groupby(["perceived_colour_master_name"])["article_id"]
    .nunique()
    .rename_axis('Perceived Colour Master Name')
    .reset_index(name='Articles')
    .sort_values(['Articles'], ascending=False)
)
plt.figure(figsize=(12, 6))
plt.title('Number of Articles per each Perceived Colour Master Name')
sns.set_color_codes("pastel")
ax = sns.barplot(x='Perceived Colour Master Name', y="Articles", data=colour_master_counts)
# tick_params rotates the labels without re-assigning their texts, which
# set_xticklabels(get_xticklabels()) did (and which can trigger a
# FixedLocator UserWarning). The unused `locs, labels = plt.xticks()` lookup
# was dropped.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Distinct articles per colour group, most common group first.
colour_group_counts = (
    articles.groupby(["colour_group_name"])["article_id"]
    .nunique()
    .rename_axis('Color Group Name')
    .reset_index(name='Articles')
    .sort_values(['Articles'], ascending=False)
)
plt.figure(figsize=(12, 6))
plt.title('Number of Articles per each  Colour group')
sns.set_color_codes("pastel")
ax = sns.barplot(x='Color Group Name', y="Articles", data=colour_group_counts)
# tick_params rotates the labels without re-assigning their texts, which
# set_xticklabels(get_xticklabels()) did (and which can trigger a
# FixedLocator UserWarning). The unused `locs, labels = plt.xticks()` lookup
# was dropped.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

# Customers Data

In [None]:
customers.head()


In [None]:
customers.info()

In [None]:
customers.isnull().sum()

**There are no duplicates in customers**

In [None]:
# Rows minus distinct customer ids: 0 means every customer appears once.
len(customers) - customers['customer_id'].nunique()

**Age Distribution**

In [None]:
# Histogram of customer ages using 40 bins.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=customers, x='age', bins=40, ax=ax)
ax.set_title('Age Distribution of Customers')

**Distribution of Club member status**

In [None]:
# Number of customers in each club-member status.
ax = sns.countplot(data=customers, x='club_member_status')
ax.set_xlabel('Distribution of Club member status')

In [None]:
customers['fashion_news_frequency'].unique()

In [None]:
# Collapse every value other than 'Regularly' / 'Monthly' (including missing
# values) into the single label 'None'.
is_known_frequency = customers['fashion_news_frequency'].isin(['Regularly', 'Monthly'])
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].where(is_known_frequency, 'None')

In [None]:
customers['fashion_news_frequency'].unique()

In [None]:
# Customers per fashion-news frequency; feeds the pie chart below.
pie_data = customers.groupby('fashion_news_frequency')[['customer_id']].count()
pie_data

**Distribution of Fashion News Frequency**

In [None]:

# Pie chart of the share of customers in each fashion-news frequency.
f, ax = plt.subplots(figsize=(10, 5))
colors = sns.color_palette('colorblind')
ax.pie(pie_data['customer_id'], labels=pie_data.index, colors=colors)
ax.set_title('Distribution of Fashion News Frequency')

# Transactions Data

In [None]:
transactions.head()

In [None]:
# Show floats with four decimals (prices are small fractions), then
# summarise the price column.
pd.options.display.float_format = '{:.4f}'.format
transactions['price'].describe()

**Price Outliers**

In [None]:
# Box plot of transaction prices to expose the outliers.
ax = sns.boxplot(data=transactions, x='price')
ax.set_xlabel('Price outliers')

In [None]:
tran_temp=transactions.groupby(by='customer_id').count()


In [None]:
tran_temp.sort_values('price',ascending=False)['price'][:10]

In [None]:
articles_main=articles[['article_id', 'prod_name', 'product_type_name', 'product_group_name', 'index_name']]

In [None]:
articles_transactions_merge=transactions[['customer_id', 'article_id', 'price', 't_dat']].merge(articles_main,on='article_id',how='left')

In [None]:
articles_transactions_merge.head()

In [None]:
# f, ax = plt.subplots(figsize=(25,15))
# ax.set_xlabel('price outliers')
# ax.set_ylabel('index name')
# ax=sns.boxplot(x='price', y='product_group_name',data=articles_transactions_merge)
# ax.xaxis.set_tick_params(labelsize=22)
# ax.yaxis.set_tick_params(labelsize=22)


In [None]:
# f, ax = plt.subplots(figsize=(25,15))
# ax.set_xlabel('price outliers')
# ax.set_ylabel('index name')
# ax=sns.boxplot(x='price', y='index_name',data=articles_transactions_merge)
# ax.xaxis.set_tick_params(labelsize=22)
# ax.yaxis.set_tick_params(labelsize=22)

**Mean Price of Each Product Group Name**

In [None]:
# mean_price=articles_transactions_merge[['index_name','price']].groupby(by='product_group_name').mean()
# sns.barplot(x=mean_price.price,y=mean_price.index,color='orange')
# plt.xlabel('Price')
# plt.ylabel('Index Name')

In [None]:
# Average transaction price of each product group, most expensive first.
mean_price = (
    articles_transactions_merge
    .groupby('product_group_name')[['price']]
    .mean()
    .sort_values(by='price', ascending=False)
)
ax = sns.barplot(x=mean_price['price'], y=mean_price.index, color='green')
ax.set_xlabel('Price')
ax.set_ylabel('Product Group Name')

# 4. Item Recommendations Based on Previous Transactions

In [None]:

# Load only the three columns the recommender needs, typed while parsing:
# reading the full transactions file and converting afterwards holds the
# unused columns and a 64-bit article_id in memory for no benefit.
train = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'article_id': 'int32'},
    parse_dates=['t_dat'],
)
# usecols keeps the file's column order; select explicitly so the layout of
# the cached parquet file does not depend on it.
train = train[['t_dat', 'customer_id', 'article_id']]
# Cache the slim table; the popular-items step reloads it from here.
train.to_parquet('train.pqt', index=False)
print(train.shape)
train.head()

# Last Week Purchases of each customer 

In [None]:
# Keep, for each customer, only the purchases made within the 7 days ending
# on that customer's most recent purchase date.
#
# transform('max') broadcasts each customer's last purchase date back onto
# their rows. Unlike merging a helper frame, re-running the cell does not
# create max_dat_x / max_dat_y columns and then fail on train.max_dat.
last_purchase = train.groupby('customer_id')['t_dat'].transform('max')
train = train.assign(
    max_dat=last_purchase,
    diff_dat=(last_purchase - train['t_dat']).dt.days,
)
train = train.loc[train['diff_dat'] <= 6]  # 0..6 days back = one week
print('Train shape:', train.shape)

# Recommend Most Often Previously Purchased Items


In [None]:
# Rank each customer's recent purchases: items bought most often come first,
# ties are broken by the most recent purchase date. Only one row is kept per
# (customer, article) pair.
pair_keys = ['customer_id', 'article_id']
rank_order = ['ct', 't_dat']

purchase_counts = train.groupby(pair_keys)['t_dat'].count().reset_index(name='ct')
train = train.merge(purchase_counts, how='left', on=pair_keys)
train = (
    train.sort_values(rank_order, ascending=False)
    .drop_duplicates(pair_keys)
    .sort_values(rank_order, ascending=False)
)
train.head()

# Recommend Items Purchased Together

In [None]:
# Look up, for every recently purchased article, its paired article.
# `pairs` is loaded as a dict-like object keyed by article_id (presumably the
# article most often bought together with the key -- confirm against the
# code that generated pairs_cudf.npy). Articles without an entry map to NaN.
#
# SECURITY NOTE: allow_pickle=True unpickles the file, which can execute
# arbitrary code. Only load this from a dataset you trust.
#
# numpy/pandas are already imported in the notebook's first code cell, so the
# redundant mid-notebook re-import was removed.
pairs = np.load('../input/pairs-cudfnpy/pairs_cudf.npy', allow_pickle=True).item()
train['article_id2'] = train.article_id.map(pairs)

In [None]:
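# Code that was used to generate the pairs_cudf.npy file (kept for reference only; it appears to need RAPIDS cuDF, e.g. for `hex_to_int`)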


# df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')
# print('Transactions shape',df.shape)
# display( df.head() )

# # REDUCE MEMORY OF DATAFRAME
# df = df[['customer_id','article_id']]
# df.customer_id = df.customer_id.str[-16:].str.hex_to_int().astype('int64')
# df.article_id = df.article_id.astype('int32')
# _ = gc.collect()

# vc = df.article_id.value_counts()
# pairs = {}
# for j,i in enumerate(vc.index.values[1000:1032]):
#     #if j%10==0: print(j,', ',end='')
#     USERS = df.loc[df.article_id==i.item(),'customer_id'].unique()
#     vc2 = df.loc[(df.customer_id.isin(USERS))&(df.article_id!=i.item()),'article_id'].value_counts()
#     pairs[i.item()] = [vc2.index[0], vc2.index[1], vc2.index[2]]
# train['article_id2'] = train.article_id.map(pairs)

In [None]:
# RECOMMENDATION OF PAIRED ITEMS
train2 = train[['customer_id','article_id2']].copy()
train2 = train2.loc[train2.article_id2.notnull()]
train2 = train2.drop_duplicates(['customer_id','article_id2'])
train2 = train2.rename({'article_id2':'article_id'},axis=1)

In [None]:
# CONCATENATE PAIRED ITEM RECOMMENDATION AFTER PREVIOUS PURCHASED RECOMMENDATIONS
train = train[['customer_id','article_id']]
train = pd.concat([train,train2],axis=0,ignore_index=True)
train.article_id = train.article_id.astype('int32')
train = train.drop_duplicates(['customer_id','article_id'])

In [None]:
# CONVERT RECOMMENDATIONS INTO SINGLE STRING
train.article_id = ' 0' + train.article_id.astype('str')
preds = pd.DataFrame( train.groupby('customer_id').article_id.sum().reset_index() )
preds.columns = ['customer_id','prediction']
preds.head()

# Recommend Last Week's Most Popular Items

In [None]:
# Reload the full (unfiltered) transactions cached earlier and find the 12
# best-selling articles of the final week of the training data.
train = pd.read_parquet('train.pqt')
train.t_dat = pd.to_datetime(train.t_dat)
# Derive the start of the last week from the data instead of hard-coding it.
# NOTE(review): the previous literal '2020-09-16' equals this value if the
# last transaction date is 2020-09-22 -- confirm against the dataset.
last_week_start = train.t_dat.max() - pd.Timedelta(days=6)
train = train.loc[train.t_dat >= last_week_start]
top_article_ids = train.article_id.value_counts().index[:12]
# Same token format as the per-customer predictions: a leading space and a
# 10-character zero-padded id per item.
top12 = ' ' + ' '.join(top_article_ids.astype('str').str.zfill(10))
print("Last week's top 12 popular items:")
print( top12 )

# Output the Predictions into CSV

In [None]:
# Build the submission: each customer's personal recommendations first, then
# last week's most popular items as filler, capped at 12 article ids.
N_RECOMMENDATIONS = 12


def _merge_unique(personal, fallback, limit=N_RECOMMENDATIONS):
    """Join personal and fallback ids, dropping repeats, keeping `limit` ids.

    personal: whitespace-separated article ids ('' when the customer has none).
    fallback: list of article-id strings appended after the personal ones.
    Returns a single space-separated string of at most `limit` distinct ids,
    in first-seen order.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    ordered_unique = dict.fromkeys(personal.split() + fallback)
    return ' '.join(list(ordered_unique)[:limit])


sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
sub = sub[['customer_id']]
# Customers without personal recommendations get '' and therefore receive
# only the popular items.
sub = sub.merge(preds, on='customer_id', how='left').fillna('')
popular_items = top12.split()
# Previously top12 was appended verbatim and the string was cut at 131
# characters, so a popular item already in the personal list appeared twice
# and wasted one of the 12 slots. Merging by id avoids that and removes the
# magic length.
sub['prediction'] = [_merge_unique(personal, popular_items) for personal in sub['prediction']]
sub.to_csv('submission.csv', index=False)
sub.head()
