<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#5642C5;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<h1 style="padding: 10px;
              color:white;">

              H&M Data Visualization
</h1>
</div>

<h1 style="background-color:blue;">If you find this notebook helpful please upvote it</h1>

<h1 style="color:blue">If you find this notebook helpful please upvote it</h1>

***

<h4 style="color:purple;">In this competition, H&M wants you to build a personalized fashion recommendation system. They have a huge number of products on their online platform, but with too many choices customers might not quickly find what interests them or what they are looking for, and ultimately they might not make a purchase. Good recommendations are meant to enhance the shopping experience.</h4>

***

# Dataset

<ol style="color:purple;"><li><h4>images/ - a folder of images corresponding to each article_id; images are placed in subfolders starting with the first three digits of the article_id; note, not all article_id values have a corresponding image.</h4></li>
    <li><h4>articles.csv - detailed metadata for each article_id available for purchase</h4></li>
    <li><h4>customers.csv - metadata for each customer_id in dataset</h4></li>
    <li><h4>sample_submission.csv - a sample submission file in the correct format</h4></li>
<li><h4>transactions_train.csv - the training data, consisting of the purchases of each customer for each date, as well as additional information. Duplicate rows correspond to multiple purchases of the same item. Your task is to predict the article_ids each customer will purchase during the 7-day period immediately after the training data period.</h4></li></ol>

***

In [None]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import os
import numpy as np
import cv2
import warnings
import missingno as msno
warnings.filterwarnings('ignore')

# Notebook-wide pandas display settings: never truncate rows or columns,
# and render floats in fixed-point notation with six decimals.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.float_format = '{:f}'.format

In [None]:
IMG_DIR="../input/h-and-m-personalized-fashion-recommendations/images"

In [None]:
# Read all the competition CSV files.
# The input directory is defined once so a different dataset location only
# needs to be changed in a single place.
DATA_DIR = "../input/h-and-m-personalized-fashion-recommendations"

articles = pd.read_csv(f"{DATA_DIR}/articles.csv")
customers = pd.read_csv(f"{DATA_DIR}/customers.csv")
transactions = pd.read_csv(f"{DATA_DIR}/transactions_train.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

<h4 style="color:blue;">Let's display first few rows of all the dataframes</h4>

In [None]:
articles.head(2)

In [None]:
customers.head(2)

In [None]:
transactions.head(2)

In [None]:
sample_submission.head()

<h1 style="color:purple;">Data Visualization</h1>

In [None]:
# Summarise the (rows, columns) shape of the three main dataframes in one table
shape = pd.DataFrame(
    [articles.shape, customers.shape, transactions.shape],
    columns=["Row", "Column"],
    index=['articles', 'customers', 'transactions'],
)

# Header-cell styles, applied per row label of the summary table below
green = [{'selector': 'th', 'props': 'background-color: green'}]
red = [{'selector': 'th', 'props': 'background-color: red'}]
shape.style.set_table_styles(
    {"articles": green, "customers": red, "transactions": green},
    axis=1,
)

<h3 style="color:purple;">Visualization of missing values</h3>

In [None]:
# Missing values in articles dataframe
msno.bar(articles,sort='ascending',color='#7209b7',figsize=(20,10),fontsize=14)

### detail_desc column has very few missing values

In [None]:
# Missing values in customers dataframe
msno.bar(customers,color='#f72585',sort='ascending',figsize=(20,10),fontsize=14)

### The Active and FN columns of the customers dataset have more than 60% null values

In [None]:
# Missing values in transactions dataframe
msno.bar(transactions,color='#4895ef',sort='ascending',figsize=(20,10),fontsize=14)

### The transactions dataset does not have any null values at all

In [None]:
articles.head()

In [None]:
# Categorical article attributes whose most common values are charted below
cols = [
    'prod_name', 'product_type_name', 'product_group_name',
    'graphical_appearance_name', 'colour_group_name', 'department_name',
    'index_name', 'section_name', 'garment_group_name',
]

# One horizontal count plot per attribute, limited to its 10 most frequent values
for col in cols:
    plt.figure(figsize=(10, 10))
    sns.countplot(
        y=col,
        data=articles,
        order=articles[col].value_counts().index[:10],
    )
    plt.xlabel("Count", size=20, color="purple")
    plt.ylabel(col, size=20, color="purple")
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.tight_layout()
    plt.show()
    

### Top 10 most frequent values in each of the columns listed above.

In [None]:
# The leading digits of article_id identify the image sub-directory.
# Keep the first two digits of the id (as currently loaded) in a new `dir`
# column; the image helpers later prefix it with "0" to build the folder name.

articles['dir'] = articles['article_id'].astype(str).str[:2].astype(int)

In [None]:
articles.head()

In [None]:
# Bar chart of the number of articles in each product group
articles.groupby('product_group_name')['article_id'].count().plot.bar(figsize=(10, 8))
plt.xticks(size=14)
plt.yticks(size=14)
plt.show()

### Garment Upper body, Garment Lower body, Garment Full body have maximum number of articles.

In [None]:
# Bar chart of the number of articles under each index name
articles.groupby('index_name')['article_id'].count().plot.bar(figsize=(10, 8))
plt.xticks(size=14)
plt.yticks(size=14)
plt.show()

In [None]:
# Bar chart of the number of articles in each garment group
articles.groupby('garment_group_name')['article_id'].count().plot.bar(figsize=(10, 8))
plt.xticks(size=14)
plt.yticks(size=14)
plt.show()

### Jersey Fancy and Accessories have by far the largest number of articles

In [None]:
# Bar chart of the number of articles in each section (wider figure: many sections)
articles.groupby('section_name')['article_id'].count().plot.bar(figsize=(15, 8))
plt.xticks(size=14)
plt.yticks(size=14)
plt.show()

### Womens Everyday collection and Divided collection appear most often in articles

In [None]:
articles.head()

In [None]:
articles.groupby(['product_group_name','index_name']).count()['article_id']

### Ladieswear has maximum number of articles followed by Divided and Menswear

<h3 style="color:purple">The article_id in the articles dataset is the same as the image id in the images folder.
Here I fetch the article_ids belonging to a particular product_group_name (for instance, the article_ids of the Shoes product group) and visualize the corresponding images.</h3>

In [None]:
# Get the article ids (and their image folders) for one product group name
def get_article_id(df, group_name, id_width=10):
    """Return zero-padded article ids and image folder names for a product group.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles table with at least ``product_group_name`` and ``article_id``
        columns.
    group_name : str
        Value of ``product_group_name`` to select.
    id_width : int, default 10
        Number of characters in an image file name; ids are left-padded with
        zeros up to this width (for ids that lost one leading zero this is the
        same as prefixing a single "0").

    Returns
    -------
    pandas.DataFrame
        Columns ``article_id`` (zero-padded string) and ``dir`` (the first three
        characters of the padded id, i.e. the image sub-folder), with a fresh
        0..n-1 index. ``df`` itself is never modified.
    """
    # Select only the needed column with .loc so we never assign into a
    # filtered slice of the caller's frame (chained-assignment warning).
    selected_ids = df.loc[df['product_group_name'] == group_name, 'article_id']
    padded_ids = selected_ids.astype(str).str.zfill(id_width)

    # Images live in sub-folders named after the first three digits of the id.
    result = pd.DataFrame({'article_id': padded_ids, 'dir': padded_ids.str[:3]})
    return result.reset_index(drop=True)


# Read the images for the fetched article ids and collect them in a list
def read_img(data, n=10,
             img_dir="../input/h-and-m-personalized-fashion-recommendations/images",
             size=(224, 224)):
    """Load up to ``n`` article images as RGB arrays resized to ``size``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string columns ``article_id`` (image file stem) and ``dir``
        (image sub-folder), as returned by ``get_article_id``.
    n : int, default 10
        Maximum number of images to return.
    img_dir : str
        Root folder that contains the image sub-folders.
    size : tuple of int, default (224, 224)
        Target (width, height) passed to ``cv2.resize``.

    Returns
    -------
    list of numpy.ndarray
        At most ``n`` images in RGB channel order. Articles without a readable
        image file are skipped, so the list can be shorter than ``n``.
    """
    images = []
    for arti, di in zip(data['article_id'], data['dir']):
        if len(images) >= n:
            break
        im = cv2.imread(os.path.join(img_dir, di, arti + ".jpg"))
        if im is None:
            # Not every article_id has an image; imread signals this with None.
            continue
        # OpenCV loads BGR, matplotlib expects RGB: convert so colours are right.
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        images.append(cv2.resize(im, size, interpolation=cv2.INTER_CUBIC))
    return images

# Display the first few images of a list of image arrays side by side
def show_img(data, n_cols=5):
    """Draw the first ``n_cols`` images of ``data`` in a single row.

    Parameters
    ----------
    data : list of array-like
        Images accepted by ``Axes.imshow``.
    n_cols : int, default 5
        Number of panels in the row (must be at least 1).

    Returns
    -------
    None
        The figure is drawn as a side effect. When ``data`` holds fewer than
        ``n_cols`` images the remaining panels are left blank instead of
        raising an IndexError.
    """
    # squeeze=False keeps a 2-D axes array even when n_cols == 1
    f, axarr = plt.subplots(1, n_cols, figsize=(15, 10), squeeze=False)
    axes = axarr[0]
    for ax, img in zip(axes, data):
        ax.imshow(img)
    # Hide the panels for which no image is available
    for ax in axes[len(data):]:
        ax.axis('off')
    f.tight_layout()
    
def call(df,group_name):
    """Show sample images for one product group.

    Chains the three helpers: ``get_article_id`` selects the ids and folders
    for ``group_name`` in ``df``, ``read_img`` loads the image arrays, and
    ``show_img`` draws them. Returns whatever ``show_img`` returns.
    """
    return show_img(read_img(get_article_id(df, group_name)))



    

In [None]:
# Images from Garment Lower body product group name
call(articles,"Garment Lower body")

In [None]:
# Images from Garment Upper body product group name
call(articles,"Garment Upper body")

In [None]:
# Images from Accessories product group name
call(articles,"Accessories")

In [None]:
# Images from Underwear product group name
call(articles,"Underwear")

In [None]:
# Images from Swimwear product group name
call(articles,"Swimwear")

In [None]:
# Images from Socks and tights product group name
call(articles,"Socks & Tights")

### Ladieswear has the maximum chunk in all unique identifiers  

In [None]:
# Stacked horizontal histogram: articles per garment group, split by index group
f, ax = plt.subplots(figsize=(15, 12))
sns.histplot(
    data=articles,
    y='garment_group_name',
    hue='index_group_name',
    multiple="stack",
    color='orange',
    ax=ax,
)
ax.set_xlabel('count by garment group', size=16)
ax.set_ylabel('garment group', size=16)
plt.xticks(size=16)
plt.yticks(size=16)
plt.show()

In [None]:
articles.groupby(['index_group_name', 'index_name']).count()['article_id']

In [None]:
pd.options.display.max_rows = None
articles.groupby(['product_group_name', 'product_type_name']).count()['article_id']

In [None]:
# Print the number of distinct values of every column whose name does not
# contain 'no', 'code' or 'id'.
for col in articles.columns:
    if any(tag in col for tag in ('no', 'code', 'id')):
        continue
    un_n = articles[col].nunique()
    print(f'n of unique {col}: {un_n}')

In [None]:
articles.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Build a word cloud from the article descriptions (detail_desc column)
stopwords = set(STOPWORDS)

# Drop missing descriptions first: converting NaN with str() would otherwise
# add the literal word "nan" to the corpus once per missing row.
descriptions = articles.detail_desc.dropna().astype(str)

# Lower-case every whitespace-separated token and join them into one corpus string
comment_words = " ".join(
    token.lower() for text in descriptions for token in text.split()
)

wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

# Plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

### Visualization of text data from detail_desc column.
### The size of each word indicates its frequency or importance

In [None]:
customers.club_member_status.value_counts()

In [None]:
# Donut chart of the share of customers in each club member status
status_counts = customers.club_member_status.value_counts()  # computed once, reused below
labels = list(status_counts.index)
sizes = status_counts.values
explode = (0.05,) * len(sizes)  # one offset per wedge, whatever the number of statuses

fig, ax = plt.subplots(figsize=(16, 12))
# The title names the plotted column (it previously read 'Index Name', copied from another chart)
fig.suptitle('Club Member Status', size=20, font="Serif")
ax.pie(sizes, explode=explode, startangle=60, labels=labels, autopct='%1.3f%%',
       pctdistance=0.7, colors=["#4895ef", "#f72585", "#7209b7"],
       textprops={"fontsize": 15})
# A white circle in the centre turns the pie into a donut
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

In [None]:
customers.head()

In [None]:
# Count the non-null entries of every column per postal code and put the
# postal codes with the most customer_ids first
data_postal = (
    customers
    .groupby('postal_code', as_index=False)
    .count()
    .sort_values('customer_id', ascending=False)
)
data_postal.head()

In [None]:
transactions.head()

In [None]:
transactions.sales_channel_id.value_counts()

In [None]:
# Donut chart of the share of transactions per sales channel id
channel_counts = transactions.sales_channel_id.value_counts()  # scan the transactions table only once
labels = list(channel_counts.index)
sizes = channel_counts.values
explode = (0.05,) * len(sizes)  # one offset per wedge, whatever the number of channels

fig, ax = plt.subplots(figsize=(16, 12))
fig.suptitle('Sales Channel', size=20, font="Serif")
ax.pie(sizes, explode=explode, startangle=60, labels=labels, autopct='%1.3f%%',
       pctdistance=0.7, colors=["#4895ef", "#f72585", "#7209b7"],
       textprops={"fontsize": 15})
# A white circle in the centre turns the pie into a donut
ax.add_artist(plt.Circle((0, 0), 0.4, fc='white'))
plt.show()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
# Histogram of customer ages (50 bins) on a dark-grid seaborn theme
sns.set_style("darkgrid")
f, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=customers, x='age', bins=50, color='orange', ax=ax)
ax.set_xlabel('Distribution of the customers age')
plt.show()

In [None]:
# Box plot of transaction prices, used to eyeball price outliers
sns.set_style("darkgrid")
f, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=transactions, x='price', color='orange', ax=ax)
ax.set_xlabel('Price outliers')
plt.show()

In [None]:
# Count the rows recorded for every customer, then show the ten customers
# with the highest number of priced transactions
transactions_byid = transactions.groupby('customer_id').count()
transactions_byid.sort_values(by='price', ascending=False)['price'].head(10)

<center><h1 style="color:red;">Work in Progress</h1></center>