In [None]:
# need to install chart studio to use plotly
!pip install chart_studio

In [None]:
# import necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import chart_studio.plotly as py
from plotly import graph_objs as go
import chart_studio.tools as tls
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
# Never commit Chart Studio credentials in a notebook: read them from the
# environment instead (PLOTLY_USERNAME / PLOTLY_API_KEY). Any key that was
# previously hard-coded here should be treated as leaked and regenerated.
import os
_plotly_username = os.environ.get("PLOTLY_USERNAME")
_plotly_api_key = os.environ.get("PLOTLY_API_KEY")
if _plotly_username and _plotly_api_key:
    tls.set_credentials_file(username=_plotly_username, api_key=_plotly_api_key)
# NOTE(review): the charts in this notebook are drawn with plotly.offline.iplot /
# fig.show(), which presumably do not need Chart Studio credentials - confirm that
# skipping the call when the variables are unset is acceptable.
import missingno as msnum

In [None]:
# load data
# Keep the dataset location in one place so it can be re-pointed without
# editing every read_csv call; the resulting paths are unchanged.
DATA_DIR = "../input/h-and-m-personalized-fashion-recommendations"
articles = pd.read_csv(f"{DATA_DIR}/articles.csv")
customers = pd.read_csv(f"{DATA_DIR}/customers.csv")
transactions = pd.read_csv(f"{DATA_DIR}/transactions_train.csv")

In [None]:
# check shapes: displays the .shape of each of the three loaded frames as one tuple
articles.shape,customers.shape,transactions.shape

# Helper Functions

In [None]:
# helper function to draw pie chart
def pie_chart(df,column,top_n=5,entity="Articles"):
    """Draw a pie chart of the most frequent values of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column whose value frequencies are plotted.
    top_n : int, default 5
        How many of the most frequent values to keep as pie slices.
    entity : str, default "Articles"
        Noun used in the chart title; pass e.g. "Customers" when the frame
        is not the articles data. The default keeps the historical title.

    Returns
    -------
    The return value of ``iplot`` for the built figure dict.
    """
    # Count once and reuse the result for both labels and values;
    # value_counts() orders by descending frequency, so head() keeps the top slices.
    top_counts = df[column].value_counts().head(top_n)
    fig = {
        'data': [{'labels': top_counts.index, 'values': top_counts.values, 'type': 'pie'}],
        'layout': {"title": f"Pie Chart- Top {top_n} {entity} by {column}"},
    }

    return iplot(fig)

In [None]:
# helper function to draw histogram
def histogram(df,column,hue):
    """Show a plotly-express histogram of ``df`` with ``column`` on the y axis.

    Bars are coloured by the values of the ``hue`` column. The figure is
    displayed immediately; nothing is returned.
    """
    px.histogram(df, color=hue, y=column).show()

In [None]:
# helper function to draw barplot
def barplot(df,title):
    """Draw a bar chart of the number of distinct values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised (every column, numeric ones included).
    title : str
        Title shown above the chart.

    Returns
    -------
    The return value of ``iplot`` for the built figure dict.
    """
    # one distinct-value count per column, indexed by column name
    unique_counts = df.nunique()
    data = [go.Bar(x=unique_counts.index.tolist(),y=unique_counts.values.tolist())]
    layout = dict(title=title,xaxis=dict(title="Column Name"),yaxis=dict(title="Count"))
    # The layout has to be handed to iplot together with the traces; passing the
    # traces alone renders the chart without its title and axis labels.
    return iplot({'data': data, 'layout': layout})

In [None]:
# helper function to print value counts
def value_counts(df):
    """Print ``value_counts()`` for the descriptive columns of ``df``.

    A column is skipped when its name contains any of the substrings
    'no', 'code', 'id' or 'customer'. Each printed column is followed by a
    line of asterisks.
    """
    skipped_fragments = ('no', 'code', 'id', 'customer')
    for column in df.columns:
        if any(fragment in column for fragment in skipped_fragments):
            continue
        frequencies = df[column].value_counts()
        print("Unique values for the column:",column,"\n",frequencies,"\n")
        print("************************************************************************************")

In [None]:
def print_missing_values(df):
    """Print the name and null count of every column of ``df`` that has nulls.

    Columns without any null value produce no output. Nothing is returned.
    """
    # check for null values (only if its greater than 0)
    for col in df.columns:
        # isnull().sum() scans the whole column, so compute it once and reuse it
        missing = df[col].isnull().sum()
        if missing > 0:
            print(col,missing)

In [None]:
# plot missing values
def missing_values(df):
    """Show missingno's matrix view of the missing values in ``df``.

    The figure is titled and displayed immediately; nothing is returned.
    ``msnum.bar(df)`` and ``msnum.heatmap(df)`` are alternative views that
    could be drawn the same way (each followed by its own title and show).
    """
    msnum.matrix(df)
    plt.title("Matrix Representation - Missing Values")
    # No further pyplot calls after show(): a title set here would no longer
    # apply to the matrix figure that was just displayed.
    plt.show()

# EDA - Articles

In [None]:
# print every articles column that contains nulls, together with its null count
print_missing_values(articles)

- The only field with missing values is detail_desc

In [None]:
# plot missing values (had to comment this because of memory issues)
#missing_values(articles)

In [None]:
# number of distinct values in each column of the articles dataset
articles.nunique()

In [None]:
# visualize the number of distinct values per articles column as a bar chart
barplot(articles,"No.of Unique Values for Articles Data")

- We have data for more than 1 lakh (100,000) articles
- These articles belong to thousands of groups
- Articles are present in multiple colours
- In-depth analysis to follow

In [None]:
# print the value counts of the articles columns, skipping columns whose
# names contain 'no', 'code', 'id' or 'customer'
value_counts(articles)

- We have articles for kids, ladies and gents
- These items belong to different categories like sports, accessories, clothes etc.
- We will dig into these individual columns later in this notebook

### Let's dig into individual columns

In [None]:
# index_name: pie chart of the share of its five most frequent values across articles
pie_chart(articles,"index_name")

- Ladieswear forms the majority of the articles

In [None]:
# distribution of garment_group_name (y axis), coloured by index_group_name
histogram(articles,"garment_group_name","index_group_name")

- Jersey Fancy is the most frequent garment group, and it is mostly available for women and children.
- The 2nd most frequent garment group is Accessories, which is mostly available for ladies.

In [None]:
# distribution of index_group_name (y axis), coloured by index_name
histogram(articles,"index_group_name","index_name")

- As is visible from the chart above, Ladieswear has the categories Ladieswear/Lingerie/Tights/Ladies Accessories
- Baby/Children also has categories - Baby Size 50-98/Children size 92-140/Children Sizes 134-170/Children Accessories & Swimwear

In [None]:
# distribution of product_group_name (y axis), coloured by product_type_name
histogram(articles,"product_group_name","product_type_name")

- The Accessories and Garment Upper Body groups have many sub-categories

In [None]:
# done with the articles frame: drop the reference and trigger a garbage
# collection so its memory can be reclaimed before the next section
import gc
del articles
gc.collect()

# EDA on Customers

In [None]:
# print every customers column that contains nulls, together with its null count
print_missing_values(customers)

- We have missing values for 5 columns in customers dataset 

In [None]:
# plot missing values(commented due to memory issues)
#missing_values(customers)

In [None]:
# number of distinct values in each column of the customers dataset
customers.nunique()

- Customer id seems to be the primary key for the customers dataset
- FN & Active seem to have no variation in the dataset
- Let's dig into the individual fields in a while

In [None]:
# Age distribution: one bar per distinct age, bar height = number of customers
age_counts = customers["age"].value_counts()
fig = px.bar(age_counts, title="Age of customers")
age_layout_options = dict(
    xaxis_title="Age",
    yaxis_title="Frequency",
    title_x=0.5,       # centre the title horizontally
    showlegend=False,
)
fig.update_layout(**age_layout_options)
fig.show()

- The majority of the customers fall in the age range of 20-30 yrs

In [None]:
# visualize the number of distinct values per customers column as a bar chart
barplot(customers,"No.of Unique Values for Customers Data")

- Customer Id & postal code has max no. of unique values

In [None]:
# print the value counts of the customers columns, skipping columns whose
# names contain 'no', 'code', 'id' or 'customer'
value_counts(customers)

In [None]:
# number of customer rows per postal code (postal_code is skipped by the
# value_counts helper because its name contains 'code')
customers.postal_code.value_counts()

- A postal code with more than 1.2 lakh (120,000) records is weird; this cannot be a regular customer.

### Let's dig into individual columns

In [None]:
# FN: pie chart of the share of its most frequent values across customers
pie_chart(customers,"FN")

- No variation for FN

In [None]:
# Active: pie chart of the share of its most frequent values across customers
pie_chart(customers,"Active")

- No variation for "Active" field

In [None]:
# club_member_status: pie chart of the share of its most frequent values across customers
pie_chart(customers,"club_member_status")

- More than 90% of the customers are active

In [None]:
# fashion_news_frequency: pie chart of the share of its most frequent values across customers
pie_chart(customers,"fashion_news_frequency")

In [None]:
# done with the customers frame: drop the reference and trigger a garbage
# collection so its memory can be reclaimed before the transactions section
import gc
del customers
gc.collect()

# EDA - Transactions

In [None]:
# print every transactions column that contains nulls, together with its null count
print_missing_values(transactions)

- Transaction data does not have any missing values!

In [None]:
# plot missing values (commented due to memory issues)
#missing_values(transactions)

In [None]:
# number of distinct values in each column of the transactions dataset
transactions.nunique()

In [None]:
# visualize the number of distinct values per transactions column as a bar chart
barplot(transactions,"No.of Unique Values for Transaction Data")

- Customer ids have maximum no. of unique values followed by article id (quite obvious)

In [None]:
# sales_channel_id: pie chart of the share of transactions per channel
# (pie_chart keeps at most the five most frequent values)
pie_chart(transactions,"sales_channel_id")

- Channel id represents the mode of purchase, most of the transactions are done through channel id 2

In [None]:
# price distribution of all transactions as a single vertical box plot
sns.boxplot(y=transactions.price,color="red")
plt.xlabel("Box Plot")
plt.ylabel("Price")
plt.title("Box Plot - Price")
# render explicitly so the cell output is the figure only, not the
# Text(...) repr returned by plt.title() as the cell's last expression
plt.show()

In [None]:
# ten customers with the largest number of transaction rows
count = (
    transactions.groupby("customer_id")["customer_id"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)
data = [go.Bar(x=count.index, y=count.values)]
layout = dict(
    title="Top 10 customers(by no. of transactions)",
    xaxis=dict(title="Customer Id"),
    yaxis=dict(title="Total No. of Transactions"),
)
iplot(dict(data=data, layout=layout))

In [None]:
# ten articles that appear in the largest number of transaction rows
count = (
    transactions.groupby("article_id")["article_id"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)
# ids are converted to strings for the x axis - presumably so they are shown
# as category labels rather than numeric positions
data = [go.Bar(x=count.index.astype(str), y=count.values)]
layout = dict(
    title="Top 10 articles sold",
    xaxis=dict(title="Article Id"),
    yaxis=dict(title="No. of Articles sold"),
)
iplot(dict(data=data, layout=layout))

In [None]:
# ten customers with the largest summed price over their transactions
count = (
    transactions.groupby("customer_id")["price"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
data = [go.Bar(x=count.index, y=count.values)]
layout = dict(
    title="Top 10 Customers by amount of money spent",
    xaxis=dict(title="Customer Id"),
    yaxis=dict(title="Money Spent"),
)
iplot(dict(data=data, layout=layout))

In [None]:
# ten articles with the largest summed price over their transactions
count = (
    transactions.groupby("article_id")["price"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
# ids are converted to strings for the x axis - presumably so they are shown
# as category labels rather than numeric positions
data = [go.Bar(x=count.index.astype(str), y=count.values)]
layout = dict(
    title="Sales - Top 10 Items",
    xaxis=dict(title="Article Id"),
    yaxis=dict(title="Total Sales"),
)
iplot(dict(data=data, layout=layout))

# More EDA to come soon, please upvote if you liked this notebook!