## EDA on articles.csv

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
## Loading articles.csv
article_df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
article_df.head()

In [None]:
article_df.info()

In [None]:
## Data type
## All are nominal data
article_df.columns

In [None]:
## Dataset shape
article_df.shape

In [None]:
## Each "_no" / "_id" / "_code" column appears to be the key of a matching "_name" column.
## Drop those key columns so a model cannot read an ordering into arbitrary codes
## (e.g. treat code "2" as "more than" code "1"). "article_id" is the one identifier
## that is kept; it ends up as the last column.

# Suffix-anchored pattern: only columns that END in _no / _code / _id count as keys.
# (A bare 'id' would also match any column that merely contains the letters "id".)
key_columns = article_df.columns[article_df.columns.str.contains(r'(?:_no|_code|_id)$')]

# drop() returns a new frame, so the raw article_df is never mutated and no
# in-place edit is made on a slice of it.
article_df_no1 = article_df.drop(columns=key_columns)

article_df2 = pd.concat([article_df_no1, article_df['article_id']], axis=1)
article_df2

In [None]:
article_df2.info()

In [None]:
## Check for missing value

article_df2.isnull().sum()

In [None]:
## To keep the EDA simple, drop the free-text column "detail_desc"; the other columns
## already give almost the same description.
## After this drop there are no missing values left.

# Rebinding instead of inplace=True, plus errors='ignore', lets this cell be re-run
# without a KeyError once the column is already gone.
article_df2 = article_df2.drop(columns='detail_desc', errors='ignore')
article_df2.head(20)

In [None]:
## Three columns look redundant: 'colour_group_name', 'perceived_colour_value_name',
## 'perceived_colour_master_name'.
## Print the value counts of each to see whether they carry the same or similar
## information --> one may be deleted if they are similar.

colour_columns = ['colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name']
for colour_col in colour_columns:
    print(article_df2[colour_col].value_counts())

In [None]:
## All three columns describe the colour in a similar way.
## Possibly, when a customer searches, 'perceived_colour_value_name' +
## 'perceived_colour_master_name' together give 'colour_group_name',
## so 'colour_group_name' adds nothing and is dropped.

# Rebinding with errors='ignore' keeps the cell safe to re-run after the column is gone.
article_df2 = article_df2.drop(columns='colour_group_name', errors='ignore')
article_df2.head()

In [None]:
## See if "index_name" and "index_group_name" are redundant
for index_col in ('index_name', 'index_group_name'):
    print(article_df2[index_col].value_counts())

## See if "department_name" and "garment_group_name" are redundant
for group_col in ('department_name', 'garment_group_name'):
    print(article_df2[group_col].value_counts())

In [None]:
## 'garment_group_name' names things in a weird way --> remove it
# Rebinding with errors='ignore' keeps the cell safe to re-run after the column is gone.
article_df2 = article_df2.drop(columns='garment_group_name', errors='ignore')
article_df2.head()

In [None]:
## Check how many distinct values each column holds

# nunique() counts the distinct non-null values of every column in a single call
for col, p in article_df2.nunique().items():
    print(col, ':', p)

## There are 45875 products in the store.

In [None]:
## Check point

article_df3 = article_df2.copy()

In [None]:
article_df3['prod_name'].value_counts()

In [None]:
## Pie chart of the n most frequent values of a column (top 5 by default)
def piechart(data_set, n=5):
    """Draw a pie chart of the ``n`` most frequent values in ``data_set``.

    Parameters
    ----------
    data_set : pandas.Series
        Column whose values are counted with ``value_counts()``.
    n : int, default 5
        Maximum number of categories to plot. Fewer wedges are drawn when the
        column has fewer distinct values.

    Raises
    ------
    ValueError
        If ``data_set`` has no non-null value to count.

    Notes
    -----
    Only the top counts are handed to ``plt.pie``, so the percentages on the
    wedges are shares of the plotted top-``n`` total, not of the whole column.
    The chart is drawn on the current pyplot axes (a title set by the caller
    beforehand is kept) and then displayed with ``plt.show()``.
    """
    # Count once so that labels and wedge sizes come from the same ordering;
    # sorting a second copy separately could order tied counts differently.
    top_counts = data_set.value_counts().head(n)
    k = len(top_counts)
    if k == 0:
        raise ValueError("piechart needs at least one non-null value to plot")

    colors = sns.color_palette('pastel')[:k]
    # Same offsets as the fixed 5-wedge layout (0.1, 0.0, 0.01, ...), but sized to
    # the wedges actually drawn: plt.pie rejects an explode list of another length.
    explode = ([0.1, 0.0] + [0.01] * max(k - 2, 0))[:k]

    plt.pie(top_counts, labels=top_counts.index.tolist(), colors=colors,
            explode=explode, autopct='%.0f%%')
    plt.show()

In [None]:
column = article_df3.columns.tolist()
column

In [None]:
## Pie charts of the most frequent product types and product groups
product_type_column = ['product_type_name', 'product_group_name']
for col in product_type_column:
    # The title is set first; piechart() then draws the pie and calls plt.show()
    plt.title(col,fontsize=20, pad = 3.0)
    piechart(article_df3[col])

In [None]:
## Show one row whose product_type_name contains 'Dress' to see which product_group_name it falls under
dress = article_df3[article_df3['product_type_name'].str.contains('Dress')]
dress.head(1)

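From what we can see here, 'Garment upper body' constitutes almost half of the store, and the top 3 product types are Sweater, T-Shirt, and Top. 'Trouser' seems to be the most common product type for Garment Lower body. They also stock a lot of 'Dress' items, even though 'Garment Full Body' contributes only 14% of the total products. (Note: each pie shows only the five most frequent categories, so the percentages are shares of those five, not of the whole catalogue.)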

In [None]:
## Pie charts for the design of the product: graphical appearance and the two perceived-colour columns
product_type_column = [ 'graphical_appearance_name',
 'perceived_colour_value_name',
 'perceived_colour_master_name',]
for col in product_type_column:
    # The title is set first; piechart() then draws the pie and calls plt.show()
    plt.title(col,fontsize=20, pad = 3.0)
    piechart(article_df3[col])

The most common theme of the product is 'Dark' with 'Black' and 'Blue' dominating the product line. About 2/3 of the product line has no pattern.

In [None]:
## Pie charts for the categories of the product: index, index group and section
product_type_column = [
 'index_name',
 'index_group_name',
 'section_name']
for col in product_type_column:
    # The title is set first; piechart() then draws the pie and calls plt.show()
    plt.title(col,fontsize=20, pad = 3.0)
    piechart(article_df3[col])

Women's clothes make up most of the H&M product categories. From what we can see, Ladieswear and Baby/Children together constitute a little more than 2/3 of the store, with Ladieswear at 38%. The largest share of Baby/Children wear seems to be for 'Young Girl' and 'Kids Girl'.

## EDA on Customer.csv

In [None]:
# Allow up to 100 rows so the head(100) preview below is shown in full.
# The limit is kept finite on purpose: with no limit, accidentally displaying the
# whole customers frame would try to render every row.
pd.set_option("display.max_rows", 100)
cus_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/customers.csv')
cus_df.head(100)

- I could not find a way to decode 'postal_code' in the given format. The plan was to group customers by city or country, but since the codes cannot be read, 'postal_code' will be dropped.
- It seems that 'FN' indicates whether a customer receives the Fashion News newsletter.
    - Check whether 'fashion_news_frequency' has values other than 'Regularly' and 'NONE' -> if it has only these two, drop 'fashion_news_frequency', since FN is already binary.
- The values of both 'club_member_status' and 'Active' have to be checked first.

For an explanation of the columns, see:
https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/307001

In [None]:
cus_df.info()

In [None]:
## Check columns
column_cus = cus_df.columns
column_cus

In [None]:
# Check the shape (rows, columns) of the customers frame
cus_df.shape

In [None]:
## 'postal_code' is the only column dropped.
## drop() hands back a new frame, so the raw cus_df stays untouched.
cus_df2 = cus_df.drop(columns='postal_code')
cus_df2.head()


In [None]:
## Check for missing value
cus_df2.isnull().sum()

In [None]:
print(cus_df2['FN'].value_counts())
print(cus_df2['Active'].value_counts())

In [None]:
## Dealing with missing values: missing 'FN' / 'Active' entries are filled with 0
flag_columns = ['FN', 'Active']
cus_df2[flag_columns] = cus_df2[flag_columns].fillna(0)

In [None]:
print(cus_df2['club_member_status'].value_counts())
print('number of missing value =', cus_df2['club_member_status'].isnull().sum())

In [None]:
## Fill missing 'club_member_status' with the most frequent status (the mode)
club_status = cus_df2['club_member_status']
cus_df2['club_member_status'] = club_status.fillna(club_status.mode()[0])

# Re-check: value counts and the number of values still missing
print(cus_df2['club_member_status'].value_counts())
print('number of missing value =', cus_df2['club_member_status'].isnull().sum())

In [None]:
## Check for missing value and value type
print(cus_df2['fashion_news_frequency'].value_counts())
print('number of missing value =', cus_df2['fashion_news_frequency'].isnull().sum())

In [None]:
## Merge the redundant spellings 'None' and 'NONE' into 'NONE' ...
news_frequency = cus_df2['fashion_news_frequency'].replace('None', 'NONE')

## ... then fill the missing values with the mode of the merged column
cus_df2['fashion_news_frequency'] = news_frequency.fillna(news_frequency.mode()[0])

# Re-check: value counts and the number of values still missing
print(cus_df2['fashion_news_frequency'].value_counts())
print('number of missing value =', cus_df2['fashion_news_frequency'].isnull().sum())

In [None]:
## Checking range of 'age'
# Series.max()/min() skip missing values. The built-in max()/min() compare element
# by element, and every comparison with NaN is False, so on a column that still
# contains NaN their result depends on where the NaNs happen to sit.
max_age = cus_df2['age'].max()
min_age = cus_df2['age'].min()
print('max age is', max_age)
print('minimun age is', min_age)
print('range of "age" is', max_age - min_age)
## Check for missing value in 'age'
print('number of missing value =', cus_df2['age'].isnull().sum())

In [None]:
## A minimum age of 16 makes sense, but an age of 90 might not
## Plot a histogram of age; the author reads it as bimodal and skewed to the left
## NOTE(review): a long tail toward older ages would be a right (positive) skew — confirm against the plot

sns.histplot(cus_df2['age'])

In [None]:
## Share of rows with a missing 'age', to judge whether those rows can simply be deleted
## only a bit over 1% --> delete the rows (not the column)
# Computed from the raw customers frame instead of typed-in counts, so the figure
# follows the data and stays the same if this cell is re-run after the rows with a
# missing age have been removed from cus_df2.
per = cus_df['age'].isnull().mean() * 100
per

In [None]:
# Drop only the rows whose 'age' is missing. Naming the column keeps a NaN in any
# other column from silently removing additional rows (for example if this cell is
# run before the fill cells above).
cus_df2 = cus_df2.dropna(subset=['age'])
cus_df2.isnull().sum()

In [None]:
#Plotting boxplot
age = cus_df2['age']

plt.boxplot(age, vert=False)
plt.title("Detecting outliers using Boxplot")
# With vert=False the values run along the x-axis, so that axis shows age
plt.xlabel('age')
# show() renders the figure without echoing the Text object returned by xlabel()
plt.show()

In [None]:
#Finding outlier, upper bound, and lower bound
def detect_outliers_iqr(data, factor=1.5):
    """Return the values of ``data`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data``. Both fences are printed.

    Parameters
    ----------
    data : iterable of numbers
        Values to screen; they are sorted before being scanned.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences.

    Returns
    -------
    list
        The outlying values in ascending order. A fresh list is built on every
        call; an empty input returns an empty list without printing anything.
    """
    values = sorted(data)
    if not values:
        # No data means no quartiles to compute and nothing that can be an outlier
        return []

    q1 = np.percentile(values, 25)
    q3 = np.percentile(values, 75)

    iqr = q3 - q1
    lwr_bound = q1 - (factor * iqr)
    upr_bound = q3 + (factor * iqr)
    print("Upper bound is", upr_bound)
    print("Lower bound is", lwr_bound)

    # Built per call: appending to a shared module-level list would make every
    # further call return the outliers of all earlier calls as well.
    return [value for value in values if value < lwr_bound or value > upr_bound]
age_outliers = detect_outliers_iqr(age)
# The returned list holds one entry per outlying row, so report how many there are
# and which distinct ages occur instead of printing every entry.
print("Number of outliers from IQR method: ", len(age_outliers))
print("Outliers from IQR method: ", sorted(set(age_outliers)))
# min() raises on an empty list, so only report it when outliers were found
if age_outliers:
    print("Min value of outliers from IQR method: ", min(age_outliers))

In [None]:
## Put a cap on the maximum age --> every age above the 95th percentile is set to the 95th percentile
## Finding 95th percentile first
ninety_fifth_percentile = np.percentile(age, 95)
print("95th percentile is", ninety_fifth_percentile, "years old")

# Cap with the value just computed rather than a typed-in number, so the cap always
# matches the printed percentile even if the data changes.
cus_df2['age'] = cus_df2['age'].clip(upper=ninety_fifth_percentile)

In [None]:
## Check if there are still outliers
plt.boxplot(cus_df2['age'], vert=False)
plt.title("Detecting outliers using Boxplot")
# With vert=False the values run along the x-axis, so that axis shows age
plt.xlabel('age')
# show() renders the figure without echoing the Text object returned by xlabel()
plt.show()

In [None]:
## Checkpoint 
cus_df3 = cus_df2.copy()

In [None]:
cus_df3.head(10)

In [None]:
## Pie chart of the share of every distinct value in a column
def piechart2(data_set):
    """Draw a pie chart with one wedge per distinct value of ``data_set``.

    ``data_set`` is counted with ``value_counts()``; the wedges are labelled with
    the values and annotated with whole-number percentages. Colours are taken from
    the first five entries of seaborn's 'bright' palette. The pie is drawn on the
    current pyplot axes; ``plt.show()`` is left to the caller.
    """
    counts = data_set.value_counts()
    wedge_labels = counts.index.tolist()
    palette = sns.color_palette('bright')[:5]

    plt.pie(counts, labels=wedge_labels, colors=palette, autopct='%.0f%%')

In [None]:
FN = cus_df3['FN']
Active = cus_df3['Active']
club_member = cus_df3['club_member_status']
fashion_news = cus_df3['fashion_news_frequency']

font_size = 10

# One entry per subplot, in row-major order: (title, column, percentage format)
panels = [
    ('FN', FN, '%1.1f%%'),
    ('Active', Active, '%.0f%%'),
    ('club_member', club_member, '%.0f%%'),
    ('fashion_news', fashion_news, '%.0f%%'),
]

fig, axs = plt.subplots(2, 2, figsize=(8, 8))

for ax, (title, series, pct_format) in zip(axs.flat, panels):
    counts = series.value_counts()
    # Default radius: an oversized radius makes the pies spill over their axes
    ax.pie(counts, labels=counts.index.tolist(), autopct=pct_format, shadow=True)
    # Title each axes explicitly; plt.title() would only label the current axes,
    # which is not the top-left panel after plt.subplots()
    ax.set_title(title, fontsize=font_size)

# Let matplotlib space the panels so labels and titles do not overlap
fig.tight_layout()
plt.show()


In [None]:
fig = plt.figure(figsize=(5,5), dpi=100)

# 2 rows x 2 columns, filled row by row: (grid position, column, panel title)
grid_panels = [
    ((0, 0), FN, 'FN'),
    ((0, 1), Active, 'Active'),
    ((1, 0), club_member, 'club_member'),
    ((1, 1), fashion_news, 'fashion_news'),
]

for grid_position, series, panel_title in grid_panels:
    # subplot2grid makes the new axes current, so piechart2() and plt.title()
    # both act on this panel
    ax1 = plt.subplot2grid((2,2), grid_position)
    piechart2(series)
    plt.title(panel_title)

# show() renders the figure without echoing the Text object returned by plt.title()
plt.show()