# An EDA notebook

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline

In [None]:
# Input directory of the competition data and the image folder name
# (the latter is not used in the cells shown here).
input_path = '../input/h-and-m-personalized-fashion-recommendations'
image_paths = 'images/'

# Read the three CSV tables in one pass; the file names are listed in the
# same order as the variables on the left-hand side.
df_articles, df_customers, transactions_train = (
    pd.read_csv(os.path.join(input_path, csv_name))
    for csv_name in ('articles.csv', 'customers.csv', 'transactions_train.csv')
)

# EDA of customers.csv

In [None]:
df_customers.head()

## Check data types
Check what kind of data is in each column and count the nulls.

In [None]:
# Show the distinct values of each categorical column so that NaN markers
# and inconsistent spellings are visible before any cleaning is done.
print('FN uniques are:', df_customers['FN'].unique())
print('Active uniques are:', df_customers['Active'].unique())
print('club_member_status uniques are:', df_customers['club_member_status'].unique())
print('fashion_news_frequency uniques are:', df_customers['fashion_news_frequency'].unique())

# Count the missing ages once and reuse the result for both lines below.
age_null_count = df_customers['age'].isnull().sum()
customer_count = len(df_customers)

# The second number is the total row count (not the non-null count), so the
# label says "total" to match what is actually printed.
print('null/total count in ages are:', age_null_count, '/', customer_count)
print('percentage where ages are null are', age_null_count/customer_count*100, '%')

## Fill NaN
Fill NaN values for visualization.

In [None]:
# The two flag columns use 0 as the value for "missing".
for flag_column in ('FN', 'Active'):
    df_customers[flag_column] = df_customers[flag_column].fillna(0)

# Missing membership status becomes its own category.
df_customers['club_member_status'] = df_customers['club_member_status'].fillna('NON_MEMBER')

# Missing news frequency becomes 'NONE'; the differently-cased 'None' is
# folded into the same category.
news_frequency = df_customers['fashion_news_frequency'].fillna('NONE')
df_customers['fashion_news_frequency'] = news_frequency.replace('None', 'NONE')

# Keep only the rows that still have no missing value in any column, then
# display how many rows remain.
df_customer = df_customers.dropna()
len(df_customer['age'])

## Plotting a simple histogram of ages

In [None]:
plt.title("histogram of age")
plt.xlabel("age")
plt.ylabel("number of people")
# Use the same 9 bins over [10, 100] as the np.histogram call that produces
# `age_bins` for the stacked bar plots, so this histogram and those plots
# bucket customers identically. The earlier range=(0, 90) excluded every
# age above 90, because plt.hist ignores values outside `range`.
plt.hist(df_customer["age"], bins=9, range=(10, 100))

In [None]:
# Bin the ages of the NaN-free customer table into 9 equal-width bins over
# [10, 100]. `age_bins` receives the bin edges (one more value than there are
# bins) and is reused below as loop bounds and as the index of the plot
# tables; `age_hist` is not used in the cells shown here.
age_hist, age_bins = np.histogram(df_customer['age'], bins=9, range=(10,100))

## Creating data for plotting colored bar plots
- count unique FN
- count unique active
- count unique club member status
- count unique fashion news frequency

In [None]:
def _count_equal(frame, column, value):
    """Return the number of rows in ``frame`` whose ``column`` equals ``value``."""
    return int((frame[column] == value).sum())


# Per-bin customer counts. Every list receives exactly one entry per element
# of `age_bins`, so each list can later be paired with `index=age_bins`.

# FN
fn_0_ages = []
fn_1_ages = []

# Active
active_1_ages = []
active_0_ages = []

# club member status => cms
cms_active_ages = []
cms_nonmember_ages = []
cms_precreate_ages = []
cms_leftclub_ages = []

# fashion_news_frequency => fnf
fnf_regulary_age = []
fnf_monthly_age = []
fnf_none_age = []

ages = df_customer['age']
last_edge = len(age_bins) - 1

for i, age in enumerate(age_bins):
    # `age` is the upper edge of the current bin.
    if i == 0:
        # Everything strictly below the first edge.
        in_bin = ages < age
    else:
        lower_edge = age_bins[i - 1]
        # Bins are half-open, [lower, upper), so a customer whose age sits
        # exactly on an edge (20, 30, ...) is counted in exactly one bin.
        # The previous strict `lower < age < upper` test dropped them.
        # The final bin also includes its upper edge, which is how
        # np.histogram treats its last bin.
        if i == last_edge:
            in_bin = (ages >= lower_edge) & (ages <= age)
        else:
            in_bin = (ages >= lower_edge) & (ages < age)

    df_bin = df_customer[in_bin]

    fn_0_ages.append(_count_equal(df_bin, 'FN', 0))
    fn_1_ages.append(_count_equal(df_bin, 'FN', 1))

    active_1_ages.append(_count_equal(df_bin, 'Active', 1))
    active_0_ages.append(_count_equal(df_bin, 'Active', 0))

    cms_active_ages.append(_count_equal(df_bin, 'club_member_status', 'ACTIVE'))
    cms_nonmember_ages.append(_count_equal(df_bin, 'club_member_status', 'NON_MEMBER'))
    cms_precreate_ages.append(_count_equal(df_bin, 'club_member_status', 'PRE-CREATE'))
    cms_leftclub_ages.append(_count_equal(df_bin, 'club_member_status', 'LEFT CLUB'))

    fnf_regulary_age.append(_count_equal(df_bin, 'fashion_news_frequency', 'Regularly'))
    fnf_monthly_age.append(_count_equal(df_bin, 'fashion_news_frequency', 'Monthly'))
    fnf_none_age.append(_count_equal(df_bin, 'fashion_news_frequency', 'NONE'))

In [None]:
# One table per customer attribute: rows are indexed by the age-bin edges,
# columns are the attribute values, cells are the customer counts collected
# in the per-bin lists. Column order determines the stacking order in the
# bar plots.

# FN flag
df_plot_fn = pd.DataFrame(
    data={"0": fn_0_ages, "1": fn_1_ages},
    index=age_bins,
)

# Active flag
df_plot_active = pd.DataFrame(
    data={"0": active_0_ages, "1": active_1_ages},
    index=age_bins,
)

# Club member status
cms_columns = {
    "active": cms_active_ages,
    "non-member": cms_nonmember_ages,
    "pre-create": cms_precreate_ages,
    "left-club": cms_leftclub_ages,
}
df_plot_cms = pd.DataFrame(data=cms_columns, index=age_bins)

# Fashion news frequency
fnf_columns = {
    "regulary": fnf_regulary_age,
    "monthly": fnf_monthly_age,
    "none": fnf_none_age,
}
df_plot_fnf = pd.DataFrame(data=fnf_columns, index=age_bins)

In [None]:
# Stacked bars: one bar per age bin, split by FN value. The labels are set
# on the Axes object returned by the plot call.
ax = df_plot_fn.plot.bar(stacked=True)
ax.set_title("Bar plot of FN actives and ages")
ax.set_xlabel("age")
ax.set_ylabel("number of people")

In [None]:
# Stacked bars: one bar per age bin, split by Active value. The labels are
# set on the Axes object returned by the plot call.
ax = df_plot_active.plot.bar(stacked=True)
ax.set_title("Bar plot of active and ages")
ax.set_xlabel("age")
ax.set_ylabel("number of people")

In [None]:
# Stacked bars: one bar per age bin, split by club member status.
df_plot_cms.plot(kind="bar", stacked=True)
# This table holds club member status counts only, so the title no longer
# mentions FN (which has its own plot).
plt.title("Bar plot of club member status and ages")
plt.xlabel("age")
plt.ylabel("number of people")

In [None]:
# Stacked bars: one bar per age bin, split by fashion news frequency. The
# labels are set on the Axes object returned by the plot call.
ax = df_plot_fnf.plot.bar(stacked=True)
ax.set_title("Bar plot of Fashion new frequency and ages")
ax.set_xlabel("age")
ax.set_ylabel("number of people")

# EDA of articles.csv

In [None]:
df_articles.head()

In [None]:
df_articles.columns

In [None]:
def show_uniques(input_df, column_name):
    """Print how many distinct values a column has, followed by the values.

    Args:
        input_df: Table that contains the column to inspect.
        column_name: Label of the column whose distinct values are reported.

    Returns:
        None. The report is written to standard output.
    """
    distinct_values = input_df[column_name].unique()
    print(
        f"{len(distinct_values)} uniques vales in columns '{column_name}', "
        f"the unique values are {distinct_values}"
    )

### values of product type name

In [None]:
show_uniques(df_articles, "product_type_name")

### values of product group name

In [None]:
show_uniques(df_articles, "product_group_name")

### values of graphical appearance name

In [None]:
show_uniques(df_articles, "graphical_appearance_name")

### values of colour group name

In [None]:
show_uniques(df_articles, "colour_group_name")

### values of perceived colour master name

In [None]:
show_uniques(df_articles, "perceived_colour_master_name")

### values of department name

In [None]:
show_uniques(df_articles, "department_name")

### values of index name

In [None]:
show_uniques(df_articles, "index_name")

### values of index group name

In [None]:
show_uniques(df_articles, "index_group_name")

### values of section name

In [None]:
show_uniques(df_articles, "section_name")

### values of garment group name

In [None]:
show_uniques(df_articles, "garment_group_name")