In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats

import vaex

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.compose       import ColumnTransformer

import warnings


# These are the 4 csv files' paths:
# 1. /kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv
# 2. /kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv
# 3. /kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv
# 4. /kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv

# And this is one of the images path:
# 1. /kaggle/input/h-and-m-personalized-fashion-recommendations/images/057/0570177001.jpg


# Silence the pandas warning raised by chained assignments on dataframes
pd.set_option("mode.chained_assignment", None)
# Never truncate the column list when a dataframe is displayed
pd.set_option("display.max_columns", None)
# Suppress every other warning for the rest of the session
warnings.filterwarnings("ignore")

# <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1. EDA</span>

## <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.1 "article" csv file</span>

**<span style="color:#023e8a">Details of the `articles` csv file:</span>**
- **<span style="color:#B016E4">article_id</span>**<span style="color:#023e8a;">: **A unique identifier of every article.**</span>
- **<span style="color:#B016E4">product_code, prod_name</span>**<span style="color:#023e8a;">: **A unique identifier of every product and its name (the two are not the same).**</span>  
- **<span style="color:#B016E4">product_type, product_type_name</span>**<span style="color:#023e8a;">: **The group the product_code belongs to and its name.**</span>  
- **<span style="color:#B016E4">graphical_appearance_no, graphical_appearance_name</span>**<span style="color:#023e8a;">: **The group of graphics and its name.**</span>  
- **<span style="color:#B016E4">colour_group_code, colour_group_name</span>**<span style="color:#023e8a;">: **The group of colour and its name.**</span>  
- **<span style="color:#B016E4">graphical_appearance_no, graphical_appearance_name</span>**<span style="color:#023e8a;">: **The group of graphics and its name.**</span>  
- **<span style="color:#B016E4">perceived_colour_value_id, perceived_colour_value_name, perceived_colour_master_id, perceived_colour_master_name</span>**<span style="color:#023e8a;">: **Additional colour information.**</span>  
- **<span style="color:#B016E4">department_no, department_name</span>**<span style="color:#023e8a;">: **A unique identifier of every department and its name.**</span>  
- **<span style="color:#B016E4">index_code, index_name</span>**<span style="color:#023e8a;">: **A unique identifier of every index and its name.**</span>  
- **<span style="color:#B016E4">index_group_no, index_group_name</span>**<span style="color:#023e8a;">: **A group of indices and its name.**</span>  
- **<span style="color:#B016E4">section_no, section_name</span>**<span style="color:#023e8a;">: **A unique identifier of every section and its name.**</span>  
- **<span style="color:#B016E4">garment_group_no, garment_group_name</span>**<span style="color:#023e8a;">: **A unique identifier of every garment group and its name.**</span>  
- **<span style="color:#B016E4">detail_desc</span>**<span style="color:#023e8a;">: **A short description of the article.**</span>  

In [None]:
# Location of the article catalogue inside the Kaggle input directory
articles_csv_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv"

# Read the catalogue into a vaex DataFrame and report its dimensions
df_articles = vaex.from_csv(articles_csv_path)
print(f"Shape of the articles dataset: {df_articles.shape}")

# Preview three randomly drawn rows (no seed is set, so the rows differ between runs)
df_articles.sample(n=3)

## <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2 "transaction" csv file</span>

**<span style="color:#023e8a">Details of the `transactions` csv file:</span>**
- **<span style="color:#B016E4">t_dat</span>**<span style="color:#023e8a;">: **Transaction date.**</span>
- **<span style="color:#B016E4">customer_id</span>**<span style="color:#023e8a;">: **A unique identifier of every customer.**</span>  
- **<span style="color:#B016E4">article_id</span>**<span style="color:#023e8a;">: **A unique identifier of the article (from the `articles` dataframe) that the customer bought.**</span>  
- **<span style="color:#B016E4">price</span>**<span style="color:#023e8a;">: **The amount of money the customer spent on the article.**</span>
- **<span style="color:#B016E4">sales_channel_id</span>**<span style="color:#023e8a;">: **1 or 2.**</span>

In [None]:
# Location of the transaction log inside the Kaggle input directory
transactions_csv_path = "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv"
# Explicit narrow dtypes for the numeric columns, forwarded to the CSV reader
transaction_dtypes = {"sales_channel_id": "int8", "article_id": "int32", "price": "float32"}

# Read the transactions into a vaex DataFrame and report its dimensions
df_transaction = vaex.from_csv(transactions_csv_path, dtype=transaction_dtypes)
print(f"Shape of transaction dataset: {df_transaction.shape}")

# Preview five randomly drawn rows (no seed is set, so the rows differ between runs)
df_transaction.sample(n=5)

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.1 Check whether there are any missing values</span>

In [None]:
def vaex_is_null(df):
    """Count the missing values in each column of ``df``.

    Parameters
    ----------
    df
        Object exposing ``column_names`` and column access via ``df[name]``,
        where each column supports ``isna().sum().item()`` (in this notebook
        a vaex DataFrame).

    Returns
    -------
    pd.Series
        Number of missing values per column, indexed by column name and
        sorted in ascending order.
    """
    columns = df.column_names
    # .item() turns each aggregated count into a plain Python scalar
    missing_per_column = [df[name].isna().sum().item() for name in columns]
    null_counts = pd.Series(data=missing_per_column, index=columns)
    return null_counts.sort_values(ascending=True)

In [None]:
# Missing-value count per column of the transactions frame (ascending)
vaex_is_null(df=df_transaction)


### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.2 Statistical Analysis</span>
    
**<span style="color:#023e8a;">For statistical analysis, we can only use the "price" column.</span>**

In [None]:
# summary of the whole transactions frame; the "price" column is the one
# of interest for the statistical analysis (no number formatting is applied here)
df_transaction.describe()

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.2 Distribution of "price" column</span>

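**<span style="color:#023e8a;">According to the Freedman-Diaconis rule, the bin width $h$ of a histogram built from $n$ observations with interquartile range $IQR$ is:</span>**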

<span style="color:#023e8a; font-size:2em">
$$h = 2\frac{IQR}{\sqrt[3]{n}}$$
</span>

**<span style="color:#023e8a;">And then, the number of bins (k) in a histogram should be:</span>**

<span style="color:#023e8a; font-size:2em">
    $$k = \frac{max(x) - min(x)}{h}$$
</span>

In [None]:
def calculate_number_of_bins(data: "vaex.expression.Expression") -> int:
    """Suggest a histogram bin count using the Freedman-Diaconis rule.

    The bin width is ``h = 2 * IQR / cbrt(n)`` and the number of bins is
    ``k = (max - min) / h``, truncated to an integer.

    Parameters
    ----------
    data
        Column-like object exposing ``evaluate()``, ``shape``, ``max()`` and
        ``min()`` (in this notebook a vaex expression such as
        ``df_transaction["price"]``).

    Returns
    -------
    int
        Suggested number of bins, never less than 1. Falls back to 1 when
        the column is empty or when the bin width is not a positive number
        (for example when the interquartile range is zero).
    """
    # total number of records
    n = data.shape[0]
    if n == 0:
        return 1

    # materialise the column once and take both quartiles in a single pass
    values = data.evaluate()
    q1, q3 = np.quantile(values, [0.25, 0.75])
    iqr = q3 - q1

    # Freedman-Diaconis bin width
    h = 2 * (iqr / np.cbrt(n))
    # `not h > 0` also catches NaN, which would otherwise break int() below
    if not h > 0:
        return 1

    # the range must come from `data` itself, not from a fixed global column,
    # so that the result is correct for every column passed in
    k = (data.max() - data.min()) / h
    return max(int(k), 1)

In [None]:
# Density curve of the raw "price" column
plt.figure(figsize=(16, 9))
sns.set_style("darkgrid")
price_column = df_transaction["price"]
# NOTE(review): with hist=False no histogram is drawn, so `bins` presumably
# has no visible effect on the figure — confirm against the seaborn docs
price_density_ax = sns.distplot(
    price_column.evaluate(),
    hist=False,
    color="#16E437",
    bins=calculate_number_of_bins(price_column),
)
price_density_ax.set_xlabel("Price", fontsize=16)
price_density_ax.set_ylabel("Density", fontsize=16)
price_density_ax.set_title("Distribution of the \"price\" column", fontsize=16)
plt.show()

**<span style="color:#023e8a;">From the distribution, we can see that the "price" column is right-skewed. It is not a unimodal distribution: there is more than one peak. Now let's see how much the distribution differs from the normal distribution using a `QQ plot`.</span>**

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.3 QQ Plot of "price" column</span>

In [None]:
# QQ plot of the "price" column (scipy's probplot compares against the
# normal distribution unless another `dist` is given)
plt.figure(figsize=(16, 9))
# plot=plt makes probplot draw onto the current pyplot figure;
# the returned fit information is kept in `x` as before
x = stats.probplot(df_transaction["price"].evaluate(), plot=plt)
# the x-axis of a QQ plot shows theoretical quantiles
plt.xlabel("Theoretical quantiles", fontsize=16)
plt.ylabel("Ordered Values", fontsize=16)
plt.title("QQ Plot of \"price\" column", fontsize=16)
plt.show()

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.4 BoxPlot of "price" column</span>

In [None]:
# Box plot summarising the spread of the "price" column
plt.figure(figsize=(16, 9))
price_boxplot_ax = sns.boxplot(x=df_transaction["price"].evaluate(), color="#B616E4")
price_boxplot_ax.set_title('BoxPlot of "price" column', fontsize=16)
price_boxplot_ax.set_xlabel("Price", fontsize=16)
plt.show()

**<span style="color:#023e8a;">Because the "price" column is highly right-skewed, a large number of points are flagged as outliers. Since the distribution is not normal, the outliers can be removed using the IQR method. Before doing so, we first try to transform the column towards a normal distribution.</span>**

**<span style="color:#023e8a;">For a skewed distribution, the following 3 methods are commonly used to bring it closer to a normal distribution:</span>**
- **<span style="color:#023e8a;">Log Transform</span>**
- **<span style="color:#023e8a;">Square Transform</span>**
- **<span style="color:#023e8a;">Box-Cox Transform</span>**

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.5 Log transformation of "price" column</span>

In [None]:
# log(1 + x) transformation, wrapped in a scikit-learn transformer
log_transformer = FunctionTransformer(np.log1p)

# store the transformed values as a new column of the transactions frame
raw_price = df_transaction["price"]
df_transaction["price_log_transform"] = log_transformer.fit_transform(raw_price)

In [None]:
# Density curve of the log-transformed "price" column
plt.figure(figsize=(16, 9))
sns.set_style("darkgrid")
log_price_column = df_transaction["price_log_transform"]
# NOTE(review): with hist=False no histogram is drawn, so `bins` presumably
# has no visible effect on the figure — confirm against the seaborn docs
log_price_ax = sns.distplot(
    log_price_column.evaluate(),
    hist=False,
    color="#16E437",
    bins=calculate_number_of_bins(log_price_column),
)
log_price_ax.set_xlabel("Price (log transformed)", fontsize=16)
log_price_ax.set_ylabel("Density", fontsize=16)
log_price_ax.set_title("Distribution of the log transformed \"price\" column", fontsize=16)
plt.show()

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.6 Square transformation of "price" column</span>

In [None]:
# element-wise square, wrapped in a scikit-learn transformer
# NOTE(review): squaring stretches large values and so tends to increase
# right skew; a square-root transform may have been intended — confirm
square_transformer = FunctionTransformer(np.square)

# store the transformed values as a new column of the transactions frame
price_to_square = df_transaction["price"]
df_transaction["price_square_transform"] = square_transformer.fit_transform(price_to_square)

In [None]:
# Density curve of the square-transformed "price" column
plt.figure(figsize=(16, 9))
sns.set_style("darkgrid")
square_price_ax = sns.distplot(
    df_transaction["price_square_transform"].evaluate(),
    hist=False,
    color="#16E437",
)
square_price_ax.set_xlabel("Price (square transformed)", fontsize=16)
square_price_ax.set_ylabel("Density", fontsize=16)
square_price_ax.set_title("Distribution of the square transformed \"price\" column", fontsize=16)
plt.show()

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.7 Box-Cox transformation of "price" column</span>

In [None]:
# Box-Cox power transformation fitted on the "price" column
# NOTE(review): Box-Cox is only defined for strictly positive values —
# presumably every price is > 0; confirm
box_cox_transformer = PowerTransformer(method="box-cox")

# the transformer expects a 2-D input, hence the reshape to one column of n rows
df_transaction["price_box_cox_transform"] = box_cox_transformer.fit_transform(
    df_transaction["price"].to_numpy().reshape(-1, 1)
)

In [None]:
# Density curve of the Box-Cox transformed "price" column
plt.figure(figsize=(16, 9))
sns.set_style("darkgrid")
sns.distplot(df_transaction["price_box_cox_transform"].evaluate(), hist=False, color="#16E437")
plt.xlabel("Price (box-cox transformed)", fontsize=16)
plt.ylabel("Density", fontsize=16)
# the title names the transformation, matching the log and square plots
plt.title("Distribution of the box-cox transformed \"price\" column", fontsize=16)
plt.show()

**<span style="color:#023e8a;">From the results of the 3 transformations, we can see that the Box-Cox transformation performs better than the other 2 transformations.</span>**

### <span style="color:white; background-color:#B616E4; padding:5px; border-radius:7px">1.2.8 Top 30 customers by number of transactions</span>

**<span style="color:#023e8a;">First I apply the `value_counts` method to the "customer_id" column. It returns the number of transactions per customer in descending order. For ease of use, I convert the result into a pandas DataFrame. For better visualization, I replace the long unique customer_id values on the x-axis with their index (rank) values.</span>**

In [None]:
# Number of transactions per customer, reshaped into a tidy pandas frame:
# the counts are copied into "counts", the ids (held in the index) into
# "customer_id", and the index is then replaced by a plain 0..n-1 range
df_transactions_value_counts = (
    pd.DataFrame(df_transaction["customer_id"].value_counts())
    .assign(
        counts=lambda frame: frame.iloc[:, 0],
        customer_id=lambda frame: frame.index,
    )
    .reset_index(drop=True)
)

In [None]:
# visualise the top customers by number of transactions
have_to_display = 30 # define how many customers I want to plot

# rows are taken in the order stored in df_transactions_value_counts
top_customers = df_transactions_value_counts.iloc[:have_to_display]
# actual number of bars; smaller than have_to_display if there are fewer customers
shown_customers = len(top_customers)

plt.figure(figsize=(16, 9))
sns.barplot(
    x=top_customers["customer_id"],
    y=top_customers["counts"]
)
# the title is derived from the number of bars so it cannot drift from the plot
plt.title(f"Top {shown_customers} customers by number of transactions", fontsize=16)
plt.xlabel("Customer ID", fontsize=16)
plt.ylabel("Total transactions", fontsize=16)
# replace the long customer ids on the x-axis with their rank for readability
rank_ticks = list(range(shown_customers))
plt.xticks(ticks=rank_ticks, labels=rank_ticks, rotation=45, fontsize=16)
plt.yticks(fontsize=16)
plt.show()