# Product Segmentation for Retail

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import shapiro

## Import Data & Processing

In [None]:
# Load the sales table into a DataFrame

df = pd.read_csv(filepath_or_buffer='../input/m5-forecasting-accuracy/sales_train_evaluation.csv')

In [None]:
# Column labels of the sales table
df.columns

In [None]:
# The 'id' column as loaded from the csv
df['id']

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Parameters derived from the 'sales_train_evaluation' csv file

# Identifier columns of each series (they are used as the index further down)
COLS_ITM = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Scope of the study: the first N_DAYS daily columns, 'd_1' .. 'd_365'.
# The range stops at N_DAYS inclusive, so the window is 365 days (not 366).
N_DAYS = 365
COLS_DATE = [f'd_{day}' for day in range(1, N_DAYS + 1)]

# Rebuild 'id' as '<store_id>-<item_id>' so it matches the key built for the
# price dataframe (in file 'sell_prices' id is not available as a feature)
df['id'] = df['store_id'] + '-' + df['item_id']

In [None]:
# Column labels after rebuilding 'id'
df.columns

In [None]:
# Index the rows by the identifier columns, then keep only the daily sales
# columns listed in COLS_DATE (the scope of the study)

df = df.set_index(COLS_ITM)[COLS_DATE]

In [None]:
# Column labels after restricting to the study window
df.columns

In [None]:
# Load the calendar table

df_calendar = pd.read_csv(filepath_or_buffer='../input/m5-forecasting-accuracy/calendar.csv')

# Map every day label (column 'd') to its week code (column 'wm_yr_wk');
# the week code is what the sale prices are keyed on

dict_week = {
    day: week
    for day, week in zip(df_calendar['d'].values, df_calendar['wm_yr_wk'].values)
}

In [None]:
# Preview the first five calendar rows
df_calendar.head(n=5)

In [None]:
# Display the day label -> week code mapping built from the calendar
dict_week

In [None]:
# Goal: a pricing matrix M(n, p) with n = SKU and p = week

# Load the selling prices

df_price = pd.read_csv(filepath_or_buffer='../input/m5-forecasting-accuracy/sell_prices.csv')

# Unique SKU key of the form '<store_id>-<item_id>'

df_price['item_store_id'] = df_price['store_id'] + '-' + df_price['item_id']

df_price.head(n=5)

In [None]:
# Number of rows in the long-format price table
print(f"{len(df_price):,} records for sales price")

In [None]:
# Column labels of the price table
df_price.columns

In [None]:
# Reshape prices to wide form: one row per SKU key, one column per week code.
# SKU/week combinations without a recorded price are filled with 0.

df_price = (
    df_price
    .pivot(index='item_store_id', columns='wm_yr_wk', values='sell_price')
    .fillna(value=0)
)

df_price.head(n=5)

In [None]:
# Plain NumPy array holding the values of the pivoted price table

matrix_price = df_price.values

In [None]:
# Display the price matrix (rows follow df_price.index, columns follow df_price.columns)
matrix_price

In [None]:
# Lookup tables from labels to positions in the price matrix

# SKU key -> row position
pr_n = {sku: row for row, sku in enumerate(df_price.index)}

# Week code -> column position
pr_p = {week: col for col, week in enumerate(df_price.columns)}

In [None]:
# Display the SKU key -> row position lookup
pr_n

In [None]:
# Display the week code -> column position lookup
pr_p

In [None]:
# Number of rows (SKU keys) in the pivoted price table
print(f"{len(df_price):,} records for sales price pivot")

## Statistical Analysis

In [None]:
# Mean of the daily sales over the study window (the COLS_DATE columns)

df['mean'] = df.loc[:, COLS_DATE].mean(axis=1)

# Standard deviation of the daily sales over the same window

df['std'] = df.loc[:, COLS_DATE].std(axis=1)

# Remove the items that were not sold during the window (mean of 0)

print(f"{len(df):,} records for the full scope")

df = df.loc[df['mean'].gt(0)]

print(f"{len(df):,} records for after filter")

# Move the identifier columns back from the index to regular columns

df = df.reset_index()

# Verify that the rows with zero sales are gone and that the mean and
# standard deviation columns were added

df.head(n=5)

## Turnover, Sales Units

#### Turnover = Price x Volume

#### Product turnover, or inventory turnover, measures how quickly a company sells the products (inventory) it has on hand. In this notebook, the turnover of a product is computed as its sales value: units sold x selling price.

In [None]:
# Total units sold per SKU over the study window

df['units'] = df[COLS_DATE].sum(axis=1)

# Turnover = sum over days of (units sold that day x price of that day's week).
# The prices are gathered in one vectorised lookup instead of a row-wise
# apply repeated for every day column.

# Row of the price matrix for each SKU; an id missing from pr_n raises KeyError
sku_rows = np.array([pr_n[sku] for sku in df['id']], dtype=np.intp)

# Column of the price matrix for each day of the window
week_cols = np.array([pr_p[dict_week[day]] for day in COLS_DATE], dtype=np.intp)

# daily_prices[i, j] = price of SKU i during the week that contains day j
daily_prices = matrix_price[sku_rows[:, None], week_cols]

df['TO'] = (df[COLS_DATE].to_numpy() * daily_prices).sum(axis=1)

# Verify the updated columns

df.head(10)

## Coefficient of Variation


#### CV = sigma / mu = Standard deviation / Mean

In [None]:
# Coefficient of variation per SKU: standard deviation divided by mean
df['CV'] = df['std'].div(df['mean'])

df.head(n=5)

## Segmentation

#### ABC Analysis + Demand Variability

In [None]:
# Focus on a single category (HOBBIES). The per-SKU 'mean', 'std' and 'CV'
# columns are dropped here; drop() already returns a new frame, so no extra
# copy is needed.

df_abc = df[df['cat_id'] == 'HOBBIES'].drop(columns=['mean', 'std', 'CV'])

# Aggregate to one row per item. Only numeric columns are summed, so the
# string identifiers ('id', 'store_id', 'state_id') are left out instead of
# being concatenated.

df_abc = df_abc.groupby(['item_id', 'dept_id', 'cat_id']).sum(numeric_only=True)

In [None]:
# Preview the aggregated HOBBIES table
df_abc.head(n=5)

In [None]:
# Item-level statistics: mean, standard deviation and CV

# Mean of the daily sales

df_abc['mean'] = df_abc.loc[:, COLS_DATE].mean(axis=1)

# Filter out the references without sales

df_abc = df_abc.loc[df_abc['mean'].gt(0)]

# Standard deviation of the daily sales

df_abc['std'] = df_abc.loc[:, COLS_DATE].std(axis=1)

# Coefficient of variation = standard deviation / mean

df_abc['CV'] = df_abc['std'].div(df_abc['mean'])

# Turn the group keys back into regular columns

df_abc = df_abc.reset_index()

In [None]:
# Preview the table with the item-level statistics
df_abc.head(n=5)

In [None]:
# Normality test (Shapiro-Wilk) on the daily sales of every item


def _shapiro_pvalue(daily_sales):
    """Return the second element (the p-value) of the Shapiro-Wilk result."""
    _, p_value = stats.shapiro(daily_sales)
    return p_value


# Items whose p-value is below alpha are flagged as not normal
alpha = 0.001
df_abc['Normality_p'] = df_abc[COLS_DATE].apply(_shapiro_pvalue, axis=1)
df_abc['Not_Normal'] = df_abc['Normality_p'].lt(alpha)

In [None]:
# Preview the table with the normality columns
df_abc.head(n=5)

In [None]:
# Look up a specific item based on (part of) its item_id;
# missing item_id values count as "no match"

hobbies_index = df_abc.loc[df_abc["item_id"].str.contains("HOBBIES_1_323", na=False)]
hobbies_index

In [None]:
# ABC classification at SKU level

# Drop the daily columns and express each turnover as a share of the total

df_abc = df_abc.drop(columns=COLS_DATE)
df_abc['TO%'] = 100 * (df_abc['TO'] / df_abc['TO'].sum())

# Sort by decreasing turnover share and accumulate it

df_abc = df_abc.sort_values('TO%', ascending=False, ignore_index=True)
df_abc['TO%_CS'] = df_abc['TO%'].cumsum()  # cumulated sum

# A, B, C on SKU number: A = first 5 % of the ranked SKUs, B = up to 50 %,
# C = the rest

n_sku = len(df_abc)
n_a, n_b = int(0.05 * n_sku), int(0.5 * n_sku)

# 1-based rank of every row. The upper bound is n_sku + 1 so that the last
# row also receives a rank; stopping at n_sku left it as NaN and turned the
# whole column into floats.
sku_rank = np.arange(1, n_sku + 1)

df_abc['SKU_ID'] = sku_rank
df_abc['SKU_%'] = 100 * sku_rank / n_sku
df_abc['ABC'] = np.where(sku_rank <= n_a, 'A', np.where(sku_rank <= n_b, 'B', 'C'))

# A, B, C on turnover: turnover share of the SKU ranked n_a and of the SKU
# ranked n_b (NaN when no row has that rank, e.g. n_a == 0)

to_a = df_abc.loc[df_abc['SKU_ID'] == n_a, 'TO%'].max()
to_b = df_abc.loc[df_abc['SKU_ID'] == n_b, 'TO%'].max()

print("{:,} unique SKU".format(len(df_abc)))

df_abc.head()

In [None]:
# Distribution by value: cumulated turnover share against share of SKUs

ax = plt.gca()
df_abc.plot(x='SKU_%', y='TO%_CS', figsize=(12, 8), grid=True, ax=ax)

# Vertical reference lines at 5 % and 20 % of the SKUs
# NOTE(review): the 'ABC' column cuts class B at 50 % of the SKUs, while this
# second line sits at 20 % - confirm which threshold is intended

for sku_share in (5, 20):
    ax.axvline(sku_share, color="red", linestyle="-", linewidth=1.0)

# Horizontal reference lines at 80 % and 95 % of the cumulated turnover

for turnover_share in (80, 95):
    ax.axhline(turnover_share, color="blue", linestyle="--", linewidth=1.0)

ax.set_xlabel('Percentage of SKU (%)')
plt.xticks(rotation=90)
ax.set_ylabel('Percentage of the Annual Turnover (%)')

ax.set_title('ABC Analysis: Distribution by Turnover (Sales Value in $)')

plt.show()

### Segmentation by Demand Variability (aka Coefficient of variation)

In [None]:
# Scatter plot: coefficient of variation against turnover share,
# coloured by ABC class

ax = plt.gca()
colors = {'A': 'red', 'B': 'green', 'C': 'blue'}

# Remove outliers: keep only the rows with a CV below 4

df_plot = df_abc.loc[df_abc['CV'].lt(4)].copy()
df_plot.plot.scatter(
    x='TO%',
    y='CV',
    figsize=(12, 8),
    color=df_plot['ABC'].map(colors),
    grid=True,
    ax=ax,
)

# Vertical lines at the turnover shares stored in to_a and to_b
# (the A and B class boundaries)

for class_boundary in (to_a, to_b):
    ax.axvline(class_boundary, color="red", linestyle="-", linewidth=1.0)

# Horizontal line at CV = 1

ax.axhline(1, color="blue", linestyle="--", linewidth=1.0)

ax.set_xlabel('Percentage of Turnover (%)')
plt.xticks(rotation=90)
ax.set_ylabel('Coefficient of Variation')

ax.set_title('Distribution by Demand Variability')

plt.show()

### Normality Test

##### Does the distribution of our sales follow a normal distribution?

In [None]:
# Scatter plot: coefficient of variation against turnover share,
# coloured by the outcome of the normality test

ax = plt.gca()
colors = {False: 'green', True: 'red'}

# Remove outliers: keep only the rows with a CV below 4

df_plot = df_abc.loc[df_abc['CV'].lt(4)].copy()
df_plot.plot.scatter(
    x='TO%',
    y='CV',
    figsize=(12, 8),
    color=df_plot['Not_Normal'].map(colors),
    grid=True,
    ax=ax,
)

# Vertical lines at the turnover shares stored in to_a and to_b
# (the A and B class boundaries)

for class_boundary in (to_a, to_b):
    ax.axvline(class_boundary, color="red", linestyle="-", linewidth=1.0)

# Horizontal line at CV = 1

ax.axhline(1, color="blue", linestyle="--", linewidth=1.0)

ax.set_xlabel('Percentage of Turnover (%)')
plt.xticks(rotation=90)

ax.set_ylabel('Coefficient of Variation')
ax.set_title('Distribution by Demand Variability')

plt.show()

###### Few items follow a normal distribution: just 7 items out of 317.

In [None]:
# Number of items per value of the 'Not_Normal' flag
df_abc.loc[:, 'Not_Normal'].value_counts()