# Cluster Analysis

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

## Read in Data

In [2]:
# Load the vendor metrics table from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="group7_data.csv")
df.head(n=5)

Unnamed: 0,VENDOR,FREQUENCY,RECENCY,AVG_COST,RETAIL,QUANTITY,AMT,PROFIT,PURCHASES,RETURNS
0,5511283,2582606.0,2005-08-27,12.032232,53611910.0,2674419,52712890.0,16926410.0,2582606.0,91813.0
1,113645,1044197.0,2005-08-27,26.176994,47901820.0,1098662,46666630.0,13265590.0,1044197.0,54465.0
2,13031,418163.0,2005-08-27,3.210998,1787935.0,428453,2303070.0,810849.5,418163.0,10290.0
3,5715232,369674.0,2005-08-27,32.926035,14169200.0,411493,18943280.0,1292774.0,369674.0,41819.0
4,3626213,367217.0,2005-08-27,1.626938,1517627.0,384542,1273198.0,529786.7,367217.0,17325.0


## Categorize vendors according to k-means cluster

In [3]:
# Vendor IDs taken from k-means cluster 2, treated here as the best performers
best_vendor_lst = [
    5511283, 113645, 13031, 5715232, 3626213,
    2012863, 3313116, 9520439, 313319, 6016957,
    7016341, 2067178, 4516339, 13396, 1114936,
    2017178, 5011295, 60904, 10903, 6041161,
    4412768, 6013105, 9514659, 1116343, 6062767,
    66561, 226176, 4259203, 9113491, 7326340,
    5745232, 11679, 3916215, 514761, 4016074,
    5016699, 16561, 7510902,
]

In [4]:
# Vendor IDs taken from k-means cluster 0, treated here as the worst performers
worst_vendor_lst = [
    115362, 7045883, 7211456, 7095883, 7055883,
    7035883, 2819403, 6935883, 1012863, 16605,
    9716181, 2512827, 3513036, 5611454, 7221456,
    6945292, 9713322, 4913317, 2219404, 6916222,
]

In [5]:
# Label every vendor with a performance category.
# Every row starts as "Average"; the worst list is applied first and the best
# list last, so a vendor present in both lists would end up labelled "Best".
vendor_ids = df["VENDOR"]
category = pd.Series("Average Performing Vendors", index=df.index)
category = category.mask(vendor_ids.isin(worst_vendor_lst), "Worst Performing Vendors")
category = category.mask(vendor_ids.isin(best_vendor_lst), "Best Performing Vendors")
df["VENDOR_CATEGORY"] = category

# Keep only the columns used by the rest of the analysis
feature_columns = [
    "VENDOR", "FREQUENCY", "RECENCY", "AVG_COST", "PROFIT",
    "QUANTITY", "PURCHASES", "RETURNS", "VENDOR_CATEGORY",
]
df = df[feature_columns]

df.head()

Unnamed: 0,VENDOR,FREQUENCY,RECENCY,AVG_COST,PROFIT,QUANTITY,PURCHASES,RETURNS,VENDOR_CATEGORY
0,5511283,2582606.0,2005-08-27,12.032232,16926410.0,2674419,2582606.0,91813.0,Best Performing Vendors
1,113645,1044197.0,2005-08-27,26.176994,13265590.0,1098662,1044197.0,54465.0,Best Performing Vendors
2,13031,418163.0,2005-08-27,3.210998,810849.5,428453,418163.0,10290.0,Best Performing Vendors
3,5715232,369674.0,2005-08-27,32.926035,1292774.0,411493,369674.0,41819.0,Best Performing Vendors
4,3626213,367217.0,2005-08-27,1.626938,529786.7,384542,367217.0,17325.0,Best Performing Vendors


## Find Cluster Statistics

In [6]:
# Mean of every numeric metric per vendor category.
# VENDOR is an identifier, so it is dropped before averaging.
# RECENCY is read from the CSV as date strings; numeric_only=True leaves it out
# of the mean explicitly instead of relying on pandas silently dropping it
# (pandas >= 2.0 raises a TypeError on the non-numeric column otherwise).
df_stats = (
    df.drop(columns="VENDOR")
    .groupby("VENDOR_CATEGORY")
    .mean(numeric_only=True)
    .round(2)
)
df_stats

Unnamed: 0_level_0,FREQUENCY,AVG_COST,PROFIT,QUANTITY,PURCHASES,RETURNS
VENDOR_CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Average Performing Vendors,6533.1,19.03,34656.52,7154.65,6533.1,621.55
Best Performing Vendors,232230.03,16.71,1669030.81,246125.66,232230.03,13895.63
Worst Performing Vendors,1015.25,222.44,58732.59,1089.65,1015.25,74.4


In [7]:
# Return ratio = returns as a percentage of purchases, computed from the
# per-category means already stored in df_stats.
# The groupby leaves VENDOR_CATEGORY as the index; move it into a column only
# while it still is the index, so re-running this cell does not insert a stray
# "index" column.
if df_stats.index.name == "VENDOR_CATEGORY":
    df_stats = df_stats.reset_index()
df_stats["RETURN_RATIO"] = (100.0 * df_stats["RETURNS"] / df_stats["PURCHASES"]).round(2)
df_stats

Unnamed: 0,VENDOR_CATEGORY,FREQUENCY,AVG_COST,PROFIT,QUANTITY,PURCHASES,RETURNS,RETURN_RATIO
0,Average Performing Vendors,6533.1,19.03,34656.52,7154.65,6533.1,621.55,9.51
1,Best Performing Vendors,232230.03,16.71,1669030.81,246125.66,232230.03,13895.63,5.98
2,Worst Performing Vendors,1015.25,222.44,58732.59,1089.65,1015.25,74.4,7.33


## Summary for Cluster Statistics

- Best vendors have the highest frequency, profit, quantity, purchases, and returns of the three clusters, together with the lowest average cost and the lowest return ratio.
- Worst vendors have the lowest frequency, quantity, purchases, and returns of the three clusters, the highest average cost, and a medium profit and medium return ratio.

## Vendor Analysis

In [8]:
# Load the vendor/brand/category tables for the best and worst vendor clusters,
# then display the best-vendor table
df_best = pd.read_csv("df_best_final.csv")
df_worst = pd.read_csv("df_worst_final.csv")
df_best

Unnamed: 0,VENDOR,BRAND,CATEGORY
0,6041161,CHANEL,BEAUTY
1,4412768,CALVINKLEIN,CLOTHES
2,6013105,CABERNET,CLOTHES
3,9514659,1928,ACCESSORIES
4,1116343,EUROITAL,CLOTHES
5,6062767,LISLI,CLOTHES
6,66561,ROUNDTREE&YORKE,CLOTHES
7,226176,NOFEAR,CLOTHES
8,4259203,LISLI,CLOTHES
9,9113491,ETERNIT,CLOTHES


In [9]:
# Display the worst-vendor table that was read from "df_worst_final.csv"
df_worst

Unnamed: 0,VENDOR,BRAND,CATEGORY
0,2219404,CREMIEUX,CLOTHES
1,2512827,HARTMANN,LUGGAGE
2,3513036,HUGO BOS,CLOTHES
3,4913317,KIPPY'S,CLOTHES
4,5611454,KAY UNGE,CLOTHES
5,6916222,MARY FRA,CLOTHES
6,6945292,GFT USA,CLOTHES
7,7221456,PEERLESS,CLOTHES
8,9713322,OLD GRIN,CLOTHES
9,9716181,NOVAPPAR,CLOTHES


In [10]:
#calculate percent of Categories in Best Vendor cluster
df_best_cat = df_best.groupby("CATEGORY").count() \
.reset_index()[["CATEGORY","VENDOR"]].rename(columns={"VENDOR":"COUNT"})
df_best_cat["CATEGORY_PERCENT"] = round(100*df_best_cat["COUNT"]/df_best_cat["COUNT"].sum(),2)
df_best_cat.sort_values(by="CATEGORY_PERCENT", ascending=False).reset_index(drop=True)

Unnamed: 0,CATEGORY,COUNT,CATEGORY_PERCENT
0,CLOTHES,20,52.63
1,BEAUTY,11,28.95
2,SHOES,5,13.16
3,ACCESSORIES,1,2.63
4,LUGGAGE,1,2.63


In [11]:
#calculate percent of Categories in Worst Vendor cluster
df_worst_cat = df_worst.groupby("CATEGORY").count() \
.reset_index()[["CATEGORY","VENDOR"]].rename(columns={"VENDOR":"COUNT"})
df_worst_cat["CATEGORY_PERCENT"] = round(100*df_worst_cat["COUNT"]/df_worst_cat["COUNT"].sum(),2)
df_worst_cat.sort_values(by="CATEGORY_PERCENT", ascending=False).reset_index(drop=True)

Unnamed: 0,CATEGORY,COUNT,CATEGORY_PERCENT
0,CLOTHES,19,82.61
1,BAGS,1,4.35
2,FURNITURE,1,4.35
3,LUGGAGE,1,4.35
4,SHOES,1,4.35


### Summary for Vendor Analysis

- Clothes, Beauty, and Shoes vendors make up most of the best performing vendor cluster (about 53%, 29%, and 13% of its entries, respectively).
- Clothes vendors dominate the worst performing vendor cluster (about 83% of its entries).
- Because clothes vendors dominate both the best and the worst clusters, we recommend conducting more market research on the most popular and least popular clothes brands, and then targeting the vendors that sell the popular ones.
- It would also be a good idea to target vendors who sell trending beauty and cosmetic products in order to increase revenue.
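- Purchasing goods from vendors who provide bags, furniture, luggage, and shoes would require more careful consideration, since these categories also appear in the worst performing cluster and may be poorly received by customers. Note that each of them appears only once there, and that shoes and luggage also appear in the best performing cluster, so this evidence is limited.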
