<a href="https://colab.research.google.com/github/rtegao/BigDataSpecialization/blob/main/UpperBag_clusterModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# importing Libraries

In [1]:
import pandas as pd 
import numpy as np
import statistics

from datetime import date

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

import plotly.graph_objects as go



# Importing Data

In [2]:
path = '/content/drive/MyDrive/Personal Project/Upperbag/Data/Clean/'

#customer
df_fem = pd.read_csv( path + 'feminino_secure.csv' )
df_masc = pd.read_csv( path + 'masculino_secure.csv' )

#stock
df_stock_hist = pd.read_csv( path + 'estoque.csv' )
df_stock_actual = pd.read_csv( path + 'situacao_estoque.csv' )

#questions
df_quest_masc = pd.read_csv( path + 'quest_fem_secure.csv' )
df_quest_fem = pd.read_csv( path + 'quest_masc_secure.csv' )


Columns (17,18,19,24,25,26,28,29,37,40,41) have mixed types.Specify dtype option on import or set low_memory=False.


Columns (8,15,17,18,19,20,24,25,26,29,30,31,32,33,34,35,40) have mixed types.Specify dtype option on import or set low_memory=False.



# RFP Segmentation

- R = Recency
- F = Frequency
- P = Profit

## Data Preparation

In [3]:
df_fem = df_fem[['nm_cliente','dt_entrega','qtd_bags_enviadas','qtd_produtos_bag','mkup','flag_compra']]
df_masc = df_masc[['nm_cliente','dt_entrega','qtd_bags_enviadas','qtd_produtos_bag','mkup','flag_compra']]

### Cleaning and Format

#### mkup cleaning

In [4]:
#dropna
df_fem = df_fem[~df_fem['mkup'].isna()].reset_index(drop = True)
df_masc = df_masc[~df_masc['mkup'].isna()].reset_index(drop = True)

#cleaning errors
df_fem = df_fem.drop(list(df_fem[df_fem['mkup'] == '#div/0!'].index)).reset_index(drop = True)
df_masc = df_masc.drop(list(df_masc[df_masc['mkup'] == '#div/0!'].index)).reset_index(drop = True)

# convert to float
df_fem['mkup'] = df_fem['mkup'].astype(float)
df_masc['mkup'] = df_masc['mkup'].astype(float)

#### Convert date, string do date time

In [5]:
df_fem['dt_entrega'] = pd.to_datetime(df_fem['dt_entrega'])
# df_fem['dt_retirada'] = pd.to_datetime(df_fem['dt_retirada'])

df_masc['dt_entrega'] = pd.to_datetime(df_masc['dt_entrega'])
# df_masc['dt_entrega'] = pd.to_datetime(df_masc['dt_entrega'])

### Recency

In [6]:
df_fem = df_fem[~df_fem['dt_entrega'].isna()].reset_index(drop = True)
df_masc = df_masc[~df_masc['dt_entrega'].isna()].reset_index(drop = True)

In [7]:
today = date.today()
today

datetime.date(2020, 12, 7)

In [8]:
df_fem['recency'] = (pd.datetime.now() - df_fem['dt_entrega']).dt.days
df_masc['recency'] = (pd.datetime.now() - df_masc['dt_entrega']).dt.days


The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime instead.


The pandas.datetime class is deprecated and will be removed from pandas in a future version. Import from datetime instead.



### Profit

In [9]:
df_fem.loc[df_fem[df_fem['flag_compra'] == False].index,'mkup'] = df_fem[df_fem['flag_compra'] == False]['mkup']*-1
df_masc.loc[df_masc[df_masc['flag_compra'] == False].index,'mkup'] = df_masc[df_masc['flag_compra'] == False]['mkup']*-1

In [10]:
df_aux_fem = df_fem[['nm_cliente','mkup']].groupby('nm_cliente').sum().reset_index().rename(columns = {'mkup':'profit'})
df_aux_masc = df_masc[['nm_cliente','mkup']].groupby('nm_cliente').sum().reset_index().rename(columns = {'mkup':'profit'})

In [11]:
df_fem = df_fem.merge(df_aux_fem, left_on = 'nm_cliente',right_on = 'nm_cliente', how = 'inner')
df_masc = df_masc.merge(df_aux_masc, left_on = 'nm_cliente',right_on = 'nm_cliente', how = 'inner')

In [12]:
df_fem

Unnamed: 0,nm_cliente,dt_entrega,qtd_bags_enviadas,qtd_produtos_bag,mkup,flag_compra,recency,profit
0,7db2e5a328c0f5112f6e0f6de21d7b2f412de1a6de5210...,2020-09-28,2.0,71.0,2.00,True,70,176.35
1,7db2e5a328c0f5112f6e0f6de21d7b2f412de1a6de5210...,2020-09-28,2.0,71.0,3.44,True,70,176.35
2,7db2e5a328c0f5112f6e0f6de21d7b2f412de1a6de5210...,2020-09-28,2.0,71.0,2.30,True,70,176.35
3,7db2e5a328c0f5112f6e0f6de21d7b2f412de1a6de5210...,2020-09-28,2.0,71.0,2.50,True,70,176.35
4,7db2e5a328c0f5112f6e0f6de21d7b2f412de1a6de5210...,2020-09-28,2.0,71.0,2.00,True,70,176.35
...,...,...,...,...,...,...,...,...
72637,87518198806a6c50f55485f1423a9342697349e513f49c...,2019-07-26,1.0,62.0,1.69,True,500,141.36
72638,87518198806a6c50f55485f1423a9342697349e513f49c...,2019-07-26,1.0,62.0,1.38,True,500,141.36
72639,87518198806a6c50f55485f1423a9342697349e513f49c...,2019-07-26,1.0,62.0,1.72,True,500,141.36
72640,87518198806a6c50f55485f1423a9342697349e513f49c...,2019-07-26,1.0,62.0,1.74,True,500,141.36


### Frequency

In [13]:
df_fem = df_fem.drop(columns=['flag_compra','mkup']).drop_duplicates()
df_masc = df_masc.drop(columns=['flag_compra','mkup']).drop_duplicates()

In [14]:
df_aux_fem = df_fem.groupby('nm_cliente').count().reset_index()[df_fem.groupby('nm_cliente').count().reset_index()['dt_entrega'] > 1]
df_aux_masc = df_masc.groupby('nm_cliente').count().reset_index()[df_masc.groupby('nm_cliente').count().reset_index()['dt_entrega'] > 1]

In [15]:
df_aux_masc

Unnamed: 0,nm_cliente,dt_entrega,qtd_bags_enviadas,qtd_produtos_bag,recency,profit
0,001f63c8a3b7521b264d665011eda5e3279a2da88e59a3...,2,0,2,2,2
20,06293b66ad4d16ed93976a430c7f4ca079bd5d3adac4b4...,3,1,3,3,3
23,06505285e0ab1d192f48cd165ed706f5d72805d971289d...,3,0,3,3,3
26,06fe88a2bfe3c51d9d0af905c627a51a4c69bed9d24119...,2,1,2,2,2
29,0831c6c55f54b138b1196cba030eeb2ee9ee7077807fb9...,2,0,2,2,2
...,...,...,...,...,...,...
1296,f90e90cbb881668c1662adb900f1952213cc167119d986...,2,1,2,2,2
1304,faa0fabf4e020302aeae3832f4b933064639663ea27450...,2,1,2,2,2
1305,fb7e3cb3f544a86dea307fa2572b1b4eae17186f988d2d...,2,1,2,2,2
1308,fbfa79f3bd641421102b44558a42197a035276d6e9fc43...,2,0,2,2,2


In [16]:
#feminino
df_fem['frequency'] = np.nan
for client in list(df_aux_fem['nm_cliente'].unique()):
  aux_list = []
  for i in range(df_fem[df_fem['nm_cliente'] == client].shape[0] - 1):
    recency = list(df_fem[df_fem['nm_cliente'] == client].sort_values(by = 'recency',ascending = False)['recency'])
    aux_list.append(recency[i] - recency[i+1])

  df_fem.loc[df_fem[df_fem['nm_cliente'] == client].index,'frequency'] = statistics.mean(aux_list)
  df_fem['frequency'].fillna(0,inplace = True)
  df_fem.loc[df_fem[df_fem['nm_cliente'] == client].index,'recency'] = df_fem[df_fem['nm_cliente'] == client]['recency'].min()
    
#masculino
df_masc['frequency'] = np.nan
for client in list(df_aux_masc['nm_cliente'].unique()):
  aux_list = []
  for i in range(df_masc[df_masc['nm_cliente'] == client].shape[0] - 1):
    recency = list(df_masc[df_masc['nm_cliente'] == client].sort_values(by = 'recency',ascending = False)['recency'])
    aux_list.append(recency[i] - recency[i+1])
  
  df_masc.loc[df_masc[df_masc['nm_cliente'] == client].index,'frequency'] = statistics.mean(aux_list)
  df_masc['frequency'].fillna(0,inplace = True)
  df_masc.loc[df_masc[df_masc['nm_cliente'] == client].index,'recency'] = df_masc[df_masc['nm_cliente'] == client]['recency'].min()

## Data Pre-Processing

In [17]:
df_masc = df_masc[['nm_cliente','recency','frequency','profit']].drop_duplicates()
df_fem = df_fem[['nm_cliente','recency','frequency','profit']].drop_duplicates()

In [18]:
df_final = pd.concat([df_masc,df_fem], axis = 'index')
df = df_final.copy()

In [19]:
#minmaxscaler
scaler = MinMaxScaler()

In [20]:
df_scaler = scaler.fit_transform(df.iloc[:,1:])
df_scaler

array([[0.1017192 , 0.94135338, 0.58933405],
       [0.10601719, 0.        , 0.58708959],
       [0.1017192 , 0.        , 0.49723683],
       ...,
       [0.65472779, 0.        , 0.49833237],
       [0.68338109, 0.        , 0.49650066],
       [0.71776504, 0.        , 0.5003605 ]])

## Algorithm

In [21]:
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
  kmeans = KMeans(n_clusters=k, random_state=0).fit(df_scaler)
  Sum_of_squared_distances.append(kmeans.inertia_)

In [22]:
fig = go.Figure(data=go.Scatter(x=list(K), y=Sum_of_squared_distances))
fig.show()

In [23]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(df_scaler)

In [24]:
kmeans.predict(df_scaler)

array([2, 1, 1, ..., 0, 0, 0], dtype=int32)

In [26]:
df['cluster'] = list(kmeans.predict(df_scaler))
df

Unnamed: 0,nm_cliente,recency,frequency,profit,cluster
0,a73944e9e484d52773704d3b7096b66766fe08ecd09e50...,70,626.0,2641.94,2
63,bd3aa9a41e0822c57865a9c1715b3056610007a3ad4642...,73,0.0,2578.86,1
100,e66f2a1d5ab9b26dfc315f5d3093a2bf1f7129e53c3157...,70,0.0,53.57,1
123,152d5597f43f315cbf81a70bc7dfdc45225f8db73154b3...,70,73.0,3115.22,1
189,2872c3848dad4b363c23db24183a6c45c2df41bff285f1...,70,0.0,93.96,1
...,...,...,...,...,...
72444,e59c0940c1a47db2ba764b46893b268f0264d4468d88d1...,667,0.0,65.38,0
72475,9e7e36e2e1e2e34c8e90e3f6681f50cdde257df5930d2d...,476,0.0,98.41,0
72515,0946edd1602d1b9b7e240ed34a7d37278ed268b241114a...,456,0.0,84.36,0
72556,f25e1de98f033b3d31c6ac8d069610307e7d421ae80e91...,476,0.0,32.88,0


In [27]:
df.to_csv('/content/drive/MyDrive/Personal Project/Upperbag/Data/Clean/cluster.csv',index=False)