# Bix Tecnologia


Fonte: https://docs.google.com/spreadsheets/d/1Sc6hN48b-lWbDdYNd_t9j8Bt0EZ8BUNG/edit?pli=1#gid=693918541

# Initial imports

In [1]:
%pip install ipython-autotime  --upgrade

In [2]:
# Colab helpers: `drive` is used below to mount Google Drive; `files` is not
# referenced in the cells visible here.
from google.colab import drive, files
import pandas as pd
# Mount Google Drive under /content/drive so the CSVs can be read by path.
drive.mount('/content/drive', force_remount=True)
# Load the autotime extension installed in the previous cell; it is what prints
# the "time: ..." line after each cell's output.
%load_ext autotime

Mounted at /content/drive
time: 275 µs (started: 2023-07-08 00:32:05 +00:00)


In [3]:
# Folder on the mounted Drive that holds the four CSV files. Kept in one place
# so the notebook can be pointed at another location with a single edit.
DATA_DIR = "/content/drive/MyDrive/datasets/bix-tecnologia"
# Format string used to parse the date columns of the CSVs.
DATE_FORMAT = "%m/%d/%Y"
# Columns kept from sales.csv, in the order they should appear.
SALES_COLUMNS = ['ID', 'StoreID', 'ProductID', 'ClientID', 'Discount', 'UnitPrice', 'Quantity', 'Date']

clients = pd.read_csv(f"{DATA_DIR}/clients.csv", encoding='utf-8')
clients['DateOfBirth'] = pd.to_datetime(clients['DateOfBirth'], format=DATE_FORMAT)

products = pd.read_csv(f"{DATA_DIR}/products.csv", encoding='utf-8')

# sales.csv is read skipping its first 4 lines (presumably a non-tabular
# preamble before the header row -- TODO confirm against the file).
# The column subset and the date parsing are done in a single chain so no
# column is assigned into an intermediate slice of the raw frame.
sales = (
    pd.read_csv(f"{DATA_DIR}/sales.csv", encoding='utf-8', skiprows=4)
    .loc[:, SALES_COLUMNS]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format=DATE_FORMAT))
)

stores = pd.read_csv(f"{DATA_DIR}/stores.csv", encoding='utf-8')

time: 357 ms (started: 2023-07-08 00:32:05 +00:00)


Vou renomear as colunas para que fiquem mais legíveis e para poder unir os dataframes com a função merge(), formando um único dataset.

In [4]:
# Give every table snake_case column names with a table prefix, and make the
# key columns (store_id, product_id, client_id) share the same name across
# tables so they can be joined with merge(on=...).
clients = clients.rename(
    {
        'ID': 'client_id',
        'City': 'client_city',
        'State': 'client_state',
        'DateOfBirth': 'client_birth',
        'Sex': 'client_sex',
    },
    axis='columns',
)
products = products.rename(
    {
        'ID': 'product_id',
        'Name': 'product_name',
        'Size': 'product_size',
    },
    axis='columns',
)
sales = sales.rename(
    {
        'ID': 'id',
        'ProductID': 'product_id',
        'ClientID': 'client_id',
        'Discount': 'discount',
        'UnitPrice': 'unit_price',
        'Quantity': 'quantity',
        'StoreID': 'store_id',
        'Date': 'date',
    },
    axis='columns',
)
stores = stores.rename(
    {
        'ID': 'store_id',
        'Name': 'store_city',
        'State': 'store_state',
    },
    axis='columns',
)

time: 4.51 ms (started: 2023-07-08 00:32:06 +00:00)


Agora, vou criar um único dataframe com todos os arquivos. Além disso, vou reordenar as colunas para legibilidade.

In [5]:
# Join each sale with its store, product and client (default inner joins on
# the shared key columns).
dataset = (
    sales
    .merge(stores, on='store_id')
    .merge(products, on='product_id')
    .merge(clients, on='client_id')
)

# Reorder for readability: date first, then descriptive attributes, then the
# numeric measures, with the identifier columns last.
dataset = dataset[[
    'date',
    'store_city', 'store_state',
    'product_name', 'product_size',
    'client_city', 'client_state', 'client_birth', 'client_sex',
    'discount', 'unit_price', 'quantity',
    'id', 'store_id', 'product_id', 'client_id',
]]

time: 151 ms (started: 2023-07-08 00:32:06 +00:00)


# EDA

In [6]:
import numpy as np

time: 307 µs (started: 2023-07-08 00:32:06 +00:00)


In [7]:
# Preview the first rows of the merged dataset.
dataset.head()

Unnamed: 0,date,store_city,store_state,product_name,product_size,client_city,client_state,client_birth,client_sex,discount,unit_price,quantity,id,store_id,product_id,client_id
0,2018-12-10,Curitiba,PR,Tempestade,G,Curitiba,PR,1985-06-28,Homem,8,2492,1,80260d682079b6090c8285b398c50d97,4,002552c0663708129c0019cc97552d7d3,14001
1,2019-04-25,Curitiba,PR,Thanos,G,Curitiba,PR,1985-06-28,Homem,1,1624,1,3275736da1234f55d52bdf09d86b93bc,4,001b237c0e9bb435f2e54071129237e93,14001
2,2019-07-17,Curitiba,PR,Capitão América,P,Curitiba,PR,1985-06-28,Homem,1,1946,1,f3513eec8f1434b5b7e04ddf3598df40,4,00066f42aeeb9f3007548bb9d3f33c381,14001
3,2018-12-03,Curitiba,PR,Tempestade,G,Curitiba,PR,1987-10-21,Homem,8,2492,1,c330b70c5e60bf56e90ebe09e045f79e,4,002552c0663708129c0019cc97552d7d3,14116
4,2018-04-29,Curitiba,PR,Bazinga,G,Curitiba,PR,1987-10-21,Homem,1,2184,2,eced501ea6ea98469acff0ccbf3ca09b,4,002ec297b1b00fb9dde7ee6ac24b67713,14116


time: 32.5 ms (started: 2023-07-08 00:32:06 +00:00)


In [8]:
# Show each column's dtype and non-null count for the merged dataset.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32753 entries, 0 to 32752
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          32753 non-null  datetime64[ns]
 1   store_city    32753 non-null  object        
 2   store_state   32753 non-null  object        
 3   product_name  32753 non-null  object        
 4   product_size  32753 non-null  object        
 5   client_city   32753 non-null  object        
 6   client_state  32753 non-null  object        
 7   client_birth  32753 non-null  datetime64[ns]
 8   client_sex    32753 non-null  object        
 9   discount      32753 non-null  object        
 10  unit_price    32753 non-null  object        
 11  quantity      32753 non-null  int64         
 12  id            32753 non-null  object        
 13  store_id      32753 non-null  int64         
 14  product_id    32753 non-null  object        
 15  client_id     32753 non-null  int64 

Os atributos 'discount' e 'unit_price' são numéricos, mas estão definidos como 'object'. Isso acontece porque os números contêm vírgula, e não ponto, como separador decimal. Vou mudar seus tipos para que eu possa criar os gráficos de maneira adequada.

In [9]:
# 'discount' and 'unit_price' are loaded as strings that use a comma as the
# decimal separator, so swap it for a dot and cast to float64.
# The vectorized .str accessor replaces the per-element np.vectorize lambda,
# and the astype(str) guard keeps the cell re-runnable: when the columns are
# already float there is no comma to replace and the cast is a no-op,
# instead of raising AttributeError on float.replace.
for column in ('discount', 'unit_price'):
    dataset[column] = (
        dataset[column]
        .astype(str)
        .str.replace(',', '.', regex=False)
        .astype('float64')
    )

time: 124 ms (started: 2023-07-08 00:32:06 +00:00)


Crio duas listas contendo os atributos categóricos e numéricos. Inicialmente, ignorei datas e ids.

In [10]:
# Column groups used by the exploratory cells below; date and id columns are
# deliberately left out of both lists.
categorical = [
    'store_city',
    'store_state',
    'product_name',
    'product_size',
    'client_city',
    'client_state',
    'client_sex',
]
numerical = [
    'discount',
    'unit_price',
    'quantity',
]

time: 1.6 ms (started: 2023-07-08 00:32:06 +00:00)


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset[numerical].describe()

Unnamed: 0,discount,unit_price,quantity
count,32753.0,32753.0,32753.0
mean,0.093012,221.644967,1.058651
std,0.009536,112.535304,0.239606
min,0.08,114.8,1.0
25%,0.08,161.0,1.0
50%,0.1,204.4,1.0
75%,0.1,233.8,1.0
max,0.1,686.0,3.0


time: 56.9 ms (started: 2023-07-08 00:32:06 +00:00)


Verificando quantos valores únicos cada atributo categórico tem.

In [12]:
# Print the number of distinct values held by each categorical column.
for column, n_unique in dataset[categorical].nunique().items():
    print(f'{column}: {n_unique}')

store_city: 7
store_state: 7
product_name: 24
product_size: 4
client_city: 123
client_state: 18
client_sex: 2
time: 31.3 ms (started: 2023-07-08 00:32:06 +00:00)


Agora, vou ver quais são.

In [13]:
# For each categorical column, print its values with their frequencies.
for column in categorical:
    frequencies = dataset[column].value_counts()
    print(f'{column}:\n {frequencies}\n')

store_city:
 Florianópolis     8230
Rio de Janeiro    6838
Porto Alegre      6167
Curitiba          5172
São Paulo         3588
Belo Horizonte    1845
Online             913
Name: store_city, dtype: int64

store_state:
 SC    8230
RJ    6838
RS    6167
PR    5172
SP    3588
MG    1845
na     913
Name: store_state, dtype: int64

product_name:
 Iron Man               1420
League Legends         1403
Capitã Marvel          1399
Thanos                 1388
Ciclope                1386
Homer Simpsons         1382
Super Mario            1381
Bazinga                1380
Deadpool               1378
Batman                 1373
Wolverine              1372
Coringa                1369
Dragon Ball Z          1366
Flash                  1363
Pokemon Go Squirtle    1362
Mulher-Maravilha       1359
Rick Morty             1358
Darth Vader            1357
BIXDream               1347
Thor                   1333
Capitão América        1331
Pantera Negra          1323
Tempestade             1318
Naruto     

Verificando se há valores nulos no conjunto de dados.

In [14]:
# Count the missing values in every column and print one line per column.
for column, n_missing in dataset.isnull().sum().items():
    print(f'{column}: {n_missing}')

date: 0
store_city: 0
store_state: 0
product_name: 0
product_size: 0
client_city: 0
client_state: 0
client_birth: 0
client_sex: 0
discount: 0
unit_price: 0
quantity: 0
id: 0
store_id: 0
product_id: 0
client_id: 0
time: 80.4 ms (started: 2023-07-08 00:32:06 +00:00)


Esse resultado não está inteiramente correto. Pude ver antes que 'store_state' possui valores 'na', mas eles estão sendo reconhecidos pelo Pandas como strings. Vou corrigir isso.

In [15]:
# The literal string 'na' in 'store_state' stands for a missing value, so turn
# it into a real NaN that isnull() can detect. The replacement is limited to
# that column: a frame-wide replace would also blank out any other cell that
# happened to equal 'na'.
dataset = dataset.assign(store_state=dataset['store_state'].replace('na', np.nan))

time: 23.9 ms (started: 2023-07-08 00:32:07 +00:00)


In [16]:
# Re-check the per-column missing-value counts after the 'na' replacement.
missing_per_column = dataset.isnull().sum()
for column in dataset.columns:
    print(f'{column}: {missing_per_column[column]}')

date: 0
store_city: 0
store_state: 913
product_name: 0
product_size: 0
client_city: 0
client_state: 0
client_birth: 0
client_sex: 0
discount: 0
unit_price: 0
quantity: 0
id: 0
store_id: 0
product_id: 0
client_id: 0
time: 52.9 ms (started: 2023-07-08 00:32:07 +00:00)


## Data visualization

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep seaborn's current colour palette in a variable (presumably for reuse in
# the plots that follow -- no plotting cell is visible here).
colors = sns.color_palette()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

time: 1.09 s (started: 2023-07-08 00:32:07 +00:00)


# Train/Test split

time: 1.1 s (started: 2023-07-08 00:32:07 +00:00)


# Feature engineering

time: 1.1 s (started: 2023-07-08 00:32:07 +00:00)


# Model training

time: 1.11 s (started: 2023-07-08 00:32:07 +00:00)
