# Desafio da Semana

Objetivo: Sedimentar os conhecimentos de Análise de Dados que aprendemos nessa aula. 

Desafio: Criar um notebook de análise exploratória (como fizemos na nossa aula de hoje) com todas as etapas de coleta, limpeza, análise e visualização com base de dados da Olist.

Requisitos: 
- Formular o mínimo de 4 perguntas para responder com suas análises;
- Utilizar pelo menos 3 bases de dados da Olist (caso você deseje criar sua base do zero). Caso deseje continuar utilizando a que criamos em aula, é necessário incluir pelo menos mais 2 tabelas para enriquecer sua análise.   
- Criar pelo menos 3 gráficos.
- Exporte sua base final.


Submeta uma pasta que contenha: 
- o arquivo .ipynb com sua análise exploratória rodada, ou seja, com as respostas aparecendo no notebook;
- interpretações observadas a partir dos gráficos dentro do notebook;
- a base final criada por você no formato .csv;
- especificar quais bases da olist foram usadas;

In [3]:
import pandas as pd


In [80]:
df_pagamento = pd.read_csv(r"C:\Users\nada_\OneDrive\Desktop\REPROGRAMA\on33-python-S14-analise-de-dados\exercicios\para-casa\Francitelma\olist_order_payments_dataset.csv")
df_pagamento



Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,boleto,1,363.31
103882,7b905861d7c825891d6347454ea7863f,1,credit_card,2,96.80
103883,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47.77
103884,b8b61059626efa996a60be9bb9320e10,1,credit_card,5,369.54


In [81]:
nulos_por_colunas = df_pagamento.isnull().sum()  #sem numeros nulos
print(nulos_por_colunas)

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64


In [82]:
info_df = df_pagamento.info()
print(info_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB
None


In [83]:
def visualizar_as_duplicadas(df_pagamento):  #sem duplicadas
    duplicados = df_pagamento[df_pagamento.duplicated(keep=False)]

    return duplicados

linhas_duplicadas = visualizar_as_duplicadas(df_pagamento)
print (linhas_duplicadas)

Empty DataFrame
Columns: [order_id, payment_sequential, payment_type, payment_installments, payment_value]
Index: []


In [84]:
df_pagamento.describe(include='all')

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
count,103886,103886.0,103886,103886.0,103886.0
unique,99440,,5,,
top,fa65dad1b0e818e3ccc5cb0e39231352,,credit_card,,
freq,29,,76795,,
mean,,1.092679,,2.853349,154.10038
std,,0.706584,,2.687051,217.494064
min,,1.0,,0.0,0.0
25%,,1.0,,1.0,56.79
50%,,1.0,,1.0,100.0
75%,,1.0,,4.0,171.8375


In [19]:

# Contar o número de pagamentos por order_id
pagamentos_por_pedido = df_pagamento.groupby('order_id').size().reset_index(name='num_pagamentos')

# Filtrar pedidos com múltiplos pagamentos
pedidos_multiplos_pagamentos = pagamentos_por_pedido[pagamentos_por_pedido['num_pagamentos'] > 1]

print(pedidos_multiplos_pagamentos.head(200000))


                               order_id  num_pagamentos
36     0016dfedd97fc2950e388d2971d718c7               2
73     002f19a65a2ddd70a090297872e6d64e               2
161    0071ee2429bc1efdc43aa3e073a5290e               2
215    009ac365164f8e06f59d18a08045f6c4               6
258    00b4a910f64f24dbcac04fe54088a443               2
...                                 ...             ...
99241  ff7400d904161b62b6e830b3988f5cbd               2
99293  ff978de32e717acd3b5abe1fb069d2b6               4
99304  ffa1dd97810de91a03abd7bd76d2fed1               2
99308  ffa39020fe7c8a3e907320e1bec4b985               2
99360  ffc730a0615d28ec19f9cad02cb41442               2

[2961 rows x 2 columns]


In [85]:

# Identificar order_ids com múltiplos registros
contagem_order_id = df_pagamento['order_id'].value_counts()

# Filtrar apenas os pedidos com mais de um registro
multiplos_registros = contagem_order_id[contagem_order_id > 1]

print(f"Número de pedidos com múltiplos registros: {len(multiplos_registros)}")

# Exibir alguns exemplos de pedidos com múltiplos registros
print(multiplos_registros.head())

# Consolidação dos pagamentos por pedido
# Agrupar por order_id e somar os valores de pagamento
consolidado_pagamento = df_pagamento.groupby('order_id').agg({
    'payment_value': 'sum',
    'payment_installments': 'sum',  # Ou usar 'max', dependendo do contexto
    'payment_type': lambda x: ', '.join(x.unique())  # Lista os tipos de pagamento usados
}).reset_index()

print(consolidado_pagamento.head())


Número de pedidos com múltiplos registros: 2961
order_id
fa65dad1b0e818e3ccc5cb0e39231352    29
ccf804e764ed5650cd8759557269dc13    26
285c2e15bebd4ac83635ccc563dc71f4    22
895ab968e7bb0d5659d16cd74cd1650c    21
fedcd9f7ccdc8cba3a18defedd1a5547    19
Name: count, dtype: int64
                           order_id  payment_value  payment_installments  \
0  00010242fe8c5a6d1ba2dd792cb16214          72.19                     2   
1  00018f77f2f0320c557190d7a144bdd3         259.83                     3   
2  000229ec398224ef6ca0657da4fc703e         216.87                     5   
3  00024acbcdf0a6daa1e931b038114c75          25.78                     2   
4  00042b26cf59d7ce69dfabb4e55b4fd9         218.04                     3   

  payment_type  
0  credit_card  
1  credit_card  
2  credit_card  
3  credit_card  
4  credit_card  


In [86]:
# Contar o número de pagamentos por pedido
pagamentos_por_pedido = df_pagamento.groupby('order_id').size().reset_index(name='num_pagamentos')

# Contar a frequência de cada número de pagamentos
pagamentos_frequencia = pagamentos_por_pedido['num_pagamentos'].value_counts().sort_index()

# Visualizar pagamentos_frequencia
print(pagamentos_frequencia)

                                        #Número de pedidos com múltiplos registros: 2961


num_pagamentos
1     96479
2      2382
3       301
4       108
5        52
6        36
7        28
8        11
9         9
10        5
11        8
12        8
13        3
14        2
15        2
19        2
21        1
22        1
26        1
29        1
Name: count, dtype: int64


In [87]:
df_consumidor = pd.read_csv(r"C:\Users\nada_\OneDrive\Desktop\REPROGRAMA\on33-python-S14-analise-de-dados\exercicios\para-casa\Francitelma\olist_customers_dataset.csv")
df_consumidor


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP
...,...,...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP
99437,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP
99438,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE
99439,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS


In [88]:
nulos_por_colunas = df_consumidor.isnull().sum()  #sem numeros nulos
print(nulos_por_colunas)

customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64


In [89]:
df_consumidor.describe(include='all')

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
count,99441,99441,99441.0,99441,99441
unique,99441,96096,,4119,27
top,06b8999e2fba1a1fbc88172c00ba8bc7,8d50f5eadf50201ccdcedfb9e2ac8455,,sao paulo,SP
freq,1,17,,15540,41746
mean,,,35137.474583,,
std,,,29797.938996,,
min,,,1003.0,,
25%,,,11347.0,,
50%,,,24416.0,,
75%,,,58900.0,,


In [90]:
def visualizar_as_duplicadas(df_consumidor):  #sem duplicadas
    duplicados = df_consumidor[df_consumidor.duplicated(keep=False)]

    return duplicados

linhas_duplicadas = visualizar_as_duplicadas(df_consumidor)
print (linhas_duplicadas)

Empty DataFrame
Columns: [customer_id, customer_unique_id, customer_zip_code_prefix, customer_city, customer_state]
Index: []


In [91]:
info_df = df_consumidor.info()
print(info_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB
None


In [92]:
df_consumidor.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


In [94]:
df_consumidor = df_consumidor.drop('customer_unique_id', axis=1)   #dropando as colunas customer_unique_id e customer_zip_code_prefix
df_consumidor

Unnamed: 0,customer_id,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,campinas,SP
...,...,...,...
99436,17ddf5dd5d51696bb3d7c6291687be6f,sao paulo,SP
99437,e7b71a9017aa05c9a7fd292d714858e8,taboao da serra,SP
99438,5e28dfe12db7fb50a4b2f691faecea5e,fortaleza,CE
99439,56b18e2166679b8a959d72dd06da27f9,canoas,RS


In [95]:
df_items = pd.read_csv(r"C:\Users\nada_\OneDrive\Desktop\REPROGRAMA\on33-python-S14-analise-de-dados\exercicios\para-casa\Francitelma\olist_order_items_dataset.csv")
df_items

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.90,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.90,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.00,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.90,18.14
...,...,...,...,...,...,...,...
112645,fffc94f6ce00a00581880bf54a75a037,1,4aa6014eceb682077f9dc4bffebc05b0,b8bc237ba3788b23da09c0f1f3a3288c,2018-05-02 04:11:01,299.99,43.41
112646,fffcd46ef2263f404302a634eb57f7eb,1,32e07fd915822b0765e448c4dd74c828,f3c38ab652836d21de61fb8314b69182,2018-07-20 04:31:48,350.00,36.53
112647,fffce4705a9662cd70adb13d4a31832d,1,72a30483855e2eafc67aee5dc2560482,c3cfdc648177fdbbbb35635a37472c53,2017-10-30 17:14:25,99.90,16.95
112648,fffe18544ffabc95dfada21779c9644f,1,9c422a519119dcad7575db5af1ba540e,2b3e4a2a3ea8e01938cabda2a3e5cc79,2017-08-21 00:04:32,55.99,8.72


In [96]:
info_df = df_items.info()
print(info_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 6.0+ MB
None


In [97]:
df_items.describe(include='all')

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
count,112650,112650.0,112650,112650,112650,112650.0,112650.0
unique,98666,,32951,3095,93318,,
top,8272b63d03f5f79c56e9e4120aec44ef,,aca2eb7d00ea1a7b8ebd4e68314663af,6560211a19b47992c3666cc44a7e94c0,2017-07-21 18:25:23,,
freq,21,,527,2033,21,,
mean,,1.197834,,,,120.653739,19.99032
std,,0.705124,,,,183.633928,15.806405
min,,1.0,,,,0.85,0.0
25%,,1.0,,,,39.9,13.08
50%,,1.0,,,,74.99,16.26
75%,,1.0,,,,134.9,21.15


In [99]:

df_items = df_items.drop('shipping_limit_date', axis=1) #dropando colunas shipping_limit_date e seller_id
df_items

Unnamed: 0,order_id,order_item_id,product_id,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,58.90,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,239.90,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,199.00,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,199.90,18.14
...,...,...,...,...,...
112645,fffc94f6ce00a00581880bf54a75a037,1,4aa6014eceb682077f9dc4bffebc05b0,299.99,43.41
112646,fffcd46ef2263f404302a634eb57f7eb,1,32e07fd915822b0765e448c4dd74c828,350.00,36.53
112647,fffce4705a9662cd70adb13d4a31832d,1,72a30483855e2eafc67aee5dc2560482,99.90,16.95
112648,fffe18544ffabc95dfada21779c9644f,1,9c422a519119dcad7575db5af1ba540e,55.99,8.72


In [102]:
df_pagamento.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [103]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   order_id       112650 non-null  object 
 1   order_item_id  112650 non-null  int64  
 2   product_id     112650 non-null  object 
 3   price          112650 non-null  float64
 4   freight_value  112650 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 4.3+ MB


In [123]:
# Jutando os df pedidos e itens de pedidos
df_olist = pd.merge(df_pagamento, df_items, on='order_id', how='inner')
df_olist

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value,order_item_id,product_id,price,freight_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33,1,af74cc53dcffc8384b29e7abfa41902b,79.80,19.53
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39,1,a630cc320a8c872f9de830cf121661a3,17.00,7.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71,1,2028bf1b01cafb2d2b1901fca4083222,56.99,8.72
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78,1,548e5bfe28edceab6b51fa707cc9556f,89.90,17.88
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45,1,386486367c1f9d4f587a8864ccb6902b,113.57,14.88
...,...,...,...,...,...,...,...,...,...
117596,0406037ad97740d563a178ecc7a2075c,1,boleto,1,363.31,1,1302fd16bdfe85e523940c846370ef2c,297.00,66.31
117597,7b905861d7c825891d6347454ea7863f,1,credit_card,2,96.80,1,fb55982be901439613a95940feefd9ee,79.00,17.80
117598,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47.77,1,848ef0a103ee21407d1fbd0f7e7ae179,39.99,7.78
117599,b8b61059626efa996a60be9bb9320e10,1,credit_card,5,369.54,1,e9c1abcc7126360336d8f56d49041a8c,349.00,20.54


In [118]:
df_consumidor = df_consumidor.rename(columns={'order_id': 'customer_id'})
df_consumidor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   order_id        99441 non-null  object
 1   customer_city   99441 non-null  object
 2   customer_state  99441 non-null  object
dtypes: object(3)
memory usage: 2.3+ MB


In [125]:
# Verificar se a coluna 'order_id' está presente
print('order_id in df_olist:', 'order_id' in df_olist.columns)
print('order_id in df_consumidor:', 'order_id' in df_consumidor.columns)


order_id in df_olist: True
order_id in df_consumidor: False


In [126]:
df_consumidor = df_consumidor.rename(columns={'customer_id': 'order_id'})
df_consumidor.head()

Unnamed: 0,order_id,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,campinas,SP


In [127]:
print(df_consumidor['order_id'].isnull().sum())

0


In [128]:
df_olist = pd.merge(df_olist, df_consumidor, on='order_id', how='inner')


print(df_olist.head())

Empty DataFrame
Columns: [order_id, payment_sequential, payment_type, payment_installments, payment_value, order_item_id, product_id, price, freight_value, customer_city, customer_state]
Index: []
