# 1. Install the packages

In [None]:
pip install google-cloud-bigquery
pip install google-cloud-bigquery-storage

# 2. Connection to Olist on BigQuery

In [3]:
import os

import numpy as np
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account

# Path to the service account key JSON file.
# Set the OLIST_BQ_KEY_PATH environment variable to point at your own key;
# when it is not set, the original author's local path is used as a fallback.
key_path = os.environ.get(
    "OLIST_BQ_KEY_PATH",
    "C:/Users/Tricia/personal/module_2_project/key/dsai-module-2-project-25282646cfd3.json",
)

# Create credentials from the service account key, limited to the BigQuery scope
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/bigquery"]
)

# Create the BigQuery client; the project id is taken from the key file itself
client = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id
)


# 3. From here onwards, we can query the data
## 3.1 Investigate Order Items table:
### Findings:
    a) The same product_id can appear several times under one order_id. Because the data set has no quantity column, this most likely means the customer ordered more than one unit of that product.

In [4]:
# Define the SQL query
query = "SELECT * FROM dsai-module-2-project.olist.order_items"

# Submit the query exactly once. Previously a second client.query(query) call
# was issued just to build the DataFrame, which ran the same full-table query twice.
query_job = client.query(query)

# Wait for the query to finish
results = query_job.result()

# Fetch the rows of the already-finished job into a DataFrame
df_order_items = query_job.to_dataframe()

df_order_items

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,3ee6513ae7ea23bdfab5b9ab60bffcb5,1,8a3254bee785a526d548a81a9bc3c9be,96804ea39d96eb908e7c3afdb671bb9e,2018-05-04 03:55:26+00:00,0.85,18.23
1,6e864b3f0ec71031117ad4cf46b7f2a1,1,8a3254bee785a526d548a81a9bc3c9be,96804ea39d96eb908e7c3afdb671bb9e,2018-05-02 20:30:34+00:00,0.85,18.23
2,c5bdd8ef3c0ec420232e668302179113,2,8a3254bee785a526d548a81a9bc3c9be,96804ea39d96eb908e7c3afdb671bb9e,2018-05-07 02:55:22+00:00,0.85,22.30
3,8272b63d03f5f79c56e9e4120aec44ef,2,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.20,7.89
4,8272b63d03f5f79c56e9e4120aec44ef,3,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.20,7.89
...,...,...,...,...,...,...,...
112645,199af31afc78c699f0dbf71fb178d4d4,1,c3ed642d592594bb648ff4a04cee2747,59417c56835dd8e2e72f91f809cd4092,2017-05-09 15:50:15+00:00,4690.00,74.34
112646,a96610ab360d42a2e5335a3998b4718a,1,a6492cc69376c469ab6f61d8f44de961,59417c56835dd8e2e72f91f809cd4092,2017-04-18 13:25:18+00:00,4799.00,151.34
112647,f5136e38d1a14a4dbd87dff67da82701,1,1bdf5e6731585cf01aa8169c7028d6ad,ee27a8f15b1dded4d213a468ba4eb391,2017-06-15 02:45:17+00:00,6499.00,227.66
112648,fefacc66af859508bf1a7934eab1e97f,1,69c590f7ffc7bf8db97190b6cb6ed62e,80ceebb4ee9b31afb6c6a916a574a1e2,2018-08-02 04:05:13+00:00,6729.00,193.21


In [5]:
# Summary statistics for the order_item_id column
df_order_items["order_item_id"].describe()

count    112650.0
mean     1.197834
std      0.705124
min           1.0
25%           1.0
50%           1.0
75%           1.0
max          21.0
Name: order_item_id, dtype: Float64

In [6]:
# Distinct order_item_id values present in the table
df_order_items["order_item_id"].unique()

<IntegerArray>
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
Length: 21, dtype: Int64

In [7]:
# Check for fully duplicated rows (every column equal to an earlier row).
# duplicated() already returns a boolean mask, so it is used directly
# instead of being compared with True.
df_order_items[df_order_items.duplicated()]

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value


In [None]:
# Disabled helper: un-comment these lines to restore pandas' default display settings.
#pd.reset_option('display.max_rows')
#pd.reset_option('display.max_columns')
#pd.reset_option('display.width')
#pd.reset_option('display.max_colwidth')

In [8]:
# Rows whose order_id already appeared in an earlier row, i.e. the repeat
# occurrences of an order_id. The boolean mask from duplicated() is used
# directly instead of being compared with True.
df_order_items[df_order_items["order_id"].duplicated()]


Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
4,8272b63d03f5f79c56e9e4120aec44ef,3,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.20,7.89
5,8272b63d03f5f79c56e9e4120aec44ef,4,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.20,7.89
6,8272b63d03f5f79c56e9e4120aec44ef,5,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.20,7.89
7,8272b63d03f5f79c56e9e4120aec44ef,6,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.20,7.89
8,8272b63d03f5f79c56e9e4120aec44ef,7,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.20,7.89
...,...,...,...,...,...,...,...
112432,736e1922ae60d0d6a89247b851902527,4,19936fa4f614ee0590d3b77ac83fd648,b37c4c02bda3161a7546a4e6d222d5b2,2018-07-23 04:31:36+00:00,1790.00,28.72
112446,a8f5ea0830ac9ffd1d2864434bcddb70,1,1550a0fe151b511cb7cb327c55c865d3,5dceca129747e92ff8ef7a997dc4f8ca,2018-06-05 20:31:50+00:00,1837.90,78.84
112559,d2f270487125ddc41fd134c4003ad1d7,1,df676834e2a38e1bf267bd85a2dc8e3a,2bf6a2c1e71bbd29a4ad64e6d3c3629f,2018-06-11 19:50:18+00:00,2200.00,22.75
112570,b4c4b76c642808cbe472a32b86cddc95,2,3db0b74faf0d26a6b252528659d6b849,c4f7fee5b0db50e87766f5a4d1b1b758,2018-07-18 12:25:53+00:00,2299.95,104.77


In [9]:
# All line items of one order. order_item_id is added as a tie-breaker because
# sorting by product_id alone leaves the row order undefined when the same
# product appears on several lines of the order.
query = (
    "SELECT * FROM dsai-module-2-project.olist.order_items "
    "where order_id='8272b63d03f5f79c56e9e4120aec44ef' "
    "ORDER BY product_id, order_item_id"
)

df_results = client.query(query).to_dataframe()
df_results

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,8272b63d03f5f79c56e9e4120aec44ef,2,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
1,8272b63d03f5f79c56e9e4120aec44ef,3,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
2,8272b63d03f5f79c56e9e4120aec44ef,4,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
3,8272b63d03f5f79c56e9e4120aec44ef,5,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
4,8272b63d03f5f79c56e9e4120aec44ef,6,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
5,8272b63d03f5f79c56e9e4120aec44ef,7,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
6,8272b63d03f5f79c56e9e4120aec44ef,8,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
7,8272b63d03f5f79c56e9e4120aec44ef,9,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
8,8272b63d03f5f79c56e9e4120aec44ef,10,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89
9,8272b63d03f5f79c56e9e4120aec44ef,11,05b515fdc76e888aada3c6d66c201dff,2709af9587499e95e803a6498a5a56e9,2017-07-21 18:25:23+00:00,1.2,7.89


In [69]:
# All line items of one order. order_item_id is added as a tie-breaker because
# sorting by product_id alone leaves the row order undefined when the same
# product appears on several lines of the order.
query = (
    "SELECT * FROM dsai-module-2-project.olist.order_items "
    "where order_id='ab14fdcfbe524636d65ee38360e22ce8' "
    "ORDER BY product_id, order_item_id"
)

df_results = client.query(query).to_dataframe()
df_results


 	

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,ab14fdcfbe524636d65ee38360e22ce8,1,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
1,ab14fdcfbe524636d65ee38360e22ce8,2,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
2,ab14fdcfbe524636d65ee38360e22ce8,3,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
3,ab14fdcfbe524636d65ee38360e22ce8,4,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
4,ab14fdcfbe524636d65ee38360e22ce8,5,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
5,ab14fdcfbe524636d65ee38360e22ce8,6,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
6,ab14fdcfbe524636d65ee38360e22ce8,7,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
7,ab14fdcfbe524636d65ee38360e22ce8,8,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
8,ab14fdcfbe524636d65ee38360e22ce8,9,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44
9,ab14fdcfbe524636d65ee38360e22ce8,10,9571759451b1d780ee7c15012ea109d4,ce27a3cc3c8cc1ea79d11e561e9bebb6,2017-08-30 14:30:23+00:00,98.7,14.44


In [10]:
# Orders that contain more than one line item (one result row per order_id)
query = (
    "SELECT order_id, count(order_item_id) as item_count "
    "FROM dsai-module-2-project.olist.order_items "
    "GROUP BY order_id "
    "HAVING item_count>1"
)

multi_item_job = client.query(query)
df_results = multi_item_job.to_dataframe()
df_results.shape

(9803, 2)

In [11]:
# (order_id, product_id) pairs that occur on more than one line item,
# largest repeat count first
query = (
    "SELECT order_id, product_id, count(order_item_id) as item_count "
    "FROM dsai-module-2-project.olist.order_items "
    "GROUP BY order_id, product_id "
    "HAVING item_count>1 "
    "ORDER BY item_count DESC"
)

repeat_job = client.query(query)
df_results = repeat_job.to_dataframe()
df_results

Unnamed: 0,order_id,product_id,item_count
0,ab14fdcfbe524636d65ee38360e22ce8,9571759451b1d780ee7c15012ea109d4,20
1,1b15974a0141d54e36626dca3fdc731a,ee3d532c8a438679776d222e997606b3,20
2,9ef13efd6949e4573a18964dd1bbe7f5,37eb69aca8718e843d897aa7b82f462d,15
3,428a2f660dc84138d969ccd69a0ab6d5,89b190a046022486c635022524a974a8,15
4,9bdc4d4c71aa1de4606060929dee888c,44a5d24dd383324a421569ca697b13c2,14
...,...,...,...
7083,877a74a0ee5c1f719f39f4f11d2fc19a,86fb7ef6ee1ffa3832f09b1a1552e7e6,2
7084,6fd59e3ae7e24c50131f6bc97c4c7776,588531f8ec37e7d5ff5b7b22ea0488f8,2
7085,cb418bb4b6115438d20b0141ed5bc9e0,e74384c9363e768848854356a37c73f4,2
7086,66b9c991ee308f9342f6a7f63bb68251,19936fa4f614ee0590d3b77ac83fd648,2


In [12]:
# Pair every order item with the category name of its product
# (products joined to order_items on product_id).
# NOTE(review): the result is stored in df_products, the same name that
# section 3.2 later rebinds to the raw products table.
query = (
    "SELECT order_id, order_item_id, A.product_id, A.product_category_name "
    "FROM dsai-module-2-project.olist.products A, "
    "dsai-module-2-project.olist.order_items B "
    "where A.product_id = B.product_id"
)

# Execute the query, then materialise the rows as a DataFrame
category_job = client.query(query)
df_products = category_job.to_dataframe()

df_products

Unnamed: 0,order_id,order_item_id,product_id,product_category_name
0,101157d4fae1c9fb74a00a5dee265c25,1,5eb564652db742ff8f28759cd8d2652a,
1,1521c6bb7b1028154c8c67cf80fa809f,1,5eb564652db742ff8f28759cd8d2652a,
2,415cfaaaa8cea49f934470548797fed1,1,5eb564652db742ff8f28759cd8d2652a,
3,415cfaaaa8cea49f934470548797fed1,2,5eb564652db742ff8f28759cd8d2652a,
4,6f497c40431d5fb0cfbd6c943dd29215,1,5eb564652db742ff8f28759cd8d2652a,
...,...,...,...,...
112645,e1000df659259b29cb73a4798ccd7454,1,b17808303e15dd50538c011b44295427,cama_mesa_banho
112646,e5c0d192e1f883b21e7673f73cb644b9,1,b17808303e15dd50538c011b44295427,cama_mesa_banho
112647,0a0090ae69392fa38ee742006f8c0a90,1,b17808303e15dd50538c011b44295427,cama_mesa_banho
112648,4ae136c7611e6052ec06035442490f24,1,b17808303e15dd50538c011b44295427,cama_mesa_banho


In [13]:
# Summary statistics for the price column
df_order_items["price"].describe()

count    112650.000000
mean        120.653739
std         183.633928
min           0.850000
25%          39.900000
50%          74.990000
75%         134.900000
max        6735.000000
Name: price, dtype: float64

In [14]:
# Show the most expensive order item(s). The maximum is looked up from the data
# rather than hard-coded, so the cell keeps working if the table changes.
max_price = df_order_items["price"].max()
df_order_items[df_order_items["price"] == max_price]

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
112649,0812eb902a67711a1cb742b3cdaa65ae,1,489ae2aa008f021502940f251d4cce7f,e3b4998c7a498169dc7bce44e6bb6277,2017-02-16 20:37:36+00:00,6735.0,194.31


In [15]:
# Summary statistics for the shipping_limit_date column
df_order_items["shipping_limit_date"].describe()

count                              112650
mean     2018-01-07 15:36:52.192685+00:00
min             2016-09-19 00:15:34+00:00
25%      2017-09-20 20:57:27.500000+00:00
50%             2018-01-26 13:59:35+00:00
75%      2018-05-10 14:34:00.750000+00:00
max             2020-04-09 22:35:08+00:00
Name: shipping_limit_date, dtype: object

In [16]:
# Parse the column with pd.to_datetime and write it back onto the frame
df_order_items['shipping_limit_date'] = pd.to_datetime(df_order_items['shipping_limit_date'])
shipping_dates = df_order_items['shipping_limit_date']

# Hand-built summary statistics for the timestamp column
count = shipping_dates.count()                # non-null entries
unique = shipping_dates.nunique()             # distinct values
top = shipping_dates.mode()[0]                # first entry of the mode result
freq = shipping_dates.value_counts().iloc[0]  # count of the most frequent value
first = shipping_dates.min()                  # earliest timestamp
last = shipping_dates.max()                   # latest timestamp

# Emit one "Label: value" line per statistic
print(
    f"Count: {count}",
    f"Unique: {unique}",
    f"Top: {top}",
    f"Freq: {freq}",
    f"First: {first}",
    f"Last: {last}",
    sep="\n",
)

Count: 112650
Unique: 93318
Top: 2017-07-21 18:25:23+00:00
Freq: 21
First: 2016-09-19 00:15:34+00:00
Last: 2020-04-09 22:35:08+00:00


## 3.2 Investigate Products table:
### Findings:
    a) 610 records have null values for product_category_name, product_name_lenght, product_description_lenght, product_photos_qty

In [17]:
# Pull the complete products table
query = "SELECT * FROM dsai-module-2-project.olist.products"

# Execute the query, then materialise the rows as a DataFrame
products_job = client.query(query)
df_products = products_job.to_dataframe()

df_products

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,a0ab96e461d74537772b84950f26a257,climatizacao,41,717,1,1050,18,7,8
1,20ae7c024ede613f47e0d2f23f461493,telefonia_fixa,25,455,1,330,17,11,9
2,4d7585daba2f8b3ed7f87447908b4237,telefonia_fixa,53,897,2,300,15,8,9
3,ad7aebed205805125489f8a89819b24b,construcao_ferramentas_ferramentas,41,2526,2,1150,22,10,9
4,980ecbcc15fe174ec1e5757c4d75b1bf,agro_industria_e_comercio,48,157,1,250,17,3,10
...,...,...,...,...,...,...,...,...,...
32946,e7e460583f2b78939dee0f8cd9a619ed,utilidades_domesticas,59,586,6,12250,80,10,76
32947,b92268ad64d3947c80fddbc3a82de845,utilidades_domesticas,55,134,1,8850,39,12,78
32948,4047f50df2141e2ab6c747b16b6a7bf6,utilidades_domesticas,40,564,3,30000,90,30,80
32949,66a4ac0f9c73e92afc9cb7566977ef83,utilidades_domesticas,58,547,5,1000,18,12,82


In [18]:
# Column dtypes and non-null counts for the products table
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   product_id                  32951 non-null  object
 1   product_category_name       32341 non-null  object
 2   product_name_lenght         32341 non-null  Int64 
 3   product_description_lenght  32341 non-null  Int64 
 4   product_photos_qty          32341 non-null  Int64 
 5   product_weight_g            32949 non-null  Int64 
 6   product_length_cm           32949 non-null  Int64 
 7   product_height_cm           32949 non-null  Int64 
 8   product_width_cm            32949 non-null  Int64 
dtypes: Int64(7), object(2)
memory usage: 2.5+ MB


In [19]:
# Summary statistics for the products table
df_products.describe()

Unnamed: 0,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
count,32341.0,32341.0,32341.0,32949.0,32949.0,32949.0,32949.0
mean,48.476949,771.495285,2.188986,2276.472488,30.815078,16.937661,23.196728
std,10.245741,635.115225,1.736766,4282.038731,16.914458,13.637554,12.079047
min,5.0,4.0,1.0,0.0,7.0,2.0,6.0
25%,42.0,339.0,1.0,300.0,18.0,8.0,15.0
50%,51.0,595.0,1.0,700.0,25.0,13.0,20.0
75%,57.0,972.0,3.0,1900.0,38.0,21.0,30.0
max,76.0,3992.0,20.0,40425.0,105.0,105.0,118.0


In [24]:
# Number of products in each category
product_categories = df_products["product_category_name"]
category_counts = product_categories.value_counts()
category_counts

product_category_name
cama_mesa_banho                  3029
esporte_lazer                    2867
moveis_decoracao                 2657
beleza_saude                     2444
utilidades_domesticas            2335
                                 ... 
fashion_roupa_infanto_juvenil       5
casa_conforto_2                     5
pc_gamer                            3
seguros_e_servicos                  2
cds_dvds_musicais                   1
Name: count, Length: 73, dtype: int64

In [29]:
# Fully duplicated product rows (every column equal to an earlier row)
df_products[df_products.duplicated()]


Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm


In [30]:
# Missing-value count for every column of the products table
null_values = df_products.isna().sum()

# Show the per-column counts
print(null_values)

product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
dtype: int64


In [32]:
# Boolean mask marking products without a category name
is_null = df_products["product_category_name"].isna()

# Display only the products selected by the mask
df_products.loc[is_null]

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
2454,5eb564652db742ff8f28759cd8d2652a,,,,,,,,
2455,e10758160da97891c2fdcbc35f0f031d,,,,,2200,16,2,11
2456,39e3b9b12cd0bf8ee681bbc1c130feb5,,,,,300,16,7,11
2457,bcb815bba008d89458e428078c0b9211,,,,,150,16,2,11
2458,212cc0fa7359ab242a697a03a574f719,,,,,200,16,2,11
...,...,...,...,...,...,...,...,...,...
3059,6962734c72522e70e852a2a77d21a730,,,,,10050,62,6,62
3060,ceeba7d5636e59173cc5f484e913db3d,,,,,30000,65,65,65
3061,b0a0c5dd78e644373b199380612c350a,,,,,1800,30,20,70
3062,7167af17015615b513d5b429758969a2,,,,,21100,38,42,72


In [33]:

# Boolean mask marking repeat occurrences of a product_id
is_dup = df_products["product_id"].duplicated()

# Display the rows whose product_id was already seen in an earlier row
df_products.loc[is_dup]

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm


## 3.3 Investigate Sellers table:
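### Findings:

a) The Sellers table has no null values, no fully duplicated rows and no duplicated seller_id.

b) seller_city is entered inconsistently (e.g. "sao pauo", "sao paulop", "sao paulo / sao paulo", "sp / sp", "04482255"), so it needs cleaning before sellers are grouped by city.

Note: the finding that 610 records have null values for product_category_name, product_name_lenght, product_description_lenght, product_photos_qty applies to the Products table (section 3.2), not to Sellers.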



In [35]:
# Pull the complete sellers table
query = "SELECT * FROM dsai-module-2-project.olist.sellers"

# Execute the query, then materialise the rows as a DataFrame
sellers_job = client.query(query)
df_sellers = sellers_job.to_dataframe()

df_sellers

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,c13ef0cfbe42f190780f621ce81f2234,1207,sao paulo sp,SP
1,5444b12c82f21c923f2639ebc722c1ea,2051,sao pauo,SP
2,1cbd32d00d01bb8087a5eb088612fd9c,3363,sp / sp,SP
3,71593c7413973a1e160057b80d4958f6,3407,sao paulo / sao paulo,SP
4,6f1a1263039c76e68f40a8e536b1da6a,3581,sao paulop,SP
...,...,...,...,...
3090,0aea4c6ae1505b3228ddf3dd7822ee5b,18080,sorocaba,SP
3091,c4f7fee5b0db50e87766f5a4d1b1b758,18085,sorocaba,SP
3092,3771c85bac139d2344864ede5d9341e3,18087,sorocaba,SP
3093,6e4a902d1054e4e17aa6eab87fac1c75,18090,sorocaba,SP


In [36]:
# Column dtypes and non-null counts for the sellers table
df_sellers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3095 entries, 0 to 3094
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   seller_id               3095 non-null   object
 1   seller_zip_code_prefix  3095 non-null   Int64 
 2   seller_city             3095 non-null   object
 3   seller_state            3095 non-null   object
dtypes: Int64(1), object(3)
memory usage: 99.9+ KB


In [37]:
# Summary statistics for the sellers table
df_sellers.describe()

Unnamed: 0,seller_zip_code_prefix
count,3095.0
mean,32291.059451
std,32713.45383
min,1001.0
25%,7093.5
50%,14940.0
75%,64552.5
max,99730.0


In [38]:
# Fully duplicated seller rows (every column equal to an earlier row)
df_sellers[df_sellers.duplicated()]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state


In [40]:
# Missing-value count for every column of the sellers table
null_values = df_sellers.isna().sum()

# Show the per-column counts
print(null_values)

seller_id                 0
seller_zip_code_prefix    0
seller_city               0
seller_state              0
dtype: int64


In [45]:
# Boolean mask marking repeat occurrences of a seller_id
is_dup = df_sellers["seller_id"].duplicated()

# Display the rows whose seller_id was already seen in an earlier row
df_sellers.loc[is_dup]

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state


In [47]:
# Number of sellers per seller_city value
seller_cities = df_sellers["seller_city"]
city_count = seller_cities.value_counts()

city_count

seller_city
sao paulo                  694
curitiba                   127
rio de janeiro              96
belo horizonte              68
ribeirao preto              52
                          ... 
congonhas                    1
prados                       1
barbacena                    1
barbacena/ minas gerais      1
sao paulo sp                 1
Name: count, Length: 611, dtype: int64

In [48]:
# Number of sellers per seller_state value
seller_states = df_sellers["seller_state"]
state_count = seller_states.value_counts()

state_count

seller_state
SP    1849
PR     349
MG     244
SC     190
RJ     171
RS     129
GO      40
DF      30
ES      23
BA      19
CE      13
PE       9
PB       6
MS       5
RN       5
MT       4
SE       2
RO       2
AM       1
AC       1
PI       1
PA       1
MA       1
Name: count, dtype: int64

In [52]:
# Mask of sellers whose city contains "sao"; matching ignores case and
# missing city values count as no match
has_sao = df_sellers["seller_city"].str.contains("sao", case=False, na=False)
df_sellers_filtered = df_sellers[has_sao]

# Display the matching sellers
df_sellers_filtered

Unnamed: 0,seller_id,seller_zip_code_prefix,seller_city,seller_state
0,c13ef0cfbe42f190780f621ce81f2234,1207,sao paulo sp,SP
1,5444b12c82f21c923f2639ebc722c1ea,2051,sao pauo,SP
3,71593c7413973a1e160057b80d4958f6,3407,sao paulo / sao paulo,SP
4,6f1a1263039c76e68f40a8e536b1da6a,3581,sao paulop,SP
5,06579cb253ecd5a3a12a9e6eb6bf8f47,4007,sao paulo - sp,SP
...,...,...,...,...
3058,778323240ce2830d68aab11794e00bfb,13600,sao paulo,SP
3059,dace965ca58120f92f8d742a9fa1864b,14015,sao paulo,SP
3060,761681a821d8275bc79f552116d06869,17606,sao paulo,SP
3061,a64e44665225d19dfc0277eeeaaccc57,19400,sao paulo,SP


In [60]:
# Seller counts per seller_city value, sorted by city name
city = df_sellers["seller_city"].value_counts().sort_index()

# Lift the row/column display limits only while printing this listing.
# option_context restores the previous settings on exit, so later cells are
# not left dumping complete frames (pd.set_option changed them globally).
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(city)

seller_city
04482255                                      1
abadia de goias                               1
afonso claudio                                1
aguas claras df                               1
alambari                                      1
alfenas                                       2
almirante tamandare                           1
alvares machado                               1
alvorada                                      1
americana                                    10
amparo                                        5
ampere                                        1
anapolis                                      3
andira-pr                                     1
andradas                                      2
angra dos reis                                1
angra dos reis rj                             1
ao bernardo do campo                          1
aparecida                                     1
aparecida de goiania                          1
aperibe                     