# 1. Install the packages

In [None]:
pip install google-cloud-bigquery
pip install google-cloud-bigquery-storage

# 2. Connection to Olist on BigQuery

In [20]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account

# Path to your service account key JSON file
key_path = "/Users/wenyilee/Desktop/dsai-module-2-project-25282646cfd3.json"
        
# Create credentials from service account key
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/bigquery"]
)

# Create BigQuery client
client = bigquery.Client(
    credentials=credentials,
    project=credentials.project_id
)

# 3.1 Observation for orders

In [43]:
# Define the SQL query
query = "SELECT * FROM dsai-module-2-project.olist.orders"

# Run the query
query_job = client.query(query)

# Wait for the query to finish and fetch the results
results = query_job.result()

# Run the query and fetch the results into a DataFrame
df_orders = client.query(query).to_dataframe()

# Display basic information about the DataFrame
print("Dataset Info:")
print(df_orders.info())
print("\nFirst 5 rows:")
print(df_orders.head())
print("\nSummary statistics:")
print(df_orders.describe(include='all'))



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype              
---  ------                         --------------  -----              
 0   order_id                       99441 non-null  object             
 1   customer_id                    99441 non-null  object             
 2   order_status                   99441 non-null  object             
 3   order_purchase_timestamp       99441 non-null  datetime64[us, UTC]
 4   order_approved_at              99281 non-null  datetime64[us, UTC]
 5   order_delivered_carrier_date   97658 non-null  datetime64[us, UTC]
 6   order_delivered_customer_date  96476 non-null  datetime64[us, UTC]
 7   order_estimated_delivery_date  99441 non-null  datetime64[us, UTC]
dtypes: datetime64[us, UTC](5), object(3)
memory usage: 6.1+ MB
None

First 5 rows:
                           order_id                       customer_id  \
0

In [44]:
# Check for missing values
print("Missing Values:")
print(df_orders.isnull().sum())

Missing Values:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64


In [45]:
# Check for duplicate rows
print("Duplicate rows:", df_orders.duplicated().sum())

# Check for duplicate order_ids
print("Duplicate order_ids:", df_orders['order_id'].duplicated().sum())

Duplicate rows: 0
Duplicate order_ids: 0


In [None]:
# Check order_status for missing order_approved_at
print("Order Status for missing order_approved_at:")
print(df_orders[df_orders['order_approved_at'].isnull()]['order_status'].value_counts())

# When orders are cancelled, prior to approval, there will be no order_approved_at
# When orders are created. approval might not have been posted yet at the point of data extraction.
# However, when orders have been delivered, it does not make logical sense that it is not approved yet.

Order Status for missing order_approved_at:
order_status
canceled     141
delivered     14
created        5
Name: count, dtype: int64


In [24]:
# Check order_status for missing order_delivered_customer_date
print("Order Status for missing order_delivered_customer_date:")
print(df_orders[df_orders['order_delivered_customer_date'].isnull()]['order_status'].value_counts())

# Check order_status for missing order_delivered_carrier_date
print("Order Status for missing order_delivered_carrier_date:")
print(df_orders[df_orders['order_delivered_carrier_date'].isnull()]['order_status'].value_counts())

Order Status for missing order_delivered_customer_date:
order_status
shipped        1107
canceled        619
unavailable     609
invoiced        314
processing      301
delivered         8
created           5
approved          2
Name: count, dtype: int64
Order Status for missing order_delivered_carrier_date:
order_status
unavailable    609
canceled       550
invoiced       314
processing     301
created          5
approved         2
delivered        2
Name: count, dtype: int64


In [27]:
# Check unique values in order_status
print("Unique order_status values:")
print(df_orders['order_status'].value_counts())

Unique order_status values:
order_status
delivered      96478
shipped         1107
canceled         625
unavailable      609
invoiced         314
processing       301
created            5
approved           2
Name: count, dtype: int64


In [36]:
# Identify delivered orders with missing values
df_delivered_missing_values = df_orders[
    (df_orders['order_status'] == 'delivered') & 
    (
        df_orders['order_approved_at'].isnull() | 
        df_orders['order_delivered_customer_date'].isnull() | 
        df_orders['order_delivered_carrier_date'].isnull()
    )
]

# Confirm the number of orders
df_delivered_missing_values

#Total of 23 rows with missing (order_approved_at / order_delivered_carrier_date / order_delivered_customer_date)

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
1225,c1d4211b3dae76144deccd6c74144a88,684cb238dc5b5d6366244e0e0776b450,delivered,2017-01-19 12:48:08+00:00,NaT,2017-01-25 14:56:50+00:00,2017-01-30 18:16:01+00:00,2017-03-01 00:00:00+00:00
2270,7002a78c79c519ac54022d4f8a65e6e8,d5de688c321096d15508faae67a27051,delivered,2017-01-19 22:26:59+00:00,NaT,2017-01-27 11:08:05+00:00,2017-02-06 14:22:19+00:00,2017-03-16 00:00:00+00:00
2334,e04abd8149ef81b95221e88f6ed9ab6a,2127dc6603ac33544953ef05ec155771,delivered,2017-02-18 14:40:00+00:00,NaT,2017-02-23 12:04:47+00:00,2017-03-01 13:25:33+00:00,2017-03-17 00:00:00+00:00
2353,7013bcfc1c97fe719a7b5e05e61c12db,2941af76d38100e0f8740a374f1a5dc3,delivered,2017-02-18 13:29:47+00:00,NaT,2017-02-22 16:25:25+00:00,2017-03-01 08:07:38+00:00,2017-03-17 00:00:00+00:00
2488,12a95a3c06dbaec84bcfb0e2da5d228a,1e101e0daffaddce8159d25a8e53f2b2,delivered,2017-02-17 13:05:55+00:00,NaT,2017-02-22 11:23:11+00:00,2017-03-02 11:09:19+00:00,2017-03-20 00:00:00+00:00
2553,2eecb0d85f281280f79fa00f9cec1a95,a3d3c38e58b9d2dfb9207cab690b6310,delivered,2017-02-17 17:21:55+00:00,NaT,2017-02-22 11:42:51+00:00,2017-03-03 12:16:03+00:00,2017-03-20 00:00:00+00:00
2635,8a9adc69528e1001fc68dd0aaebbb54a,4c1ccc74e00993733742a3c786dc3c1f,delivered,2017-02-18 12:45:31+00:00,NaT,2017-02-23 09:01:52+00:00,2017-03-02 10:05:06+00:00,2017-03-21 00:00:00+00:00
2716,88083e8f64d95b932164187484d90212,f67cd1a215aae2a1074638bbd35a223a,delivered,2017-02-18 22:49:19+00:00,NaT,2017-02-22 11:31:06+00:00,2017-03-02 12:06:06+00:00,2017-03-21 00:00:00+00:00
2833,d77031d6a3c8a52f019764e68f211c69,0bf35cac6cc7327065da879e2d90fae8,delivered,2017-02-18 11:04:19+00:00,NaT,2017-02-23 07:23:36+00:00,2017-03-02 16:15:23+00:00,2017-03-22 00:00:00+00:00
3037,3c0b8706b065f9919d0505d3b3343881,d85919cb3c0529589c6fa617f5f43281,delivered,2017-02-17 15:53:27+00:00,NaT,2017-02-22 11:31:30+00:00,2017-03-03 11:47:47+00:00,2017-03-23 00:00:00+00:00


In [35]:
# Check non-delivered orders with order_delivered_customer_date present
df_non_delivered_with_delivery = df_orders[(df_orders['order_status'] != 'delivered') & (df_orders['order_delivered_customer_date'].notnull())]
df_non_delivered_with_delivery

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
16,65d1e226dfaeb8cdc42f665422522d14,70fc57eeae292675927697fe03ad3ff5,canceled,2016-10-03 21:01:41+00:00,2016-10-04 10:18:57+00:00,2016-10-25 12:14:28+00:00,2016-11-08 10:58:34+00:00,2016-11-25 00:00:00+00:00
17,770d331c84e5b214bd9dc70a10b829d0,6c57e6119369185e575b36712766b0ef,canceled,2016-10-07 14:52:30+00:00,2016-10-07 15:07:10+00:00,2016-10-11 15:07:11+00:00,2016-10-14 15:07:11+00:00,2016-11-29 00:00:00+00:00
18,dabf2b0e35b423f94618bf965fcb7514,5cdec0bb8cbdf53ffc8fdc212cd247c6,canceled,2016-10-09 00:56:52+00:00,2016-10-09 13:36:58+00:00,2016-10-13 13:36:59+00:00,2016-10-16 14:36:59+00:00,2016-11-30 00:00:00+00:00
19,8beb59392e21af5eb9547ae1a9938d06,bf609b5741f71697f65ce3852c5d2623,canceled,2016-10-08 20:17:50+00:00,2016-10-09 14:34:30+00:00,2016-10-14 22:45:26+00:00,2016-10-19 18:47:43+00:00,2016-11-30 00:00:00+00:00
23,2c45c33d2f9cb8ff8b1c86cc28c11c30,de4caa97afa80c8eeac2ff4c8da5b72e,canceled,2016-10-09 15:39:56+00:00,2016-10-10 10:40:49+00:00,2016-10-14 10:40:50+00:00,2016-11-09 14:53:50+00:00,2016-12-08 00:00:00+00:00
358,1950d777989f6a877539f53795b4c3c3,1bccb206de9f0f25adc6871a1bcf77b2,canceled,2018-02-19 19:48:52+00:00,2018-02-19 20:56:05+00:00,2018-02-20 19:57:13+00:00,2018-03-21 22:03:51+00:00,2018-03-09 00:00:00+00:00


In [42]:
# Define conditions for illogical timestamps
illogical_timestamps = (
    # order_approved_at after order_delivered_carrier_date
    (df_orders['order_approved_at'].notnull() & 
     df_orders['order_delivered_carrier_date'].notnull() & 
     (df_orders['order_approved_at'] > df_orders['order_delivered_carrier_date'])) |
    # order_delivered_customer_date before order_delivered_carrier_date
    (df_orders['order_delivered_customer_date'].notnull() & 
     df_orders['order_delivered_carrier_date'].notnull() & 
     (df_orders['order_delivered_customer_date'] < df_orders['order_delivered_carrier_date'])) |
    # order_delivered_customer_date before order_purchase_timestamp
    (df_orders['order_delivered_customer_date'].notnull() & 
     (df_orders['order_delivered_customer_date'] < df_orders['order_purchase_timestamp'])) |
    # order_estimated_delivery_date before order_purchase_timestamp
    (df_orders['order_estimated_delivery_date'] < df_orders['order_purchase_timestamp']) |
    # order_estimated_delivery_date before order_approved_at
    (df_orders['order_approved_at'].notnull() & 
     (df_orders['order_estimated_delivery_date'] < df_orders['order_approved_at'])) |
    # Non-delivered orders with order_delivered_customer_date
    ((df_orders['order_status'] != 'delivered') & 
     (df_orders['order_delivered_customer_date'].notnull()))
)

# Total rows with illogical timestamps
print("Total rows with illogical timestamps:", illogical_timestamps.sum())

# Break down each type of issue
print("\nBreakdown of Illogical Timestamps:")
# 1. order_approved_at after order_delivered_carrier_date
approved_after_carrier = df_orders[
    df_orders['order_approved_at'].notnull() & 
    df_orders['order_delivered_carrier_date'].notnull() & 
    (df_orders['order_approved_at'] > df_orders['order_delivered_carrier_date'])
]
print("Orders with order_approved_at after order_delivered_carrier_date:", len(approved_after_carrier))

# 2. order_delivered_customer_date before order_delivered_carrier_date
customer_before_carrier = df_orders[
    df_orders['order_delivered_customer_date'].notnull() & 
    df_orders['order_delivered_carrier_date'].notnull() & 
    (df_orders['order_delivered_customer_date'] < df_orders['order_delivered_carrier_date'])
]
print("Orders with order_delivered_customer_date before order_delivered_carrier_date:", len(customer_before_carrier))

# 3. order_delivered_customer_date before order_purchase_timestamp
customer_before_purchase = df_orders[
    df_orders['order_delivered_customer_date'].notnull() & 
    (df_orders['order_delivered_customer_date'] < df_orders['order_purchase_timestamp'])
]
print("Orders with order_delivered_customer_date before order_purchase_timestamp:", len(customer_before_purchase))

# 4. order_estimated_delivery_date before order_purchase_timestamp
estimated_before_purchase = df_orders[
    df_orders['order_estimated_delivery_date'] < df_orders['order_purchase_timestamp']
]
print("Orders with order_estimated_delivery_date before order_purchase_timestamp:", len(estimated_before_purchase))

# 5. order_estimated_delivery_date before order_approved_at
estimated_before_approved = df_orders[
    df_orders['order_approved_at'].notnull() & 
    (df_orders['order_estimated_delivery_date'] < df_orders['order_approved_at'])
]
print("Orders with order_estimated_delivery_date before order_approved_at:", len(estimated_before_approved))

# 6. Non-delivered orders with order_delivered_customer_date
non_delivered_with_delivery = df_orders[
    (df_orders['order_status'] != 'delivered') & 
    (df_orders['order_delivered_customer_date'].notnull())
]
print("Non-delivered orders with order_delivered_customer_date:", len(non_delivered_with_delivery))

Total rows with illogical timestamps: 1395

Breakdown of Illogical Timestamps:
Orders with order_approved_at after order_delivered_carrier_date: 1359
Orders with order_delivered_customer_date before order_delivered_carrier_date: 23
Orders with order_delivered_customer_date before order_purchase_timestamp: 0
Orders with order_estimated_delivery_date before order_purchase_timestamp: 0
Orders with order_estimated_delivery_date before order_approved_at: 12
Non-delivered orders with order_delivered_customer_date: 6


# 3.2 Observation for order_payments

In [60]:
# Define the SQL query
query = "SELECT * FROM dsai-module-2-project.olist.payment"

# Run the query
query_job = client.query(query)

# Wait for the query to finish and fetch the results
results = query_job.result()

# Run the query and fetch the results into a DataFrame
df_payment = client.query(query).to_dataframe()

# Display basic information about the DataFrame
print("Dataset Info:")
print(df_payment.info())
print("\nFirst 5 rows:")
print(df_payment.head())
print("\nSummary statistics:")
print(df_payment.describe(include='all'))



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  Int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  Int64  
 4   payment_value         103886 non-null  float64
dtypes: Int64(2), float64(1), object(2)
memory usage: 4.2+ MB
None

First 5 rows:
                           order_id  payment_sequential payment_type  \
0  744bade1fcf9ff3f31d860ace076d422                   2  credit_card   
1  1a57108394169c0b47d8f876acc9ba2d                   2  credit_card   
2  8bcbe01d44d147f901cd3192671144db                   4      voucher   
3  fa65dad1b0e818e3ccc5cb0e39231352                  14      voucher   
4  6ccb433e00daae1283ccc956189c82ae                   4      voucher  

In [61]:
# Check for missing values in each column
print("Missing Values in Each Column:")
print(df_payment.isnull().sum())

# Check the percentage of missing values
print("\nPercentage of Missing Values in Each Column:")
print((df_payment.isnull().sum() / len(df_payment) * 100).round(2))

Missing Values in Each Column:
order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

Percentage of Missing Values in Each Column:
order_id                0.0
payment_sequential      0.0
payment_type            0.0
payment_installments    0.0
payment_value           0.0
dtype: float64


In [62]:
# Check for duplicate order_id and payment_sequential combinations
duplicate_combinations = df_payment.duplicated(subset=['order_id', 'payment_sequential']).sum()
print(f"Number of Duplicate order_id and payment_sequential Combinations: {duplicate_combinations}")

# If duplicates exist, display a few examples
if duplicate_combinations > 0:
    print("\nSample of Duplicate Combinations:")
    print(df_payment[df_payment.duplicated(subset=['order_id', 'payment_sequential'])].head())

Number of Duplicate order_id and payment_sequential Combinations: 0


In [64]:
# Check unique values in payment_type
print("Unique Values in payment_type:")
print(df_payment['payment_type'].unique())

# Check the distribution of payment_type
print("\nDistribution of payment_type:")
print(df_payment['payment_type'].value_counts())

Unique Values in payment_type:
['credit_card' 'voucher' 'not_defined' 'boleto' 'debit_card']

Distribution of payment_type:
payment_type
credit_card    76795
boleto         19784
voucher         5775
debit_card      1529
not_defined        3
Name: count, dtype: int64


In [66]:
# Filter rows where payment_type is 'not_defined'
df_not_defined_rows = df_payment[df_payment['payment_type'] == 'not_defined']

df_not_defined_rows 

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
5,4637ca194b6387e2d538dc89b124b0ee,1,not_defined,1,0.0
6,00b1cb0320190ca0daa2c88b35206009,1,not_defined,1,0.0
9,c8c528189310eaa44a745b8d9d26908b,1,not_defined,1,0.0


In [82]:
query_orders = "SELECT order_id, order_status FROM dsai-module-2-project.olist.orders"
df_orders_orders = client.query(query_orders).to_dataframe()

# Merge the not_defined_rows with the orders dataset
df_not_defined_with_orders = not_defined_rows.merge(df_orders_orders, on='order_id', how='left')

df_not_defined_with_orders

# not_defined payment type likely indicate a failed and cancelled transaction



Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value,order_status
0,4637ca194b6387e2d538dc89b124b0ee,1,not_defined,1,0.0,canceled
1,00b1cb0320190ca0daa2c88b35206009,1,not_defined,1,0.0,canceled
2,c8c528189310eaa44a745b8d9d26908b,1,not_defined,1,0.0,canceled


In [None]:
# Check the range of payment_installments
print("Summary of payment_installments:")
print(df_payment['payment_installments'].describe())

# Check for invalid values (e.g., negative or zero)
invalid_installments = df_payment[df_payment['payment_installments'] <= 0]
print(f"\nNumber of Rows with payment_installments <= 0: {len(invalid_installments)}")
if len(invalid_installments) > 0:
    print("\nSample of Rows with payment_installments <= 0:")
    print(invalid_installments.head())

# Check payment_installments by payment_type
print("\nPayment Installments by Payment Type:")
print(df_payment.groupby('payment_type')['payment_installments'].describe())

#Logically incorrectly to have payment installments lesser than 1. 
#It would represent a failed deduction. 

Summary of payment_installments:
count    103886.0
mean     2.853349
std      2.687051
min           0.0
25%           1.0
50%           1.0
75%           4.0
max          24.0
Name: payment_installments, dtype: Float64

Number of Rows with payment_installments <= 0: 2

Sample of Rows with payment_installments <= 0:
                           order_id  payment_sequential payment_type  \
0  744bade1fcf9ff3f31d860ace076d422                   2  credit_card   
1  1a57108394169c0b47d8f876acc9ba2d                   2  credit_card   

   payment_installments  payment_value  
0                     0          58.69  
1                     0         129.94  

Payment Installments by Payment Type:
                count      mean      std  min  25%  50%  75%   max
payment_type                                                      
boleto        19784.0       1.0      0.0  1.0  1.0  1.0  1.0   1.0
credit_card   76795.0  3.507155  2.85099  0.0  1.0  3.0  5.0  24.0
debit_card     1529.0       1.0    

In [75]:
# Check the range of payment_value
print("Summary of payment_value:")
print(df_payment['payment_value'].describe())

# Check for invalid values (e.g., negative or zero)
invalid_payment_value = df_payment[df_payment['payment_value'] <= 0]
print(f"\nNumber of Rows with payment_value <= 0: {len(invalid_payment_value)}")
if len(invalid_payment_value) > 0:
    print("\nSample of Rows with payment_value <= 0:")
    print(invalid_payment_value.head())

Summary of payment_value:
count    103886.000000
mean        154.100380
std         217.494064
min           0.000000
25%          56.790000
50%         100.000000
75%         171.837500
max       13664.080000
Name: payment_value, dtype: float64

Number of Rows with payment_value <= 0: 9

Sample of Rows with payment_value <= 0:
                           order_id  payment_sequential payment_type  \
2  8bcbe01d44d147f901cd3192671144db                   4      voucher   
3  fa65dad1b0e818e3ccc5cb0e39231352                  14      voucher   
4  6ccb433e00daae1283ccc956189c82ae                   4      voucher   
5  4637ca194b6387e2d538dc89b124b0ee                   1  not_defined   
6  00b1cb0320190ca0daa2c88b35206009                   1  not_defined   

   payment_installments  payment_value  
2                     1            0.0  
3                     1            0.0  
4                     1            0.0  
5                     1            0.0  
6                     1         

In [81]:
# Filter the invalid rows
df_invalid_payment_value = df_payment[df_payment['payment_value'] <= 0]

# Load the orders dataset 
query_orders = "SELECT order_id, order_status FROM dsai-module-2-project.olist.orders"
df_payment_orders = client.query(query_orders).to_dataframe()

# Merge with the orders dataset
df_invalid_with_orders = df_invalid_payment_value.merge(df_payment_orders, on='order_id', how='left')

df_invalid_with_orders



Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value,order_status
0,8bcbe01d44d147f901cd3192671144db,4,voucher,1,0.0,delivered
1,fa65dad1b0e818e3ccc5cb0e39231352,14,voucher,1,0.0,shipped
2,6ccb433e00daae1283ccc956189c82ae,4,voucher,1,0.0,delivered
3,4637ca194b6387e2d538dc89b124b0ee,1,not_defined,1,0.0,canceled
4,00b1cb0320190ca0daa2c88b35206009,1,not_defined,1,0.0,canceled
5,45ed6e85398a87c253db47c2d9f48216,3,voucher,1,0.0,delivered
6,fa65dad1b0e818e3ccc5cb0e39231352,13,voucher,1,0.0,shipped
7,c8c528189310eaa44a745b8d9d26908b,1,not_defined,1,0.0,canceled
8,b23878b3e8eb4d25a158f57d96331b18,4,voucher,1,0.0,delivered


# 3.3 Observation for customers

In [83]:
# Define the SQL query
query = "SELECT * FROM dsai-module-2-project.olist.customer"

# Run the query
query_job = client.query(query)

# Wait for the query to finish and fetch the results
results = query_job.result()

# Run the query and fetch the results into a DataFrame
df_customer = client.query(query).to_dataframe()

# Display basic information about the DataFrame
print("Dataset Info:")
print(df_customer.info())
print("\nFirst 5 rows:")
print(df_customer.head())
print("\nSummary statistics:")
print(df_customer.describe(include='all'))



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  Int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: Int64(1), object(4)
memory usage: 3.9+ MB
None

First 5 rows:
                        customer_id                customer_unique_id  \
0  2201362e68992f654942dc0067c1b716  f7d7fc0a59ef4363fdce6e3aa069d498   
1  31dbc13addc753e210692eacaea065e4  5dbba6c01268a8ad43f79157bf4454a0   
2  dad907e170748a35ef4e92238b7308f3  36b1c0516f123351ffa87430416dcae5   
3  888d2ebe1af2a8c93c75dae5dfc23719  721d1092e1a6460c67e6a0e691d899a3   
4  8a0108267d9258a0ec9f74381bc9b0de  7a2dc4682890550ebe3b8befcea3

In [84]:
# Check for missing values
print("Missing Values in Each Column:")
print(df_customer.isnull().sum())

Missing Values in Each Column:
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64


In [85]:
# Check for duplicate customer_id
duplicate_customer_ids = df_customer['customer_id'].duplicated().sum()
print(f"Number of Duplicate customer_id: {duplicate_customer_ids}")

# Check the distribution of customer_unique_id
unique_id_counts = df_customer['customer_unique_id'].value_counts()
print("\nDistribution of Orders per Customer (customer_unique_id):")
print(unique_id_counts.describe())

# Check for completely duplicate rows
duplicate_rows = df_customer.duplicated().sum()
print(f"\nNumber of Completely Duplicate Rows: {duplicate_rows}")

Number of Duplicate customer_id: 0

Distribution of Orders per Customer (customer_unique_id):
count    96096.000000
mean         1.034809
std          0.214384
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         17.000000
Name: count, dtype: float64

Number of Completely Duplicate Rows: 0


In [90]:
# Convert to string and check length
df_customer['customer_zip_code_prefix'] = df_customer['customer_zip_code_prefix'].astype(str)
invalid_zip_length = df_customer[~df_customer['customer_zip_code_prefix'].str.match(r'^\d{5}$')]
print(f"Number of Invalid customer_zip_code_prefix (not 5 digits): {len(invalid_zip_length)}")
if len(invalid_zip_length) > 0:
    print("\nSample of Invalid customer_zip_code_prefix:")
    print(invalid_zip_length.head())

# Check range (if numeric)
df_customer['customer_zip_code_prefix'] = df_customer['customer_zip_code_prefix'].astype(int)
invalid_zip_range = df_customer[(df_customer['customer_zip_code_prefix'] < 1000) | (df_customer['customer_zip_code_prefix'] > 99990)]
print(f"\nNumber of customer_zip_code_prefix Out of Range: {len(invalid_zip_range)}")
if len(invalid_zip_range) > 0:
    print("\nSample of Out-of-Range customer_zip_code_prefix:")
    print(invalid_zip_range.head())

Number of Invalid customer_zip_code_prefix (not 5 digits): 23995

Sample of Invalid customer_zip_code_prefix:
                            customer_id                customer_unique_id  \
57415  7ae2a9337aa4bc799723511faa1d6830  0c1a20644f0dc126c3eaff8dbc1bd12c   
57416  a09edf8c1e842e94805a206b3d73eed5  968f6d2f674977d88a4b445a5117ccd8   
57417  ee9b73e88afb4904ee2322cfc89cf638  095e7c124c5c1ccb1eb9f731152eae6a   
57418  5a8b64ee6ccdae09ea823e6aa00e9517  9c84e5193d6ee59b3870e0e4e3a2dad8   
57419  6ec2b4682814cfdac8d92bad42b3ddab  57f0ea1c7f6b9ef8615c0a0b8f06fe57   

      customer_zip_code_prefix customer_city customer_state  
57415                     1003     sao paulo             SP  
57416                     1004     sao paulo             SP  
57417                     1004     sao paulo             SP  
57418                     1005     sao paulo             SP  
57419                     1005     sao paulo             SP  

Number of customer_zip_code_prefix Out of Range: 0


In [88]:
# Convert customer_zip_code_prefix to string
df_customer['customer_zip_code_prefix'] = df_customer['customer_zip_code_prefix'].astype(str)

# Identify rows where customer_zip_code_prefix is not 5 digits
df_inconsistent_rows = df_customer[~df_customer['customer_zip_code_prefix'].str.match(r'^\d{5}$')]

df_inconsistent_rows

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
57415,7ae2a9337aa4bc799723511faa1d6830,0c1a20644f0dc126c3eaff8dbc1bd12c,1003,sao paulo,SP
57416,a09edf8c1e842e94805a206b3d73eed5,968f6d2f674977d88a4b445a5117ccd8,1004,sao paulo,SP
57417,ee9b73e88afb4904ee2322cfc89cf638,095e7c124c5c1ccb1eb9f731152eae6a,1004,sao paulo,SP
57418,5a8b64ee6ccdae09ea823e6aa00e9517,9c84e5193d6ee59b3870e0e4e3a2dad8,1005,sao paulo,SP
57419,6ec2b4682814cfdac8d92bad42b3ddab,57f0ea1c7f6b9ef8615c0a0b8f06fe57,1005,sao paulo,SP
...,...,...,...,...,...
81405,428db965aedf0c8c56f94d005539a9b0,97bc08e526795b17e1d4f642f77ae304,9993,diadema,SP
81406,fd04bf849b36444f719850585a9b0e8a,97bc08e526795b17e1d4f642f77ae304,9993,diadema,SP
81407,9b9024a27b845a8b50ef8d1b7ba89ee8,97bc08e526795b17e1d4f642f77ae304,9993,diadema,SP
81408,10091d0f711745db12815a7935577e26,a8bd559f5b029d6f96e3c9d134288dba,9993,diadema,SP


In [94]:
# Check unique values for customer_city and customer_state
print("Unique customer_city Values:")
print(df_customer['customer_city'].nunique())
print("\nSample of customer_city Values:")
print(df_customer['customer_city'].value_counts())

print("\nUnique customer_state Values:")
print(df_customer['customer_state'].unique())

# Check for invalid state codes
valid_states = ['AC', 'AL', 'AP', 'AM', 'BA', 'CE', 'DF', 'ES', 'GO', 'MA', 'MT', 'MS', 'MG', 'PA', 'PB', 'PR', 'PE', 'PI', 'RJ', 'RN', 'RS', 'RO', 'RR', 'SC', 'SP', 'SE', 'TO']
invalid_states = df_customer[~df_customer['customer_state'].isin(valid_states)]
print(f"\nNumber of Invalid customer_state Values: {len(invalid_states)}")
if len(invalid_states) > 0:
    print("\nSample of Invalid customer_state Values:")
    print(invalid_states.head())

Unique customer_city Values:
4119

Sample of customer_city Values:
customer_city
sao paulo              15540
rio de janeiro          6882
belo horizonte          2773
brasilia                2131
curitiba                1521
                       ...  
bugre                      1
cujubim                    1
candeias do jamari         1
mutum parana               1
buriti do tocantins        1
Name: count, Length: 4119, dtype: int64

Unique customer_state Values:
['AC' 'AL' 'AM' 'AP' 'BA' 'CE' 'DF' 'ES' 'GO' 'MA' 'MG' 'MS' 'MT' 'PA'
 'PB' 'PE' 'PI' 'PR' 'RJ' 'RN' 'RO' 'RR' 'RS' 'SC' 'SE' 'SP' 'TO']

Number of Invalid customer_state Values: 0
