### Load the data from customers, orders, order_reviews and order_pymts (other tables are not used in this notebook)

In [4]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

# sqlite3.connect() silently creates a new, empty database when the file is
# missing; the mistake would then only surface as a confusing
# "no such table" error. Fail early with an explicit message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found at {db_path}")

conn = sqlite3.connect(db_path)
try:
    # Load the tables into DataFrames
    customers = pd.read_sql_query("SELECT * FROM customers", conn)
    orders = pd.read_sql_query("SELECT * FROM orders", conn)
    order_reviews = pd.read_sql_query("SELECT * FROM order_reviews", conn)
    order_pymts = pd.read_sql_query("SELECT * FROM order_pymts", conn)
finally:
    # Always release the connection, even when one of the queries fails
    conn.close()

print("Data loaded successfully.")


Data loaded successfully.


### Merge the data from the different tables

In [9]:
# Merge the orders and customers tables.
# validate='many_to_one' makes pandas raise a MergeError if customer_id is
# duplicated in `customers`, which would otherwise silently duplicate orders.
orders_customers = pd.merge(
    orders,
    customers,
    on='customer_id',
    how='inner',
    suffixes=('_order', '_customer'),
    validate='many_to_one',
)

# Aggregate reviews by order_id: mean score and number of reviews per order
order_reviews_agg = order_reviews.groupby('order_id', as_index=False).agg(
    avg_review_score=('review_score', 'mean'),
    review_count=('review_id', 'count'),
)

# Aggregate payments by order_id: total amount paid and number of payment rows
order_pymts_agg = order_pymts.groupby('order_id', as_index=False).agg(
    total_payment_value=('payment_value', 'sum'),
    payment_count=('payment_sequential', 'count'),
)

# Merge the aggregated reviews with orders_customers.
# Left join: orders without any review are kept, with NaN review columns.
orders_customers_reviews = pd.merge(
    orders_customers,
    order_reviews_agg,
    on='order_id',
    how='left',
    validate='many_to_one',
)

# Merge the aggregated payments with orders_customers_reviews.
# Left join: orders without any payment are kept, with NaN payment columns.
df_final = pd.merge(
    orders_customers_reviews,
    order_pymts_agg,
    on='order_id',
    how='left',
    validate='many_to_one',
)

# Perform a control on the number of rows before and after merge
initial_row_count = len(orders)
final_row_count = len(df_final)
print(f"Initial row count in orders table: {initial_row_count}")
print(f"Final row count in df_final: {final_row_count}")

# Enforce the control instead of relying on someone reading the printed
# numbers: the merges must neither drop nor duplicate orders.
assert final_row_count == initial_row_count, (
    f"Row count changed during merge: {initial_row_count} -> {final_row_count}"
)

# Drop the customer_id column to avoid confusion (non-mutating drop, so the
# result does not depend on in-place changes to an intermediate frame)
df_final = df_final.drop(columns=['customer_id'])

# Use only 'customer_unique_id' to identify each customer: move it to the front
df_final = df_final[['customer_unique_id'] + [col for col in df_final.columns if col != 'customer_unique_id']]

print("Tables merged successfully, data aggregated, and 'customer_id' column removed.")
df_final.head()


Initial row count in orders table: 99441
Final row count in df_final: 99441
Tables merged successfully, data aggregated, and 'customer_id' column removed.


Unnamed: 0,customer_unique_id,index_order,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,index_customer,customer_zip_code_prefix,customer_city,customer_state,avg_review_score,review_count,total_payment_value,payment_count
0,7c396fd4830fd04220f754e42b4e5bff,0,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,70296,3149,sao paulo,SP,4.0,1.0,38.71,3.0
1,af07308b275d755c9edb36a90c618231,1,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,77027,47813,barreiras,BA,4.0,1.0,141.46,1.0
2,3a653a41f6f9fc3d2a113cf8398680e8,2,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,554,75265,vianopolis,GO,5.0,1.0,179.12,1.0
3,7c142cf63193a1473d2e66489a9ae977,3,949d5b44dbf5de918fe9c16f97b45f8a,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,61081,59296,sao goncalo do amarante,RN,5.0,1.0,72.2,1.0
4,72632f0f9dd73dfee390c9b22eb56dd6,4,ad21c59c0840e6cb83a9ceb5573f8159,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,67263,9195,santo andre,SP,5.0,1.0,28.62,1.0


### Save the Final DataFrame to a CSV File

In [10]:
# Destination folder and file for the merged dataset
output_dir = os.path.join('..', 'data')
output_path = os.path.join(output_dir, 'df_final.csv')

# Create the data directory when it is missing (no error if it already exists)
os.makedirs(name=output_dir, exist_ok=True)

# Write the final DataFrame as CSV, leaving out the pandas row index
df_final.to_csv(path_or_buf=output_path, index=False)
print(f"df_final saved to {output_path}")


df_final saved to ..\data\df_final.csv
