In [4]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    query = "SELECT COUNT(*) FROM orders;"
    try:
        data = pd.read_sql(query, conn)
        print("Total number of rows in orders table:", data.iloc[0, 0])
    except Exception as e:
        print("An error occurred:", e)
    finally:
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)



Connecting to the database...
Database connected.
Total number of rows in orders table: 99441
Database connection closed.


### Inspecting the Database Schema

In [7]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    
    # Create a connection to the database
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    try:
        # List all tables
        tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
        tables = pd.read_sql(tables_query, conn)
        print("Tables in the database:")
        print(tables)

        # Inspect schema of the relevant tables
        for table in tables['name']:
            schema_query = f"PRAGMA table_info({table});"
            schema = pd.read_sql(schema_query, conn)
            print(f"Schema of {table}:")
            print(schema)
    except Exception as e:
        print("An error occurred:", e)
    finally:
        # Close the connection
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)



Connecting to the database...
Database connected.
Tables in the database:
            name
0      customers
1         geoloc
2    order_items
3    order_pymts
4  order_reviews
5         orders
6       products
7        sellers
8    translation
Schema of customers:
   cid                      name    type  notnull dflt_value  pk
0    0                     index  BIGINT        0       None   0
1    1               customer_id    TEXT        0       None   0
2    2        customer_unique_id    TEXT        0       None   0
3    3  customer_zip_code_prefix  BIGINT        0       None   0
4    4             customer_city    TEXT        0       None   0
5    5            customer_state    TEXT        0       None   0
Schema of geoloc:
   cid                         name    type  notnull dflt_value  pk
0    0                        index  BIGINT        0       None   0
1    1  geolocation_zip_code_prefix  BIGINT        0       None   0
2    2              geolocation_lat   FLOAT        0      

### Query to Display a Few Rows and the Date Format

In [1]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    query = """
    SELECT 
        order_id,
        customer_id,
        order_status,
        order_purchase_timestamp
    FROM orders
    LIMIT 5;
    """
    try:
        data = pd.read_sql(query, conn)
        print("Sample rows from orders table:")
        print(data)  # Display sample rows to check date format
    except Exception as e:
        print("An error occurred:", e)
    finally:
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)


Connecting to the database...
Database connected.
Sample rows from orders table:
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp  
0    delivered      2017-10-02 10:56:33  
1    delivered      2018-07-24 20:41:37  
2    delivered      2018-08-08 08:38:49  
3    delivered      2017-11-18 19:28:06  
4    delivered      2018-02-13 21:18:39  
Database connection closed.


### Query to Display the Current Date in order_purchase_timestamp Format

In [3]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    query = """
    SELECT strftime('%Y-%m-%d %H:%M:%S', 'now') AS current_date;
    """
    try:
        data = pd.read_sql(query, conn)
        print("Current date (formatted):")
        print(data)  # Display the current date to ensure it's being calculated correctly
    except Exception as e:
        print("An error occurred:", e)
    finally:
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)


Connecting to the database...
Database connected.
Current date (formatted):
          current_date
0  2024-07-17 07:52:15
Database connection closed.


### 1. Recent Orders with at Least 3 Days of Delay (Excluding Canceled Orders) for Orders Less Than 90 Days Old

In [5]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

# Check if the file exists before attempting to connect
if os.path.exists(db_path):
    print("Connecting to the database...")
    
    # Create a connection to the database
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    # SQL query to count recent orders
    count_query = """
    SELECT COUNT(*) AS recent_orders_count
    FROM orders
    WHERE 
        order_status <> 'canceled' AND 
        order_purchase_timestamp >= datetime('now', '-90 days');
    """
    
    # SQL query to retrieve recent orders with delay
    data_query = """
    WITH RelevantOrders AS (
        SELECT 
            order_id,
            customer_id,
            order_status,
            order_purchase_timestamp,
            order_delivered_customer_date,
            julianday(order_delivered_customer_date) - julianday(order_estimated_delivery_date) AS delay_days
        FROM orders
        WHERE 
            order_status <> 'canceled' AND 
            order_purchase_timestamp >= datetime('now', '-90 days')
    )
    SELECT *
    FROM RelevantOrders
    WHERE delay_days > 3;
    """

    try:
        # Execute the count query and print the result
        count_data = pd.read_sql(count_query, conn)
        print("Number of recent orders in the last 90 days:", count_data.iloc[0, 0])
        
        # Execute the data query and load data into a DataFrame
        data = pd.read_sql(data_query, conn)
        print("Query executed successfully.")
        print(data.head())  # Display the first few rows of the result
    except Exception as e:
        print("An error occurred:", e)
    finally:
        # Close the connection
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)


Connecting to the database...
Database connected.
Number of recent orders in the last 90 days: 0
Query executed successfully.
Empty DataFrame
Columns: [order_id, customer_id, order_status, order_purchase_timestamp, order_delivered_customer_date, delay_days]
Index: []
Database connection closed.


### 2. Sellers generating revenue over 100,000 Real via Olist

In [1]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

# Function to execute SQL queries and return DataFrames
def execute_query(query):
    with sqlite3.connect(db_path) as conn:
        return pd.read_sql(query, conn)

# SQL query for sellers generating revenue over 100,000 Real
query_revenue_sellers = """
WITH Revenue AS (
    SELECT
        i.seller_id,
        SUM(i.price) AS total_revenue
    FROM order_items AS i
    JOIN orders AS o ON i.order_id = o.order_id
    WHERE o.order_status = 'delivered'
    GROUP BY i.seller_id
)
SELECT s.seller_id, s.seller_zip_code_prefix, r.total_revenue
FROM Revenue AS r
JOIN sellers AS s ON r.seller_id = s.seller_id
WHERE r.total_revenue > 100000;
"""

# Execute the query and load data into a DataFrame
data_revenue_sellers = execute_query(query_revenue_sellers)
print("DataFrame: revenue_sellers")
print(data_revenue_sellers.head())
print()


DataFrame: revenue_sellers
                          seller_id  seller_zip_code_prefix  total_revenue
0  7e93a43ef30c4f03f38b393420bc753a                    6429      165981.49
1  7d13fca15225358621be4086e1eb0964                   14050      112436.18
2  955fee9216a65b617aa5c0531780ce60                    4782      131836.71
3  1f50f920176fa81dab994f9023523100                   15025      106655.71
4  fa1c13f2614d7b5c4749cbc52fecda94                   13170      190917.14



### 3 New Sellers with More Than 30 Products Sold in the Last 30 Days

In [7]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

# Function to execute SQL queries and return DataFrames
def execute_query(query):
    with sqlite3.connect(db_path) as conn:
        return pd.read_sql(query, conn)

# Step 1: Check Min and Max Dates
query_check_dates = """
SELECT 
    MIN(order_purchase_timestamp) AS min_date,
    MAX(order_purchase_timestamp) AS max_date
FROM orders;
"""

data_check_dates = execute_query(query_check_dates)
print("DataFrame: check_dates")
print(data_check_dates)
if data_check_dates.empty:
    print("No data available. This may be due to the date range issue because today's date is July 17, 2024, and the data in the database are older than 30 days.")
print()

# Step 2: Check Distribution of Dates
query_distribution_dates = """
SELECT 
    order_purchase_timestamp,
    COUNT(*) AS order_count
FROM orders
GROUP BY order_purchase_timestamp
ORDER BY order_purchase_timestamp;
"""

data_distribution_dates = execute_query(query_distribution_dates)
print("DataFrame: distribution_dates")
print(data_distribution_dates.head(10))  # Display the first 10 rows
if data_distribution_dates.empty:
    print("No data available. This may be due to the date range issue because today's date is July 17, 2024, and the data in the database are older than 30 days.")
print()

# Step 3: Check Earliest Order Dates for Sellers
query_earliest_order_dates = """
SELECT 
    i.seller_id,
    MIN(o.order_purchase_timestamp) AS first_order_date
FROM order_items AS i
JOIN orders AS o ON i.order_id = o.order_id
GROUP BY i.seller_id
ORDER BY first_order_date DESC
LIMIT 10;
"""

data_earliest_order_dates = execute_query(query_earliest_order_dates)
print("DataFrame: earliest_order_dates")
print(data_earliest_order_dates)
if data_earliest_order_dates.empty:
    print("No data available. This may be due to the date range issue because today's date is July 17, 2024, and the data in the database are older than 30 days.")
print()

# Final Query: New Sellers with More Than 30 Products Sold in the Last 90 Days
query_new_sellers_final = """
WITH SellerFirstOrder AS (
    SELECT 
        i.seller_id,
        MIN(o.order_purchase_timestamp) AS first_order_date
    FROM order_items AS i
    JOIN orders AS o ON i.order_id = o.order_id
    GROUP BY i.seller_id
),
NewSellers AS (
    SELECT 
        sfo.seller_id,
        COUNT(i.order_item_id) AS total_products_sold
    FROM SellerFirstOrder AS sfo
    JOIN order_items AS i ON sfo.seller_id = i.seller_id
    JOIN orders AS o ON i.order_id = o.order_id
    WHERE 
        julianday('now') - julianday(sfo.first_order_date) < 90
    GROUP BY sfo.seller_id
    HAVING total_products_sold > 30
)
SELECT s.seller_id, s.seller_zip_code_prefix
FROM NewSellers AS ns
JOIN sellers AS s ON ns.seller_id = s.seller_id;
"""

data_new_sellers_final = execute_query(query_new_sellers_final)
print("DataFrame: new_sellers_final")
print(data_new_sellers_final)
if data_new_sellers_final.empty:
    print("No data available. This may be due to the date range issue because today's date is July 17, 2024, and the data in the database are older than 30 days.")
print()


DataFrame: check_dates
              min_date             max_date
0  2016-09-04 21:15:19  2018-10-17 17:30:18

DataFrame: distribution_dates
  order_purchase_timestamp  order_count
0      2016-09-04 21:15:19            1
1      2016-09-05 00:15:34            1
2      2016-09-13 15:24:19            1
3      2016-09-15 12:16:38            1
4      2016-10-02 22:07:52            1
5      2016-10-03 09:44:50            1
6      2016-10-03 16:56:50            1
7      2016-10-03 21:01:41            1
8      2016-10-03 21:13:36            1
9      2016-10-03 22:06:03            1

DataFrame: earliest_order_dates
                          seller_id     first_order_date
0  6561d6bf844e464b4019442692b40e02  2018-08-28 09:26:43
1  3296662b1331dea51e744505065ae889  2018-08-27 12:41:49
2  e8ff5a6ceb895583033fc2a0f314e3c2  2018-08-26 14:17:08
3  b76f4d90e85657a240495c876313adc5  2018-08-25 22:28:18
4  26e2e5033827d2ba53929f43e03d8ffe  2018-08-25 12:50:59
5  edb58a1390adf273840030a3d6253829  2018-0

In [8]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

# Function to execute SQL queries and return DataFrames
def execute_query(query):
    with sqlite3.connect(db_path) as conn:
        return pd.read_sql(query, conn)

# SQL query to find the 5 zip codes with the worst average review scores over the last 12 months
query_zip_codes_reviews = """
WITH RecentReviews AS (
    SELECT 
        r.review_id,
        r.review_score,
        o.order_id,
        c.customer_zip_code_prefix,
        r.review_creation_date
    FROM order_reviews AS r
    JOIN orders AS o ON r.order_id = o.order_id
    JOIN customers AS c ON o.customer_id = c.customer_id
    WHERE 
        julianday('now') - julianday(r.review_creation_date) <= 365
),
ZipScores AS (
    SELECT 
        customer_zip_code_prefix,
        AVG(review_score) AS average_score,
        COUNT(review_id) AS review_count
    FROM RecentReviews
    GROUP BY customer_zip_code_prefix
    HAVING review_count > 30
)
SELECT 
    customer_zip_code_prefix,
    average_score
FROM ZipScores
ORDER BY average_score ASC
LIMIT 5;
"""

# Execute the query and load data into a DataFrame
data_zip_codes_reviews = execute_query(query_zip_codes_reviews)
print("DataFrame: zip_codes_reviews")
print(data_zip_codes_reviews)
if data_zip_codes_reviews.empty:
    print("No data available. This may be due to the date range issue because today's date is July 17, 2024, and the data in the database are older than 30 days.")
print()


DataFrame: zip_codes_reviews
Empty DataFrame
Columns: [customer_zip_code_prefix, average_score]
Index: []
No data available. This may be due to the date range issue because today's date is July 17, 2024, and the data in the database are older than 30 days.

