In [4]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    query = "SELECT COUNT(*) FROM orders;"
    try:
        data = pd.read_sql(query, conn)
        print("Total number of rows in orders table:", data.iloc[0, 0])
    except Exception as e:
        print("An error occurred:", e)
    finally:
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)



Connecting to the database...
Database connected.
Total number of rows in orders table: 99441
Database connection closed.


### Inspecting the Database Schema

In [7]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    
    # Create a connection to the database
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    try:
        # List all tables
        tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
        tables = pd.read_sql(tables_query, conn)
        print("Tables in the database:")
        print(tables)

        # Inspect schema of the relevant tables
        for table in tables['name']:
            schema_query = f"PRAGMA table_info({table});"
            schema = pd.read_sql(schema_query, conn)
            print(f"Schema of {table}:")
            print(schema)
    except Exception as e:
        print("An error occurred:", e)
    finally:
        # Close the connection
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)



Connecting to the database...
Database connected.
Tables in the database:
            name
0      customers
1         geoloc
2    order_items
3    order_pymts
4  order_reviews
5         orders
6       products
7        sellers
8    translation
Schema of customers:
   cid                      name    type  notnull dflt_value  pk
0    0                     index  BIGINT        0       None   0
1    1               customer_id    TEXT        0       None   0
2    2        customer_unique_id    TEXT        0       None   0
3    3  customer_zip_code_prefix  BIGINT        0       None   0
4    4             customer_city    TEXT        0       None   0
5    5            customer_state    TEXT        0       None   0
Schema of geoloc:
   cid                         name    type  notnull dflt_value  pk
0    0                        index  BIGINT        0       None   0
1    1  geolocation_zip_code_prefix  BIGINT        0       None   0
2    2              geolocation_lat   FLOAT        0      

### Query to Display a Few Rows and the Date Format

In [1]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    query = """
    SELECT 
        order_id,
        customer_id,
        order_status,
        order_purchase_timestamp
    FROM orders
    LIMIT 5;
    """
    try:
        data = pd.read_sql(query, conn)
        print("Sample rows from orders table:")
        print(data)  # Display sample rows to check date format
    except Exception as e:
        print("An error occurred:", e)
    finally:
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)


Connecting to the database...
Database connected.
Sample rows from orders table:
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp  
0    delivered      2017-10-02 10:56:33  
1    delivered      2018-07-24 20:41:37  
2    delivered      2018-08-08 08:38:49  
3    delivered      2017-11-18 19:28:06  
4    delivered      2018-02-13 21:18:39  
Database connection closed.


### Query to Display the Current Date in order_purchase_timestamp Format

In [3]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

if os.path.exists(db_path):
    print("Connecting to the database...")
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    query = """
    SELECT strftime('%Y-%m-%d %H:%M:%S', 'now') AS current_date;
    """
    try:
        data = pd.read_sql(query, conn)
        print("Current date (formatted):")
        print(data)  # Display the current date to ensure it's being calculated correctly
    except Exception as e:
        print("An error occurred:", e)
    finally:
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)


Connecting to the database...
Database connected.
Current date (formatted):
          current_date
0  2024-07-17 07:52:15
Database connection closed.


### 1. Recent Orders with at Least 3 Days of Delay (Excluding Canceled Orders) for Orders Less Than 90 Days Old

In [5]:
import pandas as pd
import sqlite3
import os

# Relative path to the database
db_path = os.path.join('..', 'data', 'olist.db')

# Check if the file exists before attempting to connect
if os.path.exists(db_path):
    print("Connecting to the database...")
    
    # Create a connection to the database
    conn = sqlite3.connect(db_path)
    print("Database connected.")
    
    # SQL query to count recent orders
    count_query = """
    SELECT COUNT(*) AS recent_orders_count
    FROM orders
    WHERE 
        order_status <> 'canceled' AND 
        order_purchase_timestamp >= datetime('now', '-90 days');
    """
    
    # SQL query to retrieve recent orders with delay
    data_query = """
    WITH RelevantOrders AS (
        SELECT 
            order_id,
            customer_id,
            order_status,
            order_purchase_timestamp,
            order_delivered_customer_date,
            julianday(order_delivered_customer_date) - julianday(order_estimated_delivery_date) AS delay_days
        FROM orders
        WHERE 
            order_status <> 'canceled' AND 
            order_purchase_timestamp >= datetime('now', '-90 days')
    )
    SELECT *
    FROM RelevantOrders
    WHERE delay_days > 3;
    """

    try:
        # Execute the count query and print the result
        count_data = pd.read_sql(count_query, conn)
        print("Number of recent orders in the last 90 days:", count_data.iloc[0, 0])
        
        # Execute the data query and load data into a DataFrame
        data = pd.read_sql(data_query, conn)
        print("Query executed successfully.")
        print(data.head())  # Display the first few rows of the result
    except Exception as e:
        print("An error occurred:", e)
    finally:
        # Close the connection
        conn.close()
        print("Database connection closed.")
else:
    print("The file does not exist at the specified location:", db_path)


Connecting to the database...
Database connected.
Number of recent orders in the last 90 days: 0
Query executed successfully.
Empty DataFrame
Columns: [order_id, customer_id, order_status, order_purchase_timestamp, order_delivered_customer_date, delay_days]
Index: []
Database connection closed.


### 2. Sellers generating revenue over 100,000 Real via Olist