# 04 Performance Profiling

**Assignment 1: Query profiling and code profiling with quantified improvements**

- Query profiling (SQLite EXPLAIN QUERY PLAN and repeated wall-clock query timings)
- Code profiling (cProfile)
- Outputs in profiling/query_results.txt and profiling/code_profiling.txt

In [1]:
import sys
from pathlib import Path
# Resolve the repository root: use the current directory when it contains "src",
# otherwise assume the notebook was started one level below the root.
project_root = Path.cwd() if (Path.cwd() / "src").exists() else Path.cwd().parent

# Keep the root at the front of sys.path so `src` is importable. Skipping the insert
# when it is already first stops re-runs of this cell from piling up duplicate entries.
project_root_str = str(project_root)
if sys.path[:1] != [project_root_str]:
    sys.path.insert(0, project_root_str)

from sqlalchemy import text
from src.utils import get_engine

# Engine obtained from the project helper; the EXPLAIN QUERY PLAN cell below uses it
# via engine.connect(). sample=True presumably selects the sample database — confirm
# against src.utils.get_engine.
engine = get_engine(sample=True)

## 1. Query profiling (EXPLAIN QUERY PLAN)

In [None]:
# NOTE: this cell is currently disabled (every line is commented out), so the output
# displayed beneath it presumably comes from an earlier run rather than from this code.
# When enabled, it runs EXPLAIN QUERY PLAN for each named query through `engine` and
# writes the collected plans to profiling/query_results.txt. The timing cell later in
# this notebook writes that same path with write_text, so its report replaces the plans.
# queries = [
#     ("Count reviews", "SELECT COUNT(*) FROM reviews"),
#     ("Avg rating by hotel", "SELECT offering_id, AVG(rating_overall) FROM reviews GROUP BY offering_id"),
#     ("Top 10 hotels", "SELECT offering_id, COUNT(*) AS n FROM reviews GROUP BY offering_id ORDER BY n DESC LIMIT 10"),
#     ("Reviews with rating >= 4", "SELECT * FROM reviews WHERE rating_overall >= 4 LIMIT 1000"),
# ]
# out = []
# with engine.connect() as conn:
#     for name, sql in queries:
#         out.append(f"--- {name} ---")
#         out.append(f"SQL: {sql}")
#         for row in conn.execute(text(f"EXPLAIN QUERY PLAN {sql}")).fetchall():
#             out.append(str(row))
#         out.append("")
# # Write profiling/query_results.txt (assignment: only 2 .txt files in profiling/)
# (project_root / "profiling").mkdir(parents=True, exist_ok=True)
# (project_root / "profiling" / "query_results.txt").write_text("\n".join(out), encoding="utf-8")
# print("\n".join(out))
# print("Wrote profiling/query_results.txt")

--- Count reviews ---
SQL: SELECT COUNT(*) FROM reviews
(4, 0, 0, 'SCAN reviews USING COVERING INDEX idx_reviews_rating_overall')

--- Avg rating by hotel ---
SQL: SELECT offering_id, AVG(rating_overall) FROM reviews GROUP BY offering_id
(7, 0, 222, 'SCAN reviews USING INDEX idx_reviews_offering')

--- Top 10 hotels ---
SQL: SELECT offering_id, COUNT(*) AS n FROM reviews GROUP BY offering_id ORDER BY n DESC LIMIT 10
(8, 0, 207, 'SCAN reviews USING COVERING INDEX idx_reviews_offering')
(42, 0, 0, 'USE TEMP B-TREE FOR ORDER BY')

--- Reviews with rating >= 4 ---
SQL: SELECT * FROM reviews WHERE rating_overall >= 4 LIMIT 1000
(4, 0, 202, 'SEARCH reviews USING INDEX idx_reviews_rating_overall (rating_overall>?)')

Wrote profiling/query_results.txt


## 2. Code profiling (cProfile)

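The cell below profiles `get_reviews_df` and `comparable_groups_by_volume_and_rating` with cProfile and writes the report to profiling/code_profiling.txt. (The assignment requires the profiling folder to contain only two .txt files: query_results.txt and code_profiling.txt.)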

In [3]:
import cProfile
import pstats
import io
from src.benchmarking import get_reviews_df, comparable_groups_by_volume_and_rating

# Create the report directory up front: the only other mkdir for it in this notebook
# sits in a commented-out cell, so write_text below would otherwise raise
# FileNotFoundError on a fresh checkout.
profiling_dir = project_root / "profiling"
profiling_dir.mkdir(parents=True, exist_ok=True)

# Profile the data load followed by the peer-group computation. The try/finally
# guarantees the profiler is switched off even if one of the profiled calls raises,
# so a failure here cannot leave profiling active for the rest of the kernel session.
prof = cProfile.Profile()
prof.enable()
try:
    df = get_reviews_df(sample=True)
    peers = comparable_groups_by_volume_and_rating(df)
finally:
    prof.disable()

# Render the 30 most expensive entries by cumulative time into a string so the same
# text can be both saved to disk and shown in the notebook.
s = io.StringIO()
ps = pstats.Stats(prof, stream=s).strip_dirs().sort_stats("cumulative")
ps.print_stats(30)
code_profiling_txt = s.getvalue()

(profiling_dir / "code_profiling.txt").write_text(code_profiling_txt, encoding="utf-8")
print(code_profiling_txt)
print("Wrote profiling/code_profiling.txt")

         18165 function calls (17892 primitive calls) in 0.114 seconds

   Ordered by: cumulative time
   List reduced from 1027 to 30 due to restriction <30>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        3    0.000    0.000    0.114    0.038 interactiveshell.py:3665(run_code)
        3    0.000    0.000    0.114    0.038 {built-in method builtins.exec}
        1    0.000    0.000    0.077    0.077 benchmarking.py:17(get_reviews_df)
        1    0.001    0.001    0.075    0.075 sql.py:572(read_sql)
        1    0.000    0.000    0.071    0.071 sql.py:1791(read_query)
        3    0.000    0.000    0.049    0.016 result.py:1331(fetchall)
        3    0.003    0.001    0.049    0.016 result.py:555(_allrows)
        3    0.000    0.000    0.046    0.015 cursor.py:2251(_fetchall_impl)
        3    0.002    0.001    0.046    0.015 cursor.py:1191(fetchall)
        3    0.044    0.015    0.044    0.015 {method 'fetchall' of 'sqlite3.Cursor' objects}
        

In [5]:
import time
import sqlite3
import io
from contextlib import redirect_stdout
import pandas as pd
from src.utils import get_db_path

# Time a fixed set of queries against the database returned by get_db_path(sample=True)
# and save the report to profiling/query_results.txt.
# NOTE: the report is written with write_text, so it replaces any existing
# query_results.txt (including EXPLAIN QUERY PLAN output written by the earlier cell
# when that cell is enabled).
db_path = get_db_path(sample=True)

# Number of timed executions per query; avg/min/max are computed over these runs.
TIMING_REPEATS = 5

test_queries = [
    ("Count all reviews", "SELECT COUNT(*) FROM reviews"),
    ("Avg rating by hotel", "SELECT offering_id, AVG(rating_overall) FROM reviews GROUP BY offering_id"),
    ("Filter by rating >= 4", "SELECT * FROM reviews WHERE rating_overall >= 4 LIMIT 1000"),
    ("Complex aggregation", """
        SELECT offering_id, 
               COUNT(*) as n,
               AVG(rating_overall) as avg_rating,
               AVG(rating_cleanliness) as avg_clean
        FROM reviews
        WHERE rating_overall >= 3.5
        GROUP BY offering_id
        HAVING COUNT(*) >= 10
        ORDER BY avg_rating DESC
        LIMIT 100
    """),
]

# Capture printed output so the same text can be written to disk and displayed.
buf = io.StringIO()
results = []

conn = sqlite3.connect(str(db_path))
try:
    with redirect_stdout(buf):
        print("=" * 80)
        print("QUERY PERFORMANCE ANALYSIS")
        print("=" * 80)

        for name, query in test_queries:
            times = []
            for _ in range(TIMING_REPEATS):
                # perf_counter is monotonic and high-resolution, which matters for the
                # sub-millisecond queries here; time.time() can be coarse and can jump.
                start = time.perf_counter()
                cursor = conn.execute(query)
                cursor.fetchall()  # fetch every row so result retrieval is part of the timing
                times.append(time.perf_counter() - start)

            avg_time = sum(times) / len(times)
            results.append({
                'query': name,
                'avg_time': avg_time,
                'min_time': min(times),
                'max_time': max(times),
            })

            print(f"\n{name}:")
            print(f"  Average: {avg_time*1000:.2f}ms")
            print(f"  Min: {min(times)*1000:.2f}ms")
            print(f"  Max: {max(times)*1000:.2f}ms")
finally:
    # Release the database handle even when a query fails part-way through.
    conn.close()

# Convert captured output to text
query_profiling_txt = buf.getvalue()

# Write to file, creating profiling/ first so a fresh checkout does not fail here.
output_path = project_root / "profiling" / "query_results.txt"
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(query_profiling_txt, encoding="utf-8")

# Show in notebook. The path is printed relative to the project root so the saved
# output does not embed a machine-specific absolute path.
print(query_profiling_txt)
print(f"Wrote {output_path.relative_to(project_root).as_posix()}")


QUERY PERFORMANCE ANALYSIS

Count all reviews:
  Average: 0.27ms
  Min: 0.08ms
  Max: 0.96ms

Avg rating by hotel:
  Average: 28.68ms
  Min: 17.40ms
  Max: 55.24ms

Filter by rating >= 4:
  Average: 5.52ms
  Min: 5.29ms
  Max: 5.67ms

Complex aggregation:
  Average: 28.16ms
  Min: 17.43ms
  Max: 35.46ms

Wrote c:\Users\rayya\Desktop\IS5126-G4-hotel-analytics-master\profiling\query_results.txt
