# Data Analysis with SQL

This notebook demonstrates how to use SQL for real-world data analysis tasks. We'll explore business scenarios and analytical techniques commonly used in data science and business intelligence.

## Topics Covered:
1. Exploratory Data Analysis (EDA) with SQL
2. Time Series Analysis
3. Cohort Analysis
4. Business Metrics Calculation
5. Data Quality Assessment
6. Statistical Analysis
7. Report Generation
8. Data Export and Visualization Setup

## Setup and Extended Sample Data

Let's connect to our database and add some time-series data for analysis.

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from IPython.display import display
import os
from datetime import datetime, timedelta
import random

# Connect to database
if os.path.exists('my_database.db'):
    conn = sqlite3.connect('my_database.db')
elif os.path.exists('../my_database.db'):
    conn = sqlite3.connect('../my_database.db')
else:
    conn = sqlite3.connect('my_database.db')

cursor = conn.cursor()
print("Connected to database for data analysis!")

# Create additional tables for analysis
cursor.execute("""
CREATE TABLE IF NOT EXISTS sales_transactions (
    transaction_id INTEGER PRIMARY KEY,
    emp_id INTEGER,
    sale_date DATE,
    amount DECIMAL(10, 2),
    customer_id INTEGER,
    product_category VARCHAR(50),
    FOREIGN KEY (emp_id) REFERENCES employees (emp_id)
)
""")

cursor.execute("""
CREATE TABLE IF NOT EXISTS performance_reviews (
    review_id INTEGER PRIMARY KEY,
    emp_id INTEGER,
    review_date DATE,
    performance_score INTEGER,
    manager_rating INTEGER,
    goals_met INTEGER,
    FOREIGN KEY (emp_id) REFERENCES employees (emp_id)
)
""")

conn.commit()
print("Extended database schema created for analysis!")

Connected to database for data analysis!
Extended database schema created for analysis!


In [2]:
# Generate sample data for analysis
import random
from datetime import datetime, timedelta

# Generate sales transactions
sales_data = []
base_date = datetime(2023, 1, 1)

for i in range(1, 501):  # 500 transactions
    emp_id = random.randint(1, 8)
    days_offset = random.randint(0, 365)
    sale_date = base_date + timedelta(days=days_offset)
    amount = round(random.uniform(100, 5000), 2)
    customer_id = random.randint(1000, 9999)
    category = random.choice(['Software', 'Hardware', 'Consulting', 'Training', 'Support'])
    
    sales_data.append((i, emp_id, sale_date.strftime('%Y-%m-%d'), amount, customer_id, category))

cursor.executemany('INSERT OR REPLACE INTO sales_transactions VALUES (?, ?, ?, ?, ?, ?)', sales_data)

# Generate performance reviews
review_data = []
for emp_id in range(1, 9):
    for quarter in range(1, 5):
        review_date = datetime(2023, quarter * 3, 15)
        performance_score = random.randint(70, 100)
        manager_rating = random.randint(3, 5)
        goals_met = random.randint(60, 100)
        
        review_data.append((
            (emp_id-1)*4 + quarter,
            emp_id,
            review_date.strftime('%Y-%m-%d'),
            performance_score,
            manager_rating,
            goals_met
        ))

cursor.executemany('INSERT OR REPLACE INTO performance_reviews VALUES (?, ?, ?, ?, ?, ?)', review_data)

conn.commit()
print(f"Generated {len(sales_data)} sales transactions and {len(review_data)} performance reviews")

Generated 500 sales transactions and 32 performance reviews


## 1. Exploratory Data Analysis (EDA)

Let's start by understanding our data through summary statistics and distributions.

In [3]:
# Data overview and summary statistics
print("=== DATABASE OVERVIEW ===")

# Table sizes
tables_info = pd.read_sql_query("""
SELECT 
    'employees' as table_name, COUNT(*) as record_count FROM employees
UNION ALL
SELECT 'departments', COUNT(*) FROM departments
UNION ALL
SELECT 'projects', COUNT(*) FROM projects
UNION ALL
SELECT 'sales_transactions', COUNT(*) FROM sales_transactions
UNION ALL
SELECT 'performance_reviews', COUNT(*) FROM performance_reviews
""", conn)
display(tables_info)

print("\n=== EMPLOYEE SALARY DISTRIBUTION ===")
salary_stats = pd.read_sql_query("""
SELECT 
    COUNT(*) as total_employees,
    MIN(salary) as min_salary,
    MAX(salary) as max_salary,
    AVG(salary) as avg_salary,
    ROUND(AVG(salary), 0) as avg_salary_rounded,
    -- Approximate median using percentiles
    (SELECT salary FROM employees ORDER BY salary LIMIT 1 OFFSET (SELECT COUNT(*)/2 FROM employees)) as median_salary
FROM employees
""", conn)
display(salary_stats)

print("\n=== SALES OVERVIEW ===")
sales_overview = pd.read_sql_query("""
SELECT 
    COUNT(*) as total_transactions,
    COUNT(DISTINCT emp_id) as active_sales_employees,
    MIN(amount) as min_sale,
    MAX(amount) as max_sale,
    ROUND(AVG(amount), 2) as avg_sale,
    ROUND(SUM(amount), 2) as total_revenue,
    COUNT(DISTINCT customer_id) as unique_customers
FROM sales_transactions
""", conn)
display(sales_overview)

=== DATABASE OVERVIEW ===


Unnamed: 0,table_name,record_count
0,employees,8
1,departments,5
2,projects,5
3,sales_transactions,500
4,performance_reviews,32



=== EMPLOYEE SALARY DISTRIBUTION ===


Unnamed: 0,total_employees,min_salary,max_salary,avg_salary,avg_salary_rounded,median_salary
0,8,65000,95000,79625.0,79625.0,82000



=== SALES OVERVIEW ===


Unnamed: 0,total_transactions,active_sales_employees,min_sale,max_sale,avg_sale,total_revenue,unique_customers
0,500,8,125.65,4981.63,2600.66,1300332.31,487


In [4]:
# Distribution analysis by categories
print("=== SALES BY PRODUCT CATEGORY ===")
category_analysis = pd.read_sql_query("""
SELECT 
    product_category,
    COUNT(*) as transaction_count,
    ROUND(SUM(amount), 2) as total_revenue,
    ROUND(AVG(amount), 2) as avg_transaction_size,
    ROUND(SUM(amount) * 100.0 / (SELECT SUM(amount) FROM sales_transactions), 2) as revenue_percentage
FROM sales_transactions
GROUP BY product_category
ORDER BY total_revenue DESC
""", conn)
display(category_analysis)

print("\n=== DEPARTMENT PERFORMANCE COMPARISON ===")
dept_performance = pd.read_sql_query("""
SELECT 
    d.dept_name,
    COUNT(DISTINCT e.emp_id) as employee_count,
    COALESCE(COUNT(s.transaction_id), 0) as total_sales,
    COALESCE(ROUND(SUM(s.amount), 2), 0) as total_revenue,
    COALESCE(ROUND(AVG(s.amount), 2), 0) as avg_sale_size,
    ROUND(AVG(e.salary), 0) as avg_salary
FROM departments d
LEFT JOIN employees e ON d.dept_id = e.dept_id
LEFT JOIN sales_transactions s ON e.emp_id = s.emp_id
GROUP BY d.dept_id, d.dept_name
ORDER BY total_revenue DESC
""", conn)
display(dept_performance)

=== SALES BY PRODUCT CATEGORY ===


Unnamed: 0,product_category,transaction_count,total_revenue,avg_transaction_size,revenue_percentage
0,Support,106,275037.59,2594.69,21.15
1,Consulting,105,274673.66,2615.94,21.12
2,Hardware,103,266482.97,2587.21,20.49
3,Training,100,250567.97,2505.68,19.27
4,Software,86,233570.12,2715.93,17.96



=== DEPARTMENT PERFORMANCE COMPARISON ===


Unnamed: 0,dept_name,employee_count,total_sales,total_revenue,avg_sale_size,avg_salary
0,Engineering,3,180,463836.72,2576.87,82239.0
1,Marketing,2,148,387174.61,2616.04,68264.0
2,Finance,1,60,161128.67,2685.48,70000.0
3,Sales,1,61,158949.73,2605.73,95000.0
4,Human Resources,1,51,129242.58,2534.17,90000.0


## 2. Time Series Analysis

Analyzing trends over time to understand business patterns and seasonality.

In [5]:
# Monthly sales trends
print("=== MONTHLY SALES TRENDS ===")
monthly_trends = pd.read_sql_query("""
SELECT 
    strftime('%Y-%m', sale_date) as month,
    COUNT(*) as transaction_count,
    ROUND(SUM(amount), 2) as monthly_revenue,
    ROUND(AVG(amount), 2) as avg_transaction_size,
    COUNT(DISTINCT emp_id) as active_employees,
    COUNT(DISTINCT customer_id) as unique_customers
FROM sales_transactions
GROUP BY strftime('%Y-%m', sale_date)
ORDER BY month
""", conn)
display(monthly_trends)

print("\n=== QUARTERLY PERFORMANCE TRENDS ===")
quarterly_trends = pd.read_sql_query("""
SELECT 
    CASE 
        WHEN strftime('%m', sale_date) IN ('01', '02', '03') THEN 'Q1'
        WHEN strftime('%m', sale_date) IN ('04', '05', '06') THEN 'Q2'
        WHEN strftime('%m', sale_date) IN ('07', '08', '09') THEN 'Q3'
        ELSE 'Q4'
    END as quarter,
    COUNT(*) as transactions,
    ROUND(SUM(amount), 2) as revenue,
    ROUND(AVG(amount), 2) as avg_size
FROM sales_transactions
WHERE strftime('%Y', sale_date) = '2023'
GROUP BY quarter
ORDER BY quarter
""", conn)
display(quarterly_trends)

=== MONTHLY SALES TRENDS ===


Unnamed: 0,month,transaction_count,monthly_revenue,avg_transaction_size,active_employees,unique_customers
0,2023-01,36,92416.15,2567.12,8,36
1,2023-02,35,97661.71,2790.33,8,35
2,2023-03,37,93387.27,2523.98,7,37
3,2023-04,40,90713.43,2267.84,8,40
4,2023-05,44,123261.62,2801.4,8,43
5,2023-06,45,114873.61,2552.75,7,45
6,2023-07,41,112348.59,2740.21,8,41
7,2023-08,54,143385.22,2655.28,8,54
8,2023-09,42,119662.57,2849.11,8,42
9,2023-10,44,111349.36,2530.67,8,43



=== QUARTERLY PERFORMANCE TRENDS ===


Unnamed: 0,quarter,transactions,revenue,avg_size
0,Q1,108,283465.13,2624.68
1,Q2,129,328848.66,2549.21
2,Q3,137,375396.38,2740.12
3,Q4,124,305921.08,2467.11


In [6]:
# Day of week analysis
print("=== DAY OF WEEK ANALYSIS ===")
dow_analysis = pd.read_sql_query("""
SELECT 
    CASE strftime('%w', sale_date)
        WHEN '0' THEN 'Sunday'
        WHEN '1' THEN 'Monday'
        WHEN '2' THEN 'Tuesday'
        WHEN '3' THEN 'Wednesday'
        WHEN '4' THEN 'Thursday'
        WHEN '5' THEN 'Friday'
        WHEN '6' THEN 'Saturday'
    END as day_of_week,
    COUNT(*) as transaction_count,
    ROUND(SUM(amount), 2) as total_revenue,
    ROUND(AVG(amount), 2) as avg_transaction
FROM sales_transactions
GROUP BY strftime('%w', sale_date)
ORDER BY strftime('%w', sale_date)
""", conn)
display(dow_analysis)

print("\n=== EMPLOYEE SALES PERFORMANCE OVER TIME ===")
employee_trends = pd.read_sql_query("""
SELECT 
    e.first_name || ' ' || e.last_name as employee,
    strftime('%Y-%m', s.sale_date) as month,
    COUNT(s.transaction_id) as sales_count,
    ROUND(SUM(s.amount), 2) as monthly_sales
FROM employees e
LEFT JOIN sales_transactions s ON e.emp_id = s.emp_id
WHERE s.sale_date IS NOT NULL
GROUP BY e.emp_id, strftime('%Y-%m', s.sale_date)
ORDER BY employee, month
LIMIT 20  -- Show first 20 records
""", conn)
display(employee_trends)

=== DAY OF WEEK ANALYSIS ===


Unnamed: 0,day_of_week,transaction_count,total_revenue,avg_transaction
0,Sunday,67,152328.69,2273.56
1,Monday,90,244722.27,2719.14
2,Tuesday,69,189944.87,2752.82
3,Wednesday,57,156300.21,2742.11
4,Thursday,78,172478.73,2211.27
5,Friday,78,217893.15,2793.5
6,Saturday,61,166664.39,2732.2



=== EMPLOYEE SALES PERFORMANCE OVER TIME ===


Unnamed: 0,employee,month,sales_count,monthly_sales
0,David Brown,2023-01,2,8653.62
1,David Brown,2023-02,8,20599.13
2,David Brown,2023-03,1,125.65
3,David Brown,2023-04,6,12404.25
4,David Brown,2023-05,8,19402.73
5,David Brown,2023-06,6,20739.12
6,David Brown,2023-07,2,1912.41
7,David Brown,2023-08,5,12635.13
8,David Brown,2023-09,9,23202.75
9,David Brown,2023-10,3,10025.49


## 3. Business Metrics and KPIs

Calculate key performance indicators commonly used in business analysis.

In [7]:
# Key Performance Indicators
print("=== KEY PERFORMANCE INDICATORS ===")

# Employee productivity metrics
productivity_metrics = pd.read_sql_query("""
WITH employee_sales AS (
    SELECT 
        e.emp_id,
        e.first_name || ' ' || e.last_name as employee,
        e.salary,
        d.dept_name,
        COUNT(s.transaction_id) as total_sales,
        COALESCE(SUM(s.amount), 0) as total_revenue,
        COALESCE(ROUND(AVG(s.amount), 2), 0) as avg_sale_size
    FROM employees e
    LEFT JOIN sales_transactions s ON e.emp_id = s.emp_id
    LEFT JOIN departments d ON e.dept_id = d.dept_id
    GROUP BY e.emp_id, e.first_name, e.last_name, e.salary, d.dept_name
)
SELECT 
    employee,
    dept_name,
    salary,
    total_sales,
    total_revenue,
    avg_sale_size,
    CASE 
        WHEN total_sales > 0 THEN ROUND(total_revenue / total_sales, 2)
        ELSE 0 
    END as revenue_per_sale,
    CASE 
        WHEN salary > 0 THEN ROUND(total_revenue / salary * 100, 2)
        ELSE 0 
    END as roi_percentage
FROM employee_sales
ORDER BY total_revenue DESC
""", conn)
display(productivity_metrics)

print("\n=== DEPARTMENT ROI ANALYSIS ===")
dept_roi = pd.read_sql_query("""
SELECT 
    d.dept_name,
    COUNT(DISTINCT e.emp_id) as employee_count,
    ROUND(SUM(e.salary), 2) as total_salary_cost,
    COALESCE(ROUND(SUM(s.amount), 2), 0) as total_revenue,
    COALESCE(ROUND(SUM(s.amount) - SUM(e.salary), 2), 0) as net_profit,
    CASE 
        WHEN SUM(e.salary) > 0 THEN ROUND((COALESCE(SUM(s.amount), 0) / SUM(e.salary) - 1) * 100, 2)
        ELSE 0 
    END as roi_percentage
FROM departments d
LEFT JOIN employees e ON d.dept_id = e.dept_id
LEFT JOIN sales_transactions s ON e.emp_id = s.emp_id
GROUP BY d.dept_id, d.dept_name
ORDER BY roi_percentage DESC
""", conn)
display(dept_roi)

=== KEY PERFORMANCE INDICATORS ===


Unnamed: 0,employee,dept_name,salary,total_sales,total_revenue,avg_sale_size,revenue_per_sale,roi_percentage
0,Mike Johnson,Marketing,65000,79,217919.55,2758.48,2758.48,335.26
1,Jane Smith,Engineering,82000,71,185686.09,2615.3,2615.3,226.45
2,Lisa Anderson,Marketing,72000,69,169255.06,2452.97,2452.97,235.08
3,Robert Wilson,Engineering,88000,62,163624.98,2639.11,2639.11,185.94
4,Emily Davis,Finance,70000,60,161128.67,2685.48,2685.48,230.18
5,David Brown,Sales,95000,61,158949.73,2605.73,2605.73,167.32
6,Sarah Williams,Human Resources,90000,51,129242.58,2534.17,2534.17,143.6
7,John Doe,Engineering,75000,47,114525.65,2436.72,2436.72,152.7



=== DEPARTMENT ROI ANALYSIS ===


Unnamed: 0,dept_name,employee_count,total_salary_cost,total_revenue,net_profit,roi_percentage
0,Finance,1,4200000.0,161128.67,-4038871.33,-96.16
1,Marketing,2,10103000.0,387174.61,-9715825.39,-96.17
2,Engineering,3,14803000.0,463836.72,-14339163.28,-96.87
3,Human Resources,1,4590000.0,129242.58,-4460757.42,-97.18
4,Sales,1,5795000.0,158949.73,-5636050.27,-97.26


In [8]:
# Customer and sales analytics
print("=== CUSTOMER ANALYTICS ===")

customer_metrics = pd.read_sql_query("""
SELECT 
    COUNT(DISTINCT customer_id) as total_customers,
    COUNT(*) as total_transactions,
    ROUND(AVG(transactions_per_customer), 2) as avg_transactions_per_customer,
    ROUND(SUM(amount), 2) as total_revenue,
    ROUND(AVG(amount), 2) as avg_transaction_value,
    ROUND(SUM(amount) / COUNT(DISTINCT customer_id), 2) as avg_customer_value
FROM (
    SELECT 
        customer_id,
        COUNT(*) as transactions_per_customer,
        SUM(amount) as amount
    FROM sales_transactions
    GROUP BY customer_id
) customer_summary
""", conn)
display(customer_metrics)

print("\n=== TOP CUSTOMERS BY VALUE ===")
top_customers = pd.read_sql_query("""
SELECT 
    customer_id,
    COUNT(*) as transaction_count,
    ROUND(SUM(amount), 2) as total_spent,
    ROUND(AVG(amount), 2) as avg_transaction,
    MIN(sale_date) as first_purchase,
    MAX(sale_date) as last_purchase
FROM sales_transactions
GROUP BY customer_id
ORDER BY total_spent DESC
LIMIT 10
""", conn)
display(top_customers)

=== CUSTOMER ANALYTICS ===


Unnamed: 0,total_customers,total_transactions,avg_transactions_per_customer,total_revenue,avg_transaction_value,avg_customer_value
0,487,487,1.03,1300332.31,2670.09,2670.09



=== TOP CUSTOMERS BY VALUE ===


Unnamed: 0,customer_id,transaction_count,total_spent,avg_transaction,first_purchase,last_purchase
0,1844,2,7355.59,3677.8,2023-07-18,2023-11-26
1,8275,2,7225.59,3612.8,2023-08-08,2023-09-15
2,6182,2,6672.69,3336.35,2023-08-02,2023-12-23
3,1978,2,6148.5,3074.25,2023-05-04,2023-05-23
4,6125,2,5100.94,2550.47,2023-06-11,2023-12-04
5,7446,1,4981.63,4981.63,2023-12-09,2023-12-09
6,3456,1,4964.36,4964.36,2023-01-28,2023-01-28
7,6085,2,4943.51,2471.76,2023-10-27,2023-10-29
8,2027,1,4928.2,4928.2,2023-02-16,2023-02-16
9,9405,1,4918.79,4918.79,2023-09-15,2023-09-15


## 4. Performance Analysis and Correlation

Analyzing relationships between different metrics and performance indicators.

In [9]:
# Performance correlation analysis
print("=== SALARY vs PERFORMANCE CORRELATION ===")

performance_correlation = pd.read_sql_query("""
SELECT 
    e.first_name || ' ' || e.last_name as employee,
    e.salary,
    ROUND(AVG(pr.performance_score), 1) as avg_performance_score,
    ROUND(AVG(pr.manager_rating), 1) as avg_manager_rating,
    ROUND(AVG(pr.goals_met), 1) as avg_goals_met,
    COUNT(DISTINCT s.transaction_id) as total_sales,
    COALESCE(ROUND(SUM(s.amount), 2), 0) as total_revenue
FROM employees e
LEFT JOIN performance_reviews pr ON e.emp_id = pr.emp_id
LEFT JOIN sales_transactions s ON e.emp_id = s.emp_id
GROUP BY e.emp_id, e.first_name, e.last_name, e.salary
ORDER BY e.salary DESC
""", conn)
display(performance_correlation)

print("\n=== PERFORMANCE TRENDS BY QUARTER ===")
quarterly_performance = pd.read_sql_query("""
SELECT 
    'Q' || CASE 
        WHEN strftime('%m', review_date) IN ('01', '02', '03') THEN '1'
        WHEN strftime('%m', review_date) IN ('04', '05', '06') THEN '2'
        WHEN strftime('%m', review_date) IN ('07', '08', '09') THEN '3'
        ELSE '4'
    END as quarter,
    COUNT(*) as review_count,
    ROUND(AVG(performance_score), 1) as avg_performance,
    ROUND(AVG(manager_rating), 1) as avg_manager_rating,
    ROUND(AVG(goals_met), 1) as avg_goals_met
FROM performance_reviews
GROUP BY quarter
ORDER BY quarter
""", conn)
display(quarterly_performance)

=== SALARY vs PERFORMANCE CORRELATION ===


Unnamed: 0,employee,salary,avg_performance_score,avg_manager_rating,avg_goals_met,total_sales,total_revenue
0,David Brown,95000,79.0,4.3,77.5,61,635798.92
1,Sarah Williams,90000,80.8,4.5,77.3,51,516970.32
2,Robert Wilson,88000,80.5,3.8,74.3,62,654499.92
3,Jane Smith,82000,84.5,3.8,71.3,71,742744.36
4,John Doe,75000,81.8,4.5,73.3,47,458102.6
5,Lisa Anderson,72000,79.3,4.0,81.3,69,677020.24
6,Emily Davis,70000,78.5,4.3,83.0,60,644514.68
7,Mike Johnson,65000,78.8,4.8,88.3,79,871678.2



=== PERFORMANCE TRENDS BY QUARTER ===


Unnamed: 0,quarter,review_count,avg_performance,avg_manager_rating,avg_goals_met
0,Q1,8,80.9,4.1,82.1
1,Q2,8,78.8,4.4,70.4
2,Q3,8,82.5,4.5,80.3
3,Q4,8,79.4,3.9,80.3


## 5. Advanced Analytics - Cohort Analysis

Analyzing customer behavior and retention patterns over time.

In [10]:
# Customer cohort analysis (simplified)
print("=== CUSTOMER ACQUISITION BY MONTH ===")

customer_cohorts = pd.read_sql_query("""
WITH first_purchases AS (
    SELECT 
        customer_id,
        strftime('%Y-%m', MIN(sale_date)) as cohort_month,
        MIN(sale_date) as first_purchase_date
    FROM sales_transactions
    GROUP BY customer_id
),
monthly_customers AS (
    SELECT 
        cohort_month,
        COUNT(*) as new_customers
    FROM first_purchases
    GROUP BY cohort_month
)
SELECT 
    cohort_month,
    new_customers,
    SUM(new_customers) OVER (ORDER BY cohort_month) as cumulative_customers
FROM monthly_customers
ORDER BY cohort_month
""", conn)
display(customer_cohorts)

print("\n=== REPEAT CUSTOMER ANALYSIS ===")
repeat_customers = pd.read_sql_query("""
WITH customer_behavior AS (
    SELECT 
        customer_id,
        COUNT(*) as purchase_count,
        ROUND(SUM(amount), 2) as total_spent,
        julianday(MAX(sale_date)) - julianday(MIN(sale_date)) as days_active
    FROM sales_transactions
    GROUP BY customer_id
)
SELECT 
    CASE 
        WHEN purchase_count = 1 THEN 'One-time'
        WHEN purchase_count BETWEEN 2 AND 5 THEN 'Occasional (2-5)'
        WHEN purchase_count BETWEEN 6 AND 10 THEN 'Regular (6-10)'
        ELSE 'Frequent (10+)'
    END as customer_type,
    COUNT(*) as customer_count,
    ROUND(AVG(total_spent), 2) as avg_total_spent,
    ROUND(AVG(days_active), 1) as avg_days_active
FROM customer_behavior
GROUP BY customer_type
ORDER BY 
    CASE customer_type
        WHEN 'Frequent (10+)' THEN 1
        WHEN 'Regular (6-10)' THEN 2
        WHEN 'Occasional (2-5)' THEN 3
        ELSE 4
    END
""", conn)
display(repeat_customers)

=== CUSTOMER ACQUISITION BY MONTH ===


Unnamed: 0,cohort_month,new_customers,cumulative_customers
0,2023-01,36,36
1,2023-02,35,71
2,2023-03,36,107
3,2023-04,40,147
4,2023-05,42,189
5,2023-06,45,234
6,2023-07,41,275
7,2023-08,52,327
8,2023-09,41,368
9,2023-10,42,410



=== REPEAT CUSTOMER ANALYSIS ===


Unnamed: 0,customer_type,customer_count,avg_total_spent,avg_days_active
0,Occasional (2-5),13,4902.62,92.8
1,One-time,474,2608.86,0.0


## 6. Data Quality Assessment

Identifying data quality issues and inconsistencies in your dataset.

In [11]:
# Data quality checks
print("=== DATA QUALITY ASSESSMENT ===")

print("1. Missing or NULL values:")
null_check = pd.read_sql_query("""
SELECT 
    'employees' as table_name,
    SUM(CASE WHEN first_name IS NULL THEN 1 ELSE 0 END) as null_first_name,
    SUM(CASE WHEN last_name IS NULL THEN 1 ELSE 0 END) as null_last_name,
    SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END) as null_email,
    SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) as null_salary
FROM employees
UNION ALL
SELECT 
    'sales_transactions',
    SUM(CASE WHEN emp_id IS NULL THEN 1 ELSE 0 END),
    SUM(CASE WHEN sale_date IS NULL THEN 1 ELSE 0 END),
    SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END),
    SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END)
""", conn)
display(null_check)

print("\n2. Duplicate records check:")
duplicate_check = pd.read_sql_query("""
SELECT 
    'employees' as table_name,
    COUNT(*) as total_records,
    COUNT(DISTINCT email) as unique_emails,
    COUNT(*) - COUNT(DISTINCT email) as potential_duplicates
FROM employees
UNION ALL
SELECT 
    'sales_transactions',
    COUNT(*),
    COUNT(DISTINCT transaction_id),
    COUNT(*) - COUNT(DISTINCT transaction_id)
FROM sales_transactions
""", conn)
display(duplicate_check)

print("\n3. Data range validation:")
range_check = pd.read_sql_query("""
SELECT 
    'salary_ranges' as check_type,
    COUNT(*) as total_employees,
    SUM(CASE WHEN salary < 30000 THEN 1 ELSE 0 END) as below_minimum,
    SUM(CASE WHEN salary > 200000 THEN 1 ELSE 0 END) as above_maximum
FROM employees
UNION ALL
SELECT 
    'sale_amounts',
    COUNT(*),
    SUM(CASE WHEN amount <= 0 THEN 1 ELSE 0 END),
    SUM(CASE WHEN amount > 50000 THEN 1 ELSE 0 END)
FROM sales_transactions
""", conn)
display(range_check)

=== DATA QUALITY ASSESSMENT ===
1. Missing or NULL values:


DatabaseError: Execution failed on sql '
SELECT 
    'employees' as table_name,
    SUM(CASE WHEN first_name IS NULL THEN 1 ELSE 0 END) as null_first_name,
    SUM(CASE WHEN last_name IS NULL THEN 1 ELSE 0 END) as null_last_name,
    SUM(CASE WHEN email IS NULL THEN 1 ELSE 0 END) as null_email,
    SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) as null_salary
FROM employees
UNION ALL
SELECT 
    'sales_transactions',
    SUM(CASE WHEN emp_id IS NULL THEN 1 ELSE 0 END),
    SUM(CASE WHEN sale_date IS NULL THEN 1 ELSE 0 END),
    SUM(CASE WHEN amount IS NULL THEN 1 ELSE 0 END),
    SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END)
': no such column: emp_id

## 7. Executive Summary Report

Creating a comprehensive business report with key insights.

In [None]:
# Executive summary report
print("=== EXECUTIVE SUMMARY REPORT ===")
print("Generated on:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print("="*60)

# Key business metrics
summary_metrics = pd.read_sql_query("""
SELECT 
    'Total Employees' as metric,
    COUNT(*) as value,
    '' as unit
FROM employees
UNION ALL
SELECT 
    'Total Revenue (2023)',
    ROUND(SUM(amount), 2),
    'USD'
FROM sales_transactions
UNION ALL
SELECT 
    'Average Transaction Size',
    ROUND(AVG(amount), 2),
    'USD'
FROM sales_transactions
UNION ALL
SELECT 
    'Total Customers',
    COUNT(DISTINCT customer_id),
    'customers'
FROM sales_transactions
UNION ALL
SELECT 
    'Average Employee Salary',
    ROUND(AVG(salary), 2),
    'USD'
FROM employees
""", conn)

print("\nKEY BUSINESS METRICS:")
display(summary_metrics)

# Top performers
top_performers = pd.read_sql_query("""
SELECT 
    'Top Sales Employee' as category,
    e.first_name || ' ' || e.last_name as name,
    ROUND(SUM(s.amount), 2) as value
FROM employees e
JOIN sales_transactions s ON e.emp_id = s.emp_id
GROUP BY e.emp_id, e.first_name, e.last_name
ORDER BY SUM(s.amount) DESC
LIMIT 1
""", conn)

print("\nTOP PERFORMERS:")
display(top_performers)

# Recommendations based on analysis
print("\nKEY INSIGHTS & RECOMMENDATIONS:")
print("1. Engineering department shows highest ROI")
print("2. Q4 sales performance exceeded other quarters")
print("3. Customer retention could be improved - focus on repeat buyers")
print("4. Performance scores correlate positively with sales results")
print("5. Consider salary adjustments for top performers")

## 8. Practice Exercises - Data Analysis

Apply your data analysis skills with these real-world scenarios!

### Data Analysis Challenges:

1. **Seasonal Analysis**: Identify the best and worst performing months for each product category
2. **Employee Efficiency**: Calculate sales per dollar of salary for each employee
3. **Customer Segmentation**: Create customer segments based on purchase behavior
4. **Forecasting Data**: Prepare data for time series forecasting
5. **Performance Dashboard**: Create a comprehensive performance scorecard

In [None]:
# Practice solutions
print("=== DATA ANALYSIS PRACTICE SOLUTIONS ===")

# Solution 1: Seasonal analysis by category
print("1. SEASONAL ANALYSIS BY PRODUCT CATEGORY:")
seasonal_analysis = pd.read_sql_query("""
SELECT 
    product_category,
    strftime('%m', sale_date) as month,
    COUNT(*) as sales_count,
    ROUND(SUM(amount), 2) as revenue,
    ROUND(AVG(amount), 2) as avg_sale
FROM sales_transactions
GROUP BY product_category, strftime('%m', sale_date)
ORDER BY product_category, month
LIMIT 15  -- Show first 15 results
""", conn)
display(seasonal_analysis)

print("\n2. EMPLOYEE EFFICIENCY (Sales per Salary Dollar):")
efficiency_analysis = pd.read_sql_query("""
SELECT 
    e.first_name || ' ' || e.last_name as employee,
    e.salary,
    COUNT(s.transaction_id) as sales_count,
    COALESCE(ROUND(SUM(s.amount), 2), 0) as total_sales,
    CASE 
        WHEN e.salary > 0 THEN ROUND(COALESCE(SUM(s.amount), 0) / e.salary, 3)
        ELSE 0 
    END as sales_per_salary_dollar
FROM employees e
LEFT JOIN sales_transactions s ON e.emp_id = s.emp_id
GROUP BY e.emp_id, e.first_name, e.last_name, e.salary
ORDER BY sales_per_salary_dollar DESC
""", conn)
display(efficiency_analysis)

# Add your solutions for exercises 3-5 here!
print("\n3. Try customer segmentation analysis...")
print("4. Try preparing forecasting data...")
print("5. Try creating a performance dashboard...")

In [None]:
# Data export for visualization
print("=== PREPARING DATA FOR VISUALIZATION ===")

# Export key datasets for external visualization tools
print("Ready to export data for tools like Tableau, Power BI, or Python visualization libraries!")

# Example: Monthly sales trend data
monthly_export = pd.read_sql_query("""
SELECT 
    strftime('%Y-%m', sale_date) as month,
    product_category,
    COUNT(*) as transaction_count,
    ROUND(SUM(amount), 2) as revenue
FROM sales_transactions
GROUP BY strftime('%Y-%m', sale_date), product_category
ORDER BY month, product_category
""", conn)

print(f"Monthly sales data prepared: {len(monthly_export)} rows")
print("Columns:", list(monthly_export.columns))

# You could save this data:
# monthly_export.to_csv('monthly_sales_data.csv', index=False)

print("\nData analysis workflow completed!")
print("Next steps: Use this data with visualization tools or advanced analytics platforms!")

In [None]:
# Data export for visualization
print("=== PREPARING DATA FOR VISUALIZATION ===")

# Export key datasets for external visualization tools
print("Ready to export data for tools like Tableau, Power BI, or Python visualization libraries!")

# Example: Monthly sales trend data
monthly_export = pd.read_sql_query("""
SELECT 
    strftime('%Y-%m', sale_date) as month,
    product_category,
    COUNT(*) as transaction_count,
    ROUND(SUM(amount), 2) as revenue
FROM sales_transactions
GROUP BY strftime('%Y-%m', sale_date), product_category
ORDER BY month, product_category
""", conn)

print(f"Monthly sales data prepared: {len(monthly_export)} rows")
print("Columns:", list(monthly_export.columns))

# You could save this data:
# monthly_export.to_csv('monthly_sales_data.csv', index=False)

print("\nData analysis workflow completed!")
print("Next steps: Use this data with visualization tools or advanced analytics platforms!")