# Setup Script: Create Revenue Data in Databricks

This script generates dummy monthly revenue data for 2024 and 2025, loads it into a Databricks table named `monthly_revenue`, and also saves a copy to `revenue_data.csv` for reference.

**Instructions:**
1. Run all cells in order
2. The table `monthly_revenue` will be created in your Databricks workspace
3. Verify the table creation using the verification query at the end

In [0]:
# Step 1: Generate Dummy Revenue Data
# Builds a reproducible 24-row sample (12 months x 2 years) with seasonal
# patterns, prints it, and writes a CSV copy for reference.

import pandas as pd
import numpy as np

# Fixed seed so every run yields the same figures
np.random.seed(42)

months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']

# --- 2024 revenue (base year) ---
# Seasonal multipliers keyed by month number; months not listed use 1.0.
base = 50000
seasonal_factors = {
    1: 0.85, 2: 0.85, 3: 0.85,      # Q1 - slower
    6: 1.1, 7: 1.1, 8: 1.1,         # Summer - moderate
    10: 1.3, 11: 1.3, 12: 1.3,      # Q4 - holiday season
}

revenue_2024 = []
for month_num in range(1, len(months) + 1):
    # One random draw per month, in calendar order: +/-15% variation
    jitter = 1 + np.random.uniform(-0.15, 0.15)
    seasonal_factor = seasonal_factors.get(month_num, 1.0)
    revenue_2024.append(round(base * seasonal_factor * jitter, 2))

# --- 2025 revenue (target year) ---
# Each month grows from its own 2024 figure: 15% baseline growth, with a
# handful of months growing faster or slower than that baseline.
growth_rate = 1.15  # 15% overall growth
boosted_growth = growth_rate * 1.1    # applied to Mar, May, Jul, Sep, Nov
reduced_growth = growth_rate * 0.95   # applied to Feb, Apr, Jun

revenue_2025 = []
for month_num, prior_year_revenue in enumerate(revenue_2024, start=1):
    if month_num in (3, 5, 7, 9, 11):
        month_growth = boosted_growth
    elif month_num in (2, 4, 6):
        month_growth = reduced_growth
    else:
        month_growth = growth_rate

    # One random draw per month, in calendar order: +/-10% variation
    jitter = 1 + np.random.uniform(-0.1, 0.1)
    revenue_2025.append(round(prior_year_revenue * month_growth * jitter, 2))

# --- Assemble the DataFrame ---
# Rows are interleaved per month: the 2024 row first, then the 2025 row.
data = []
for month_num, (month_name, rev_24, rev_25) in enumerate(
        zip(months, revenue_2024, revenue_2025), start=1):
    for year_value, amount in ((2024, rev_24), (2025, rev_25)):
        data.append({
            'year': year_value,
            'month': month_name,
            'month_number': month_num,
            'revenue': amount,
        })

df = pd.DataFrame(data)

# --- Report ---
divider = "=" * 60
avg_2024 = df.loc[df['year'] == 2024, 'revenue'].mean()
avg_2025 = df.loc[df['year'] == 2025, 'revenue'].mean()

print("Generated Revenue Data:")
print(divider)
print(df.to_string(index=False))
print("\n" + divider)
print(f"\nAverage monthly revenue for 2024: ${avg_2024:,.2f}")
print(f"Average monthly revenue for 2025: ${avg_2025:,.2f}")
print("\n" + divider)

# Keep a CSV copy alongside the notebook for reference
df.to_csv('revenue_data.csv', index=False)
print("\nData saved to 'revenue_data.csv'")

In [0]:
# Step 2: Create Table in Databricks
# Turn the pandas DataFrame from Step 1 into a Spark DataFrame and save it
# as the table `monthly_revenue`.

from pyspark.sql import SparkSession

# getOrCreate() hands back the existing session if there is one, otherwise
# it starts a new session under the given application name
session_builder = SparkSession.builder.appName("RevenueDataSetup")
spark = session_builder.getOrCreate()

# pandas -> Spark
spark_df = spark.createDataFrame(df)

# Write in "overwrite" mode so an existing `monthly_revenue` table is replaced
table_writer = spark_df.write.mode("overwrite")
table_writer.saveAsTable("monthly_revenue")

print("Table 'monthly_revenue' created successfully!")
print("\nData preview:")
# Ask for 24 rows so the whole dataset is printed (12 months × 2 years)
spark_df.show(24)

In [0]:
# Step 3: Verify Table Creation
# Reads `monthly_revenue` back through Spark SQL and prints (a) every row in
# chronological order and (b) per-year summary statistics.
# Relies on the `spark` session created in Step 2.

# DataFrame.show() prints only the first 20 rows by default. The table holds
# 24 rows (12 months x 2 years), so request all 24 explicitly; otherwise the
# last four rows in this ordering (September-December 2025) are never shown.
spark.sql("""
SELECT *
FROM monthly_revenue
ORDER BY year, month_number
""").show(24)

# One output row per year: month count plus average, minimum, maximum and
# total revenue, each rounded to 2 decimal places.
print("\nSummary statistics by year:")
spark.sql("""
SELECT
    year,
    COUNT(*) as month_count,
    ROUND(AVG(revenue), 2) as avg_revenue,
    ROUND(MIN(revenue), 2) as min_revenue,
    ROUND(MAX(revenue), 2) as max_revenue,
    ROUND(SUM(revenue), 2) as total_revenue
FROM monthly_revenue
GROUP BY year
ORDER BY year
""").show()