# Data Generation Notebook

This notebook generates sample time-series weather data (temperature, humidity, and pressure) and saves it to `data/raw_weather_data.csv` for our pipeline.

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# Announce the start of the run in the cell output
print("Starting data generation process...")

In [None]:
# Ensure the output folder exists; an already-existing folder is not an error
os.makedirs(name='data', exist_ok=True)
print("Created data directory")

In [None]:
# Generate sample time series data
def generate_sample_data(num_days=30, seed=42):
    """Build a DataFrame of synthetic daily weather readings.

    Args:
        num_days: Number of daily rows to generate. The last row is stamped
            with the current local time and earlier rows step back one day
            at a time. Must be non-negative.
        seed: Seed for the private random generator. The default of 42
            keeps the generated values reproducible from run to run.

    Returns:
        A DataFrame with ``num_days`` rows and the columns ``date``,
        ``temperature``, ``humidity`` and ``pressure``. The three metric
        columns are independent normal draws with (mean, std) of
        (25, 5), (60, 10) and (1013, 10) respectively.

    Raises:
        ValueError: If ``num_days`` is negative.
    """
    if num_days < 0:
        raise ValueError(f"num_days must be non-negative, got {num_days}")

    # Anchor on the end date and request exactly `num_days` periods. Passing
    # both start and end includes both endpoints and yields one extra row.
    date_range = pd.date_range(end=datetime.now(), periods=num_days, freq='D')
    row_count = len(date_range)

    # A private RandomState draws the same stream as np.random.seed(seed)
    # would, without resetting the process-wide NumPy generator.
    rng = np.random.RandomState(seed)
    temperature = rng.normal(25, 5, size=row_count)
    humidity = rng.normal(60, 10, size=row_count)
    pressure = rng.normal(1013, 10, size=row_count)

    # Create DataFrame
    return pd.DataFrame({
        'date': date_range,
        'temperature': temperature,
        'humidity': humidity,
        'pressure': pressure,
    })

# Build the dataset with the default settings, then report its size
# and show the first five rows as a quick sanity check
data = generate_sample_data()
print(f"Generated {len(data)} rows of sample weather data")
print(data.head(n=5))

In [None]:
# Persist the generated frame as CSV, leaving out the index column
output_file = 'data/raw_weather_data.csv'
data.to_csv(path_or_buf=output_file, index=False)
print(f"Data saved to {output_file}")

In [None]:
# Final marker in the cell output: the notebook ran to the end
print("Data generation complete!")