# Data Generation Notebook

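This notebook generates synthetic hourly time-series weather data (temperature, humidity, wind speed, and precipitation) covering the past 30 days, saves a local CSV copy, and uploads the CSV to MinIO as the raw input for our pipeline.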

In [None]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import random
import datetime
import boto3
from io import StringIO
import os

In [None]:
# Cell 2: Seed NumPy's global random generator so that the values drawn by
# the np.random.* calls in later cells are the same on every run.
np.random.seed(seed=42)

In [None]:
# Cell 3: Generate sample weather data
# Builds `weather_data`, an hourly DataFrame covering the last 30 days with
# synthetic temperature, humidity, wind-speed and precipitation columns.
print("Generating synthetic weather data...")

# Create date range for the past month.
# NOTE: the range is anchored to the current wall-clock time, so the
# timestamps differ between runs even when the random values are seeded.
end_date = datetime.datetime.now()
start_date = end_date - datetime.timedelta(days=30)
# 'h' is the hourly offset alias; the upper-case 'H' spelling is deprecated
# since pandas 2.2 and emits a FutureWarning.
dates = pd.date_range(start=start_date, end=end_date, freq='h')
n_samples = len(dates)

# The draws below are made in a fixed order (temperature, humidity, wind,
# rain mask, rain amount) so a given seed always yields the same columns.

# Generate random temperature data (°C): normal around 22, bounded to [-5, 35]
temperatures = np.clip(
    np.random.normal(loc=22, scale=5, size=n_samples), -5, 35
).round(1)

# Generate humidity data (%): normal around 60, bounded to [20, 100]
humidities = np.clip(
    np.random.normal(loc=60, scale=15, size=n_samples), 20, 100
).round(1)

# Generate wind speed (km/h): log-normal (never negative), capped at 100
wind_speeds = np.minimum(
    np.random.lognormal(mean=1.5, sigma=0.5, size=n_samples), 100
).round(1)

# Generate precipitation (mm): zero except for the hours selected by the mask
precipitation = np.zeros(n_samples)
rain_mask = np.random.random(n_samples) < 0.2  # 20% chance of rain
# One exponential draw per rainy hour; rain_mask.sum() counts the True entries.
precipitation[rain_mask] = np.random.exponential(scale=5, size=rain_mask.sum())
precipitation = precipitation.round(1)

# Create dataframe
weather_data = pd.DataFrame({
    'date': dates,
    'temperature': temperatures,
    'humidity': humidities,
    'wind_speed': wind_speeds,
    'precipitation': precipitation,
})

print(f"Generated {len(weather_data)} rows of weather data")
print(weather_data.head())

In [None]:
# Cell 4: Save the data to MinIO
# Writes a local CSV copy of `weather_data` for debugging, then uploads the
# same CSV content to the MinIO bucket through the S3-compatible API.

# The local copy is only a debugging aid: report a failure here on its own,
# so it neither blocks the upload nor gets misreported as a MinIO error.
try:
    os.makedirs('data', exist_ok=True)
    weather_data.to_csv('data/raw_weather_data.csv', index=False)
    print("Saved data locally to data/raw_weather_data.csv")
except OSError as e:
    print(f"Error saving local copy: {e}")

# Now save to MinIO
try:
    # Set up the MinIO/S3 client.
    # Endpoint and credentials can be overridden through environment
    # variables so secrets need not live in the notebook; the defaults are
    # the values that were previously hard-coded here.
    s3_client = boto3.client(
        's3',
        endpoint_url=os.environ.get(
            'MINIO_ENDPOINT_URL',
            'http://minio.minio-system.svc.cluster.local:9000',
        ),
        aws_access_key_id=os.environ.get('MINIO_ACCESS_KEY', 'minio'),
        aws_secret_access_key=os.environ.get('MINIO_SECRET_KEY', 'minio123'),
        region_name='us-east-1',  # Can be any region, doesn't matter for MinIO
    )

    # Convert dataframe to CSV; to_csv() without a target returns the text,
    # which is encoded explicitly so the request body is bytes.
    csv_body = weather_data.to_csv(index=False).encode('utf-8')

    # Upload to MinIO
    s3_client.put_object(
        Bucket='elyra-airflow',
        Key='weather-dataprocessing-0519232703/raw_weather_data.csv',
        Body=csv_body,
    )
    print("Successfully saved raw_weather_data.csv to MinIO")
except Exception as e:
    # Best-effort, as before: report the failure instead of aborting the cell.
    # NOTE(review): downstream steps that read this object will fail later if
    # the upload did not happen — consider re-raising when run in a pipeline.
    print(f"Error saving to MinIO: {e}")