In [2]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub. When run inside this notebook the call
# rendered an interactive widget (the VBox output of this cell) rather than a
# terminal prompt; the token is entered there and never stored in the notebook.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
from datasets import load_dataset

import os

# Hub repository that hosts the three configs downloaded below.
HF_DATASET_REPO = "EDS-lab/electricity-demand"
# Local directory the parquet copies are written to (relative to the working directory).
OUTPUT_DIR = 'data'


def _download_config(config_name):
    """
    Download one config of HF_DATASET_REPO and store its 'train' split locally.

    The split is converted to a pandas DataFrame and written to
    OUTPUT_DIR/<config_name>.parquet.

    Returns the loaded dataset object and the DataFrame so the calling cell can
    keep both available to later cells. Any exception from loading or writing
    propagates to the caller.
    """
    dataset = load_dataset(HF_DATASET_REPO, config_name)
    frame = dataset['train'].to_pandas()
    frame.to_parquet(os.path.join(OUTPUT_DIR, f'{config_name}.parquet'))
    return dataset, frame


# Create data directory if it doesn't exist.
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
os.makedirs(OUTPUT_DIR, exist_ok=True)

try:
    # Load and save weather data
    weather_dataset, df_weather = _download_config('weather')
    print("Weather data saved successfully")

    # Load and save demand data
    demand_dataset, df_demand = _download_config('demand')
    print("Demand data saved successfully")

    # Load and save metadata
    metadata_dataset, df_metadata = _download_config('metadata')
    print("Metadata saved successfully")

except Exception as e:
    # Best-effort cell: report the failure and still run the verification below,
    # which shows exactly which files made it to disk.
    print(f"Error saving data: {e}")

# Verify the files
for file in ['weather.parquet', 'demand.parquet', 'metadata.parquet']:
    path = os.path.join(OUTPUT_DIR, file)
    if os.path.exists(path):
        size = os.path.getsize(path) / (1024 * 1024)  # Convert to MB
        print(f"{file}: {size:.2f} MB")
    else:
        print(f"{file} not found")

weather.parquet:   0%|          | 0.00/36.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Weather data saved successfully


demand.parquet:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

Demand data saved successfully


metadata.parquet:   0%|          | 0.00/202k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Metadata saved successfully
weather.parquet: 34.61 MB
demand.parquet: 987.38 MB
metadata.parquet: 0.20 MB


In [9]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import pyarrow.parquet as pq

def analyze_demand_stats(file_path, batch_size=10000):
    """
    Compute summary statistics for the demand parquet file, one row group at a time.

    Only the 'timestamp', 'y' and 'unique_id' columns are read. Each row group is
    loaded on its own and folded into running aggregates in slices of
    ``batch_size`` rows, so the whole file is never held in memory at once.

    Missing values are handled explicitly:
    - NaN demand values are excluded from mean/std/min/max *and* from their
      denominator (dividing a NaN-skipping sum by the total row count would
      bias the mean towards zero).
    - NaT timestamps are ignored when tracking the date range.
    - Missing IDs are not counted as an ID.

    Parameters
    ----------
    file_path : str or path-like
        Location of the parquet file.
    batch_size : int, default 10000
        Number of rows folded into the running aggregates per step. Must be >= 1.

    Returns
    -------
    dict
        'total_rows'     : rows read, including rows whose 'y' is missing
        'mean', 'std'    : mean and population (ddof=0) standard deviation of
                           the non-missing 'y' values; NaN if there are none
        'min', 'max'     : extremes of the non-missing 'y' values; NaN if none
        'date_range'     : (earliest, latest) timestamp, or (None, None)
        'num_unique_ids' : number of distinct non-missing 'unique_id' values

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Running aggregates
    total_rows = 0
    valid_count = 0          # number of non-missing 'y' values seen so far
    mean = 0.0               # running mean of the non-missing 'y' values
    m2 = 0.0                 # running sum of squared deviations from `mean`
    min_val = float('inf')
    max_val = float('-inf')
    min_date = None
    max_date = None
    unique_ids = set()

    # Open the parquet file
    parquet_file = pq.ParquetFile(file_path)
    num_row_groups = parquet_file.num_row_groups

    # Process each row group
    for i in range(num_row_groups):
        # Read one row group at a time
        row_group = parquet_file.read_row_group(i, columns=['timestamp', 'y', 'unique_id'])
        df_chunk = row_group.to_pandas()

        # Fold the row group into the aggregates in smaller slices; this keeps
        # the temporaries created below bounded by batch_size rows.
        for start_idx in range(0, len(df_chunk), batch_size):
            batch = df_chunk.iloc[start_idx:start_idx + batch_size]
            total_rows += len(batch)

            # Demand statistics over the non-missing values only
            y = batch['y'].dropna()
            n_batch = len(y)
            if n_batch:
                # Merge this batch's (count, mean, M2) into the running values.
                # Combining centred sums avoids the cancellation error of the
                # E[x^2] - mean^2 formula, which can even go negative.
                batch_mean = float(y.mean())
                batch_m2 = float(((y - batch_mean) ** 2).sum())
                delta = batch_mean - mean
                merged_count = valid_count + n_batch
                mean += delta * n_batch / merged_count
                m2 += batch_m2 + delta * delta * valid_count * n_batch / merged_count
                valid_count = merged_count

                min_val = min(min_val, y.min())
                max_val = max(max_val, y.max())

            # Date range; NaT is dropped first because once NaT is stored every
            # later comparison against it is False and the range would stick.
            timestamps = batch['timestamp'].dropna()
            if len(timestamps):
                batch_min_date = timestamps.min()
                batch_max_date = timestamps.max()
                min_date = batch_min_date if min_date is None else min(min_date, batch_min_date)
                max_date = batch_max_date if max_date is None else max(max_date, batch_max_date)

            # Unique IDs; NaN is dropped because distinct NaN objects do not
            # compare equal and could each be added to the set separately.
            unique_ids.update(batch['unique_id'].dropna().unique())

        # Print progress
        print(f"Processed row group {i+1}/{num_row_groups} ({total_rows:,} rows so far)...")

    # Calculate final statistics
    if valid_count:
        std = float(np.sqrt(m2 / valid_count))
    else:
        # No usable demand values (empty file or all-NaN column): report NaN
        # instead of dividing by zero or leaking the +/-inf sentinels.
        mean = std = min_val = max_val = float('nan')

    return {
        'total_rows': total_rows,
        'mean': mean,
        'std': std,
        'min': min_val,
        'max': max_val,
        'date_range': (min_date, max_date),
        'num_unique_ids': len(unique_ids)
    }

def analyze_metadata(metadata_path):
    """
    Load the whole metadata parquet file and print a short overview of it.

    The overview lists the row count, the column names and, when a
    'building_class' column is present, how often each class occurs.

    Returns the loaded DataFrame. If anything raises along the way, the error
    is printed and None is returned instead.
    """
    class_column = 'building_class'
    try:
        meta = pd.read_parquet(metadata_path)

        print("\nMetadata Analysis:")
        print("-----------------")
        print(f"Total buildings: {meta.shape[0]}")

        print("\nColumns available:")
        column_names = list(meta.columns)
        print(column_names)

        # Membership on a DataFrame tests its column labels.
        if class_column in meta:
            print("\nBuilding class distribution:")
            class_counts = meta[class_column].value_counts()
            print(class_counts)

        return meta
    except Exception as err:
        print(f"Error reading metadata: {str(err)}")
        return None

def analyze_weather_sample(weather_path, sample_rows=1000):
    """
    Print an overview of the leading rows of the weather parquet file.

    Only the first row group is read; its first `sample_rows` rows form the
    sample. The overview shows the sample size, the file's row-group count,
    the column names and a describe() summary of every numeric column.

    Returns the sample DataFrame. If anything raises along the way, the error
    is printed and None is returned instead.
    """
    try:
        pf = pq.ParquetFile(weather_path)

        # Row group 0 only, then trimmed to the requested number of rows
        leading_rows = pf.read_row_group(0).to_pandas()
        sample = leading_rows.head(sample_rows)

        print("\nWeather Data Analysis (Sample):")
        print("-----------------------------")
        print(f"Sample size: {len(sample):,} records")
        print(f"Total row groups: {pf.num_row_groups}")

        print("\nColumns available:")
        print(sample.columns.tolist())

        print("\nNumeric columns summary:")
        numeric_only = sample.select_dtypes(include=[np.number])
        for name in numeric_only.columns:
            print(f"\n{name}:")
            print(sample[name].describe())

        return sample
    except Exception as err:
        print(f"Error reading weather data: {str(err)}")
        return None

# Set up paths
# The download cell writes its parquet files to ./data, whereas this cell has
# always read from ../data. Check both so the notebook works from a fresh
# kernel regardless of which layout is in use; ../data stays first so setups
# that already rely on it behave exactly as before.
candidate_dirs = [os.path.join('..', 'data'), 'data']
data_path = next(
    (d for d in candidate_dirs if os.path.exists(os.path.join(d, 'demand.parquet'))),
    candidate_dirs[0],  # nothing found: keep the original default so the error names it
)
demand_path = os.path.join(data_path, 'demand.parquet')
weather_path = os.path.join(data_path, 'weather.parquet')
metadata_path = os.path.join(data_path, 'metadata.parquet')

# Run analyses
print("Analyzing demand data (this may take a while)...")
try:
    demand_stats = analyze_demand_stats(demand_path)
    
    print("\nDemand Data Analysis:")
    print("-------------------")
    print(f"Total records: {demand_stats['total_rows']:,}")
    print(f"Number of unique buildings/meters: {demand_stats['num_unique_ids']:,}")
    print(f"Date range: {demand_stats['date_range'][0]} to {demand_stats['date_range'][1]}")
    print("\nElectricity demand statistics:")
    print(f"Mean: {demand_stats['mean']:.2f}")
    print(f"Std Dev: {demand_stats['std']:.2f}")
    print(f"Min: {demand_stats['min']:.2f}")
    print(f"Max: {demand_stats['max']:.2f}")
except Exception as e:
    # Best-effort: report the failure and still run the metadata/weather summaries.
    print(f"Error analyzing demand data: {str(e)}")

# Analyze metadata
metadata_df = analyze_metadata(metadata_path)

# Analyze weather data sample
weather_sample = analyze_weather_sample(weather_path)

Analyzing demand data (this may take a while)...
Processed row group 1/227 (1,048,576 rows so far)...
Processed row group 2/227 (2,097,152 rows so far)...
Processed row group 3/227 (3,145,728 rows so far)...
Processed row group 4/227 (4,194,304 rows so far)...
Processed row group 5/227 (5,242,880 rows so far)...
Processed row group 6/227 (6,291,456 rows so far)...
Processed row group 7/227 (7,340,032 rows so far)...
Processed row group 8/227 (8,388,608 rows so far)...
Processed row group 9/227 (9,437,184 rows so far)...
Processed row group 10/227 (10,485,760 rows so far)...
Processed row group 11/227 (11,534,336 rows so far)...
Processed row group 12/227 (12,582,912 rows so far)...
Processed row group 13/227 (13,631,488 rows so far)...
Processed row group 14/227 (14,680,064 rows so far)...
Processed row group 15/227 (15,728,640 rows so far)...
Processed row group 16/227 (16,777,216 rows so far)...
Processed row group 17/227 (17,825,792 rows so far)...
Processed row group 18/227 (18,874