In [1]:
# Task 1: Load real PVGIS data from merged CSV (all cities)
import pandas as pd

# Read the merged PVGIS export and turn its 'date' column into datetimes
source_csv = "solar_data_pvgis/all_cities_real_data.csv"
print("Loading real data from all_cities_real_data.csv...")
df = pd.read_csv(source_csv)
df['date'] = pd.to_datetime(df['date'])

# Quick summary: row count, the distinct cities (sorted), and the first rows
city_names = sorted(df['city'].unique())
print("Data loaded successfully!")
print(f"Number of rows: {df.shape[0]}")
print(f"Cities: {city_names}")
print(df.head())

# Write an untouched copy of the loaded frame next to the notebook
backup_csv = "backup_real_data.csv"
df.to_csv(backup_csv, index=False)
print("Backup saved as backup_real_data.csv")

Loading real data from all_cities_real_data.csv...
Data loaded successfully!
Number of rows: 20817
Cities: ['Agadir', 'Casa', 'Marrakech']
        date    city  mean_rad  temp_max  rad_rolling  temp_rolling
0 2005-01-01  Agadir   6.68089     17.81     6.680890     17.810000
1 2005-01-02  Agadir   6.65736     17.91     6.669125     17.860000
2 2005-01-03  Agadir   6.80250     18.26     6.713583     17.993333
3 2005-01-04  Agadir   6.77962     18.35     6.746493     18.173333
4 2005-01-05  Agadir   6.82313     17.68     6.801750     18.096667
Backup saved as backup_real_data.csv


In [5]:
# Task 2: Prepare features, one-hot encoding, and create sequences for LSTM
import numpy as np

# Reload the merged CSV so this cell starts from the raw columns
# (it replaces `df` with a one-hot encoded version below).
df = pd.read_csv("solar_data_pvgis/all_cities_real_data.csv")
df['date'] = pd.to_datetime(df['date'])

# One-hot encoding for cities: 'city' is replaced by float 'city_<name>' columns
df = pd.get_dummies(df, columns=['city'], dtype=float)

# One-hot columns double as the grouping key and as model inputs
city_cols = [col for col in df.columns if col.startswith('city_')]

# Model inputs: raw radiation/temperature, their rolling columns, and the one-hot city flags
features = ['mean_rad', 'temp_max', 'rad_rolling', 'temp_rolling'] + city_cols

# Each sample is a window of `seq_len` consecutive rows of one city
seq_len = 7

# Fixed seed so the simulated outage labels are identical on every Restart & Run All
LABEL_SEED = 42
label_rng = np.random.default_rng(LABEL_SEED)

X, y, indices = [], [], []

# Each distinct one-hot combination corresponds to one city
for _, group in df.groupby(city_cols, sort=False):
    # Windows must follow time order; a stable sort keeps the file order for equal dates
    group = group.sort_values('date', kind='stable')
    group_features = group[features].values

    # Simulated outage probability per row: base 0.05, +0.4 when mean_rad < 3.5,
    # +0.3 when temp_max > 30 (so at most 0.75; the clip is a safety net)
    prob = 0.05 + 0.4 * (group['mean_rad'] < 3.5) + 0.3 * (group['temp_max'] > 30)
    prob = np.clip(prob, 0, 1)
    labels = label_rng.binomial(1, prob.values)  # one Bernoulli draw per row

    # Rows i-seq_len .. i-1 form the input window; the label of row i is the target.
    # Groups with seq_len rows or fewer contribute no samples.
    for i in range(seq_len, len(group)):
        X.append(group_features[i-seq_len:i])
        y.append(labels[i])
        indices.append(group.index[i])  # original df index of the target row

X = np.array(X)
y = np.array(y)

print(f"Sequences created: {X.shape} (samples, time_steps, features)")
print(f"Target shape: {y.shape}")

Sequences created: (20796, 7, 7) (samples, time_steps, features)
Target shape: (20796,)


In [6]:
# Task 3: Split data into train/validation and create DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch

# X and y must already exist in the kernel (they are built by the Task 2 cell);
# nothing in this cell loads them from disk.

# Tunable settings for the split and the loaders
VAL_FRACTION = 0.2
SPLIT_SEED = 42
BATCH_SIZE = 64

# Stratified random split: both subsets keep the same share of positive labels.
# NOTE(review): the Task 2 windows overlap in time (stride 1), so a random split puts
# near-identical windows in both subsets; consider a chronological split if the
# validation score should reflect performance on unseen dates.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_FRACTION, random_state=SPLIT_SEED, stratify=y
)

# Convert to float32 PyTorch tensors; targets get a trailing axis -> shape (n, 1)
X_train_t = torch.from_numpy(X_train).float()
y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
X_val_t = torch.from_numpy(X_val).float()
y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)

# Seeded generator so the training shuffle order is reproducible, like the split above
shuffle_generator = torch.Generator().manual_seed(SPLIT_SEED)

# Create DataLoaders: shuffled batches for training, fixed order for validation
train_loader = DataLoader(
    TensorDataset(X_train_t, y_train_t),
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=shuffle_generator,
)
val_loader = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=BATCH_SIZE)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print("DataLoaders ready!")

ModuleNotFoundError: No module named 'torch'