In [1]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from arch import arch_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import pipeline
from prophet import Prophet
import yfinance as yf
import pickle
import warnings
import time
warnings.filterwarnings("ignore")

def fetch_energy_data(fallback=False):
    """
    Fetches daily energy prices using YFinance for futures.
    Returns a tuple (DataFrame, bool) to indicate success/fallback.
    """
    if fallback:
        print("Using synthetic data as fallback")
        dates = pd.date_range('2015-01-01', '2025-09-08', freq='D')
        np.random.seed(42)
        df = pd.DataFrame({
            'Petrol': np.clip(np.random.normal(loc=100, scale=10, size=len(dates)), 50, 150),
            'Uranium': np.clip(np.random.normal(loc=50, scale=5, size=len(dates)), 20, 80),
            'Natural_Gas': np.clip(np.random.normal(loc=5, scale=0.5, size=len(dates)), 2, 8),
            'Crude_Oil': np.clip(np.random.normal(loc=70, scale=7, size=len(dates)), 30, 110)
        }, index=dates)
        return df, True

    tickers = {
        'Petrol': 'RB=F',
        'Natural_Gas': 'NG=F',
        'Crude_Oil': 'CL=F',
        'Uranium': 'URA'
    }
    dfs = []

    # Get the earliest date available for a proxy ticker
    try:
        start_date = yf.download('CL=F', period='max').index.min()
        print(f"Starting data download from: {start_date.date()}")
    except Exception as e:
        print(f"Could not determine earliest date: {e}. Defaulting to 2015-01-01.")
        start_date = pd.to_datetime('2015-01-01')

    # Download data for all tickers from the determined start date
    for name, ticker in tickers.items():
        try:
            # Explicitly setting start date based on the longest available history
            df = yf.download(ticker, start=start_date, progress=False, timeout=60)
            if df.empty:
                print(f"No data returned for {name}.")
                continue
            
            df = df[['Close']].rename(columns={'Close': name})
            df.index = pd.to_datetime(df.index)
            dfs.append(df)
            print(f"Successfully fetched {name}: {len(df)} records.")
        except Exception as e:
            print(f"Error fetching {name}: {e}. Skipping this ticker.")

    if not dfs:
        print("No data fetched. Falling back to synthetic data.")
        return fetch_energy_data(fallback=True)

    energy_df = pd.concat(dfs, axis=1, join='outer')
    energy_df = energy_df.replace([np.inf, -np.inf], np.nan).dropna(how='all').ffill().bfill()
    energy_df.index = pd.to_datetime(energy_df.index)

    # Resample to monthly and fill NaNs
    energy_df = energy_df.resample('M').mean().ffill().bfill()

    # The rest of your function remains the same
    # ...
    energy_df.to_parquet('energy_prices.parquet')
    print(f"Final DataFrame shape: {energy_df.shape}")
    return energy_df, False

def preprocess_data(df):
    """
    Applies EDA and preprocessing to multi-source DataFrame.
    """
    df = df.replace([np.inf, -np.inf], np.nan).dropna(how='all').ffill().bfill()
    print(f"Preprocess input columns: {list(df.columns)}")
    
    for col in df.columns:
        df[f'{col}_rolling'] = df[col].rolling(12).mean()
        df[f'{col}_std'] = df[col].rolling(12).std()
    
    df = df.fillna(method='bfill')
    df.columns = df.columns.astype(str)  # Ensure string column names
    print(f"Preprocess output columns: {list(df.columns)}")
    
    fig, ax = plt.subplots(figsize=(16, 8))
    for col in df.columns[:4]:
        sns.lineplot(data=df, x=df.index, y=col, label=col)
    plt.title('Energy Prices Over Time')
    plt.savefig('energy_prices_plot.png')
    plt.close()
    
    transformed_dfs = {}
    for col in df.columns[:4]:
        if df[col].isna().all():
            print(f"Skipping {col}: All values are NaN")
            continue
        col_data = df[col].replace([np.inf, -np.inf], np.nan).dropna()
        if len(col_data) < 2:
            print(f"Skipping {col}: Insufficient data after cleaning")
            continue
        adf_result = adfuller(col_data)
        print(f"{col} ADF p-value: {adf_result[1]}")
        
        if adf_result[1] > 0.05:
            temp_df = pd.DataFrame({col: col_data})
            temp_df['log'] = np.log(temp_df[col].replace(0, np.nan).fillna(method='ffill'))
            temp_df['log_sqrt'] = np.sqrt(temp_df['log'].replace([np.inf, -np.inf], np.nan).fillna(method='bfill'))
            temp_df['diff'] = temp_df['log_sqrt'].diff().dropna()
            transformed_dfs[col] = temp_df['diff']
        else:
            transformed_dfs[col] = col_data
    
    transformed_df = pd.DataFrame(transformed_dfs)
    transformed_df.columns = transformed_df.columns.astype(str)  # Ensure string column names
    print(f"Transformed DataFrame columns: {list(transformed_df.columns)}")
    return transformed_df, df

def add_nlp_sentiment(df):
    """
    Adds sentiment feature from simulated energy news/tweets.
    """
    print(f"Sentiment input columns: {list(df.columns)}")
try:
    sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english', framework='pt', device=-1)
except Exception as e:
    sentiment_analyzer = None
    print(f"Failed to load sentiment model: {e}")

def fetch_energy_data(fallback=False):
    """
    Fetches daily energy prices using YFinance for futures (Petrol, Natural Gas, Crude Oil, Uranium).
    Returns a tuple (DataFrame, bool) to indicate success/fallback.
    """
    if fallback:
        print("Using synthetic data as fallback")
        dates = pd.date_range('2015-01-01', '2025-09-08', freq='D')
        np.random.seed(42)
        df = pd.DataFrame({
            'Petrol': np.clip(np.random.normal(loc=100, scale=10, size=len(dates)), 50, 150),
            'Uranium': np.clip(np.random.normal(loc=50, scale=5, size=len(dates)), 20, 80),
            'Natural_Gas': np.clip(np.random.normal(loc=5, scale=0.5, size=len(dates)), 2, 8),
            'Crude_Oil': np.clip(np.random.normal(loc=70, scale=7, size=len(dates)), 30, 110)
        }, index=dates)
        print(f"Synthetic DataFrame columns: {list(df.columns)}")
        return df, True

    tickers = {
        'Petrol': 'RB=F',
        'Natural_Gas': 'NG=F',
        'Crude_Oil': 'CL=F',
        'Uranium': 'URA'
    }
    dfs = []
    for name, ticker in tickers.items():
        try:
            df = yf.download(ticker, start='2015-01-01', end='2025-09-08', progress=False, timeout=60)
            if df.empty:
                print(f"No data returned for {name}. Using synthetic data.")
                dates = pd.date_range('2015-01-01', '2025-09-08', freq='D')
                mean, std = {'Petrol': (100, 10), 'Natural_Gas': (5, 0.5), 'Crude_Oil': (70, 7), 'Uranium': (50, 5)}[name]
                synthetic_df = pd.DataFrame({name: np.random.normal(loc=mean, scale=std, size=len(dates))}, index=dates)
                dfs.append(synthetic_df)
                continue
            df = df[['Close']].rename(columns={'Close': name})
            df.index = pd.to_datetime(df.index)
            dfs.append(df)
            print(f"Successfully fetched {name}: {len(df)} records")
        except Exception as e:
            print(f"Error fetching {name}: {e}. Using synthetic data for {name}.")
            dates = pd.date_date_range('2015-01-01', '2025-09-08', freq='D')
            mean, std = {'Petrol': (100, 10), 'Natural_Gas': (5, 0.5), 'Crude_Oil': (70, 7), 'Uranium': (50, 5)}[name]
            synthetic_df = pd.DataFrame({name: np.random.normal(loc=mean, scale=std, size=len(dates))}, index=dates)
            dfs.append(synthetic_df)

    if not dfs:
        print("No data fetched. Falling back to synthetic data.")
        return fetch_energy_data(fallback=True)

    energy_df = pd.concat(dfs, axis=1, join='outer')
    energy_df.columns = [c[0] if isinstance(c, tuple) else c for c in energy_df.columns]
    energy_df = energy_df.replace([np.inf, -np.inf], np.nan).dropna(how='all').ffill().bfill()
    energy_df.index = pd.to_datetime(energy_df.index)
    
    if not energy_df.index.is_unique:
        energy_df = energy_df[~energy_df.index.duplicated(keep='first')]

    if not isinstance(energy_df.index, pd.DatetimeIndex):
        raise ValueError("Index is not a DatetimeIndex after concatenation")

    energy_df = energy_df.resample('M').mean().ffill().bfill()
    for col in energy_df.columns:
        mean, std = energy_df[col].mean(), energy_df[col].std()
        energy_df[col] = energy_df[col].clip(lower=mean - 3*std, upper=mean + 3*std)

    if energy_df.isna().any().any():
        energy_df = energy_df.bfill()

    energy_df.to_parquet('energy_prices.parquet')
    print(f"Final DataFrame shape: {energy_df.shape}")
    return energy_df, False

def preprocess_data(df):
    """
    Applies EDA and preprocessing to multi-source DataFrame.
    """
    df = df.replace([np.inf, -np.inf], np.nan).dropna(how='all').ffill().bfill()
    print(f"Preprocess input columns: {list(df.columns)}")
    
    for col in df.columns:
        df[f'{col}_rolling'] = df[col].rolling(12).mean()
        df[f'{col}_std'] = df[col].rolling(12).std()
    
    df = df.fillna(method='bfill')
    df.columns = df.columns.astype(str)
    print(f"Preprocess output columns: {list(df.columns)}")
    
    fig, ax = plt.subplots(figsize=(16, 8))
    for col in df.columns[:4]:
        sns.lineplot(data=df, x=df.index, y=col, label=col)
    plt.title('Energy Prices Over Time')
    plt.savefig('energy_prices_plot.png')
    plt.close()
    
    transformed_dfs = {}
    for col in df.columns[:4]:
        if df[col].isna().all():
            print(f"Skipping {col}: All values are NaN")
            continue
        col_data = df[col].replace([np.inf, -np.inf], np.nan).dropna()
        if len(col_data) < 2:
            print(f"Skipping {col}: Insufficient data after cleaning")
            continue
        adf_result = adfuller(col_data)
        print(f"{col} ADF p-value: {adf_result[1]}")
        
        if adf_result[1] > 0.05:
            temp_df = pd.DataFrame({col: col_data})
            temp_df['log'] = np.log(temp_df[col].replace(0, np.nan).fillna(method='ffill'))
            temp_df['log_sqrt'] = np.sqrt(temp_df['log'].replace([np.inf, -np.inf], np.nan).fillna(method='bfill'))
            temp_df['diff'] = temp_df['log_sqrt'].diff().dropna()
            transformed_dfs[col] = temp_df['diff']
        else:
            transformed_dfs[col] = col_data
    
    transformed_df = pd.DataFrame(transformed_dfs)
    transformed_df.columns = transformed_df.columns.astype(str)
    print(f"Transformed DataFrame columns: {list(transformed_df.columns)}")
    return transformed_df, df

def add_nlp_sentiment(df, analyzer):
    """
    Adds sentiment feature from simulated energy news/tweets.
    """
    print(f"Sentiment input columns: {list(df.columns)}")
    if analyzer is None:
        print("Sentiment analysis model not loaded. Adding zero sentiment.")
        df['Sentiment'] = 0.0
        df.columns = df.columns.astype(str)
        return df
    
    try:
        sample_texts = [
            "OPEC cuts production, oil prices surge",
            "Natural gas reserves high, prices drop",
            "Renewable energy boom affects crude demand",
            "Geopolitical tensions in Middle East spike petrol prices"
        ] * (len(df) // 4 + 1)
        
        scores = []
        for text in sample_texts[:len(df)]:
            result = analyzer(text)[0]
            score = result['score'] if result['label'] == 'POSITIVE' else -result['score']
            scores.append(score)
        
        df['Sentiment'] = pd.Series(scores, index=df.index[:len(scores)]).rolling(5).mean().ffill().bfill()
    except Exception as e:
        print(f"Sentiment analysis failed: {e}. Adding zero sentiment.")
        df['Sentiment'] = 0.0
    df.columns = df.columns.astype(str)
    print(f"Sentiment output columns: {list(df.columns)}")
    return df

def train_arima(df, target='Petrol'):
    """
    ARIMA model with auto_arima for optimal parameter selection and exogenous regressors.
    """
    print(f"ARIMA input columns: {list(df.columns)}")
    if target not in df.columns:
        print(f"Target column '{target}' not in DataFrame. Available columns: {list(df.columns)}")
        return None, None
    
    # Exogenous variables (all other columns)
    features = [col for col in df.columns if col != target and 'rolling' in col and 'std' in col or 'Sentiment' in col]
    X = df[features].ffill().bfill()
    
    cutoff = int(len(df) * 0.65)
    train, test = df[target][:cutoff], df[target][cutoff:]
    X_train, X_test = X[:cutoff], X[cutoff:]

    if train.isna().any() or test.isna().any():
        print(f"NaNs detected in train or test data for {target}. Skipping.")
        return None, None
    
    try:
        # Use auto_arima to find the best p, d, q parameters
        # and include other columns as exogenous variables
        model_fit = auto_arima(
            train, 
            exogenous=X_train, 
            seasonal=True, # Enable seasonal component
            m=12,          # Monthly data, so seasonality is 12
            stepwise=True,
            suppress_warnings=True,
            error_action='ignore',
            n_jobs=-1
        )
        
        preds = model_fit.predict(n_periods=len(test), exogenous=X_test)
        
        if np.any(np.isnan(preds)):
            print(f"NaNs in ARIMA predictions for {target}. Skipping.")
            return None, None
        
        mse = mean_squared_error(test, preds)
        print(f"ARIMAX MSE for {target}: {mse}")
        
        with open('arima_model.pkl', 'wb') as f:
            pickle.dump(model_fit, f)
        return preds, mse
    except Exception as e:
        print(f"ARIMA failed for {target}: {e}")
        return None, None


def train_lstm(df, target='Petrol', seq_length=12):
    """
    LSTM model with improved architecture and regularization.
    """
    # ... (existing data preprocessing and scaling code)
    # ...
    X_train, y_train = create_sequences(scaled_df.values[:cutoff], seq_length)
    X_test, y_test = create_sequences(scaled_df.values[cutoff:], seq_length)
    
    if len(X_train) == 0 or len(X_test) == 0:
        print(f"Insufficient valid sequences for LSTM. Returning None.")
        return None, None

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    
    class LSTM(nn.Module):
        def __init__(self, input_size=len(df.columns), hidden_size=50, num_layers=2, dropout_rate=0.2):
            super().__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
            self.fc = nn.Linear(hidden_size, 1)
        
        def forward(self, x):
            out, _ = self.lstm(x)
            return self.fc(out[:, -1, :])
    
    # Create the model with Dropout and more epochs
    model = LSTM(input_size=len(scaled_df.columns), hidden_size=100, num_layers=3, dropout_rate=0.3)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    
    for epoch in range(200): # Increase epochs for better training
        model.train()
        optimizer.zero_grad()
        output = model(X_train)
        loss = criterion(output, y_train)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if epoch % 20 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item()}")
    
    model.eval()
    with torch.no_grad():
        preds = model(X_test).squeeze().numpy()
    
    full_scaler = MinMaxScaler().fit(df[target].values.reshape(-1,1))
    
    # Adjust y_test for correct inverse transformation
    y_test_full_scaled = full_scaler.transform(y_test.reshape(-1, 1))
    preds_full_scaled = full_scaler.transform(preds.reshape(-1, 1))
    
    mse = mean_squared_error(y_test_full_scaled, preds_full_scaled)
    print(f"LSTM MSE for {target}: {mse}")
    
    torch.save(model.state_dict(), 'lstm_model.pth')
    return preds, mse


def train_prophet(df, target='Petrol'):
    """
    Prophet model with hyperparameter tuning and external regressors.
    """
    print(f"Prophet input columns: {list(df.columns)}")
    if target not in df.columns:
        print(f"Target column '{target}' not in DataFrame. Available columns: {list(df.columns)}")
        return None, None

    prophet_df = df.reset_index().rename(columns={'index': 'ds', target: 'y'})
    
    # Add other energy sources and sentiment as regressors
    regressors = [col for col in df.columns if col != target]
    for regressor in regressors:
        prophet_df[regressor] = df[regressor].ffill().bfill().values
    
    cutoff = int(len(prophet_df) * 0.65)
    train, test = prophet_df[:cutoff], prophet_df[cutoff:]
    
    if train['y'].isna().any() or test['y'].isna().any():
        print(f"NaNs detected in train or test data for {target}. Skipping.")
        return None, None
    
    try:
        # Initialize Prophet with tuning parameters
        model = Prophet(
            growth='linear',
            seasonality_mode='multiplicative', # Multiplicative seasonality for financial data
            changepoint_prior_scale=0.05,
            seasonality_prior_scale=10.0
        )
        
        # Add the external regressors
        for regressor in regressors:
            model.add_regressor(regressor)

        model.fit(train[['ds', 'y'] + regressors])
        
        future = model.make_future_dataframe(periods=len(test), freq=df.index.freq)
        future = pd.merge(future, prophet_df[['ds'] + regressors], on='ds', how='left')
        
        forecast = model.predict(future)
        
        if forecast['yhat'].isna().any():
            print(f"NaNs in Prophet predictions for {target}. Skipping.")
            return None, None
        
        # Match forecast and test data for MSE calculation
        forecast_test = forecast.iloc[cutoff:]
        
        mse = mean_squared_error(test['y'], forecast_test['yhat'])
        print(f"Prophet MSE for {target}: {mse}")
        
        with open('prophet_model.pkl', 'wb') as f:
            pickle.dump(model, f)
        return forecast_test['yhat'], mse
    except Exception as e:
        print(f"Prophet failed for {target}: {e}")
        return None, None


def additional_analyses(df, target='Petrol'):
    """
    Correlations, volatility (GARCH), and Monte Carlo simulations.
    """
    print(f"Additional analyses input columns: {list(df.columns)}")
    if target not in df.columns:
        print(f"Target column '{target}' not in DataFrame. Available columns: {list(df.columns)}")
        return None, None, None
    corr = df.corr()
    sns.heatmap(corr, annot=True)
    plt.title('Energy Sources Correlations')
    plt.savefig('correlations_heatmap.png')
    plt.close()
    
    returns = df[target].pct_change().dropna() * 100
    if returns.empty:
        print(f"No valid returns data for {target}. Skipping GARCH.")
        return corr, None, None
    garch_model = arch_model(returns, vol='Garch', p=1, q=1)
    garch_fit = garch_model.fit(disp='off')
    vol_forecast = garch_fit.forecast(horizon=12)
    print("12-Month Volatility Forecast:", vol_forecast.variance.iloc[-1])
    
    last_price = df[target].iloc[-1]
    vol = df[target].pct_change().std()
    simulations = 100
    periods = 12
    paths = np.random.normal(0, vol, size=(periods, simulations))
    future_prices = last_price * np.exp(np.cumsum(paths, axis=0))
    mean_path = future_prices.mean(axis=1)
    
    plt.plot(mean_path)
    plt.title('Monte Carlo Price Simulation')
    plt.savefig('monte_carlo_sim.png')
    plt.close()
    
    return corr, vol_forecast, mean_path

def main():
    """
    Orchestrates the pipeline.
    """
    try:
        # Correctly unpack the tuple returned by fetch_energy_data
        df, _ = fetch_energy_data(fallback=False)
        transformed_df, original_df = preprocess_data(df)
        df_with_sentiment = add_nlp_sentiment(original_df, sentiment_analyzer)
        
        target = 'Petrol' if 'Petrol' in df_with_sentiment.columns else df_with_sentiment.columns[0]
        print(f"Using target column: {target}")
        
        arima_preds, arima_mse = train_arima(df_with_sentiment, target=target)
        lstm_preds, lstm_mse = train_lstm(df_with_sentiment, target=target)
        
        if lstm_preds is None:
            print("LSTM failed. Skipping LSTM results.")
            lstm_mse = float('inf')
        
        prophet_preds, prophet_mse = train_prophet(df_with_sentiment, target=target)
        
        corr, vol, sim = additional_analyses(df_with_sentiment, target=target)
        
        results = {
            'arima_mse': arima_mse if arima_mse is not None else float('inf'),
            'lstm_mse': lstm_mse if lstm_mse is not None else float('inf'),
            'prophet_mse': prophet_mse if prophet_mse is not None else float('inf')
        }
        pd.DataFrame([results]).to_csv('model_results.csv')
    except Exception as e:
        print(f"An unhandled error occurred in main.py: {e}")
        # Return a non-zero exit code to signal failure
        sys.exit(1)

if __name__ == '__main__':
    print("Starting data processing pipeline...")
    main()


Device set to use cpu


Starting data processing pipeline...
Successfully fetched Petrol: 2686 records
Successfully fetched Natural_Gas: 2686 records
Successfully fetched Crude_Oil: 2685 records
Successfully fetched Uranium: 2685 records
Final DataFrame shape: (129, 4)
Preprocess input columns: ['Petrol', 'Natural_Gas', 'Crude_Oil', 'Uranium']
Preprocess output columns: ['Petrol', 'Natural_Gas', 'Crude_Oil', 'Uranium', 'Petrol_rolling', 'Petrol_std', 'Natural_Gas_rolling', 'Natural_Gas_std', 'Crude_Oil_rolling', 'Crude_Oil_std', 'Uranium_rolling', 'Uranium_std']
Petrol ADF p-value: 0.10428640301440367
Natural_Gas ADF p-value: 0.07646257433152302
Crude_Oil ADF p-value: 0.2582934051091767
Uranium ADF p-value: 0.9957720528563957
Transformed DataFrame columns: ['Petrol', 'Natural_Gas', 'Crude_Oil', 'Uranium']
Sentiment input columns: ['Petrol', 'Natural_Gas', 'Crude_Oil', 'Uranium', 'Petrol_rolling', 'Petrol_std', 'Natural_Gas_rolling', 'Natural_Gas_std', 'Crude_Oil_rolling', 'Crude_Oil_std', 'Uranium_rolling', '

SystemExit: 1