In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

# Visual settings
# Seaborn "whitegrid" theme: applied globally to every figure created after this cell.
sns.set_style("whitegrid")
# Default figure size in inches (width, height); cells that pass an explicit
# figsize to matplotlib override this default.
plt.rcParams["figure.figsize"] = (12, 6)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the dataset
# Assuming the data is in the data folder relative to this notebook.
# If the file is missing, a synthetic stand-in frame with the same columns
# ('Date' as a 'dd-Mon-yy' string, 'Price' as a float) is built so the rest of
# the notebook can still be executed end to end.

# Seed for the synthetic fallback so that Restart & Run All reproduces the
# same numbers (ADF statistics, rolling stats, decomposition) on every run.
FALLBACK_SEED = 42

try:
    df = pd.read_csv('../data/BrentOilPrices.csv')
    print("Data loaded successfully.")
except FileNotFoundError:
    print("Dataset not found. Please ensure 'BrentOilPrices.csv' is in the 'data' directory.")
    # Create a dummy dataframe for demonstration purposes if file is missing.
    # NOTE: these prices are i.i.d. lognormal draws, not real market data;
    # any conclusions drawn from them are illustrative only.
    dates = pd.date_range(start='1987-05-20', end='2022-09-30', freq='D')
    fallback_rng = np.random.default_rng(FALLBACK_SEED)
    df = pd.DataFrame({
        'Date': dates.strftime('%d-%b-%y'),
        'Price': fallback_rng.lognormal(mean=3, sigma=0.5, size=len(dates))
    })

# Display first few rows
df.head()

In [None]:
# Data Info and Preprocessing
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Convert Date to datetime and make it the (sorted) index.
# Guarded so the cell can be re-run: after the first run 'Date' is the index,
# not a column, and df['Date'] would raise KeyError.
if 'Date' in df.columns:
    raw_dates = df['Date']

    # The expected format is 'day-month-year' e.g., 20-May-87
    parsed_dates = pd.to_datetime(raw_dates, format='%d-%b-%y', errors='coerce')

    # Rows that had a value but did not match the expected format.
    # NOTE(review): some copies of this dataset reportedly mix in a second
    # layout such as 'Apr 22, 2020' -- confirm against the actual CSV.
    unparsed = parsed_dates.isna() & raw_dates.notna()
    if unparsed.any():
        # Second pass: let pandas infer the format for the leftover rows only.
        fallback_dates = pd.to_datetime(raw_dates[unparsed], errors='coerce')
        parsed_dates = parsed_dates.fillna(fallback_dates)

        still_unparsed = parsed_dates.isna() & raw_dates.notna()
        if still_unparsed.any():
            # Fail loudly instead of silently carrying NaT rows into the index.
            examples = raw_dates[still_unparsed].head().tolist()
            raise ValueError(
                f"Could not parse {int(still_unparsed.sum())} 'Date' value(s), e.g. {examples}"
            )

    df['Date'] = parsed_dates
    df.set_index('Date', inplace=True)
    df.sort_index(inplace=True)

# Check for missing values
print("\nMissing values:\n", df.isnull().sum())

df.head()

## 2. Time Series Visualization (Trend Analysis)
Visualize the raw price series to identify major trends and shocks.

In [None]:
# Line chart of the raw price level over the full sample, drawn through the
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(df.index, df['Price'], label='Brent Oil Price', color='blue')
ax.set_title('Brent Oil Prices (1987 - 2022)')
ax.set_xlabel('Date')
ax.set_ylabel('Price (USD per barrel)')
ax.legend()
plt.show()

## 3. Stationarity Analysis
We use the **Augmented Dickey-Fuller (ADF)** test to check for stationarity.
- **Null Hypothesis (H0):** The time series has a unit root (is non-stationary).
- **Alternative Hypothesis (H1):** The time series has no unit root (is stationary).
If p-value < 0.05, we reject H0.

We will test both the original price series and the **log returns**.

In [None]:
def perform_adf_test(series, name="Series", alpha=0.05):
    """Run the Augmented Dickey-Fuller test on a series and print a report.

    The report contains the ADF statistic, the p-value, the critical values
    and a one-line verdict obtained by comparing the p-value with ``alpha``.

    Parameters
    ----------
    series : pandas.Series
        Numeric series to test. NaN and +/-inf entries are removed before
        the test is run.
    name : str, default "Series"
        Label used in the report header.
    alpha : float, default 0.05
        Significance level; the series is reported as stationary when the
        p-value is strictly below this threshold.

    Returns
    -------
    None
        The results are printed, not returned.

    Notes
    -----
    Any exception raised by ``adfuller`` (for example on a series that is
    too short) is propagated unchanged.
    """
    # dropna() alone leaves +/-inf in place; log returns produce them when a
    # price is zero, so map them to NaN first and drop everything together.
    cleaned = series.replace([np.inf, -np.inf], np.nan).dropna()

    result = adfuller(cleaned)
    # adfuller returns a tuple: [0] test statistic, [1] p-value,
    # [4] dict of critical values keyed by significance level.
    adf_statistic, p_value, critical_values = result[0], result[1], result[4]

    print(f'ADF Test Results for {name}:')
    print(f'ADF Statistic: {adf_statistic}')
    print(f'p-value: {p_value}')
    print('Critical Values:')
    for key, value in critical_values.items():
        print(f'\t{key}: {value}')

    if p_value < alpha:
        print("=> The series is likely Stationary.")
    else:
        print("=> The series is likely Non-Stationary.")
    print("-" * 30)

# Log return r_t = ln(P_t) - ln(P_{t-1}), written as the first difference of
# the log price; the first row has no predecessor and is therefore NaN.
df['Log_Return'] = np.log(df['Price']).diff()

# Run the same stationarity check on the price level and on the log returns.
for column, label in (('Price', "Raw Prices"), ('Log_Return', "Log Returns")):
    perform_adf_test(df[column], label)

# Plot Log Returns
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(df.index, df['Log_Return'], alpha=0.7, color='green')
ax.set_title('Log Returns of Brent Oil Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Log Return')
plt.show()

## 4. Volatility Analysis
Analyze volatility clustering by calculating rolling statistics (mean and standard deviation) of the price over a 30-observation window. A high rolling standard deviation indicates a period of high volatility.

In [None]:
# Window length for the rolling statistics. An integer window counts
# observations (rows), so this is 30 rows of the price series.
rolling_window = 30

# Build the rolling view once and derive both statistics from it.
price_rolling = df['Price'].rolling(window=rolling_window)
df['Rolling_Mean'] = price_rolling.mean()
df['Rolling_Std'] = price_rolling.std()

# Overlay the price, its rolling mean and its rolling std on a single axis.
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(df.index, df['Price'], label='Original Price', alpha=0.5)
ax.plot(df.index, df['Rolling_Mean'], label=f'{rolling_window}-Day Rolling Mean', color='orange')
ax.plot(df.index, df['Rolling_Std'], label=f'{rolling_window}-Day Rolling Std', color='red', linewidth=1.5)
ax.set_title(f'Rolling Mean & Standard Deviation ({rolling_window} Days)')
ax.legend()
plt.show()

## 5. Time Series Decomposition
Decompose the time series into Trend, Seasonality, and Residual components.
*Note: We need to handle missing values or ensure frequency is set for decomposition.*

In [None]:
# Put the frame on a calendar-day grid: dates absent from the index are
# inserted and take the values of the last preceding row (forward fill).
df_daily = df.asfreq(freq='D', method='ffill')

# Decompose the daily price into trend, seasonal and residual components.
# - multiplicative model: chosen because volatility seems to increase with price level
# - period=365: one seasonal cycle per year on the daily grid
decomposition = seasonal_decompose(
    df_daily['Price'],
    model='multiplicative',
    period=365,
)

# decomposition.plot() creates its own figure; enlarge it before showing.
fig = decomposition.plot()
fig.set_size_inches(15, 10)
plt.show()

## 6. Understanding Change Point Analysis
**Change Point Detection (CPD)** is the process of identifying times when the probability distribution of a stochastic process or time series changes. In the context of Brent Oil Prices, we are looking for "structural breaks"—points in time where the underlying parameters of the price generation process (like the mean price or volatility) shift significantly.

**Why use it here?**
Standard time series models usually assume stationarity (constant parameters over time). However, oil prices are heavily influenced by external shocks (wars, pandemics, policy changes). CPD allows us to:
1.  Segment the price series into different regimes (e.g., Pre-Crisis vs. Crisis).
2.  Quantify the impact of specific events.

**Approach for Task 2:**
We will use **Bayesian Change Point Detection** using **PyMC**.
-   We treat the "switch point" (the day the change happens) as a random variable to be inferred.
-   We define a model where the price comes from one distribution before the switch and another after.
-   MCMC sampling gives us probabilities for when the switch likely occurred.


In [None]:
# Persist the processed frame (Date index plus the derived columns added
# above) so that Task 2 can load it without re-running this notebook.
processed_path = '../data/BrentOilPrices_Processed.csv'
df.to_csv(processed_path)
print(f"Processed data saved to {processed_path}")