In [None]:
import pandas as pd
import numpy as np

# STEP 1: Load Raw Data
# The CSV is read as-is: no date parsing, sorting or index selection happens here.
RAW_PRICES_CSV = '../data/raw/BrentOilPrices.csv'
df = pd.read_csv(RAW_PRICES_CSV)

# A. Initial Inspection

In [None]:
# Structural overview: column names, dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

# Summary statistics for the numeric columns.
print(df.describe())

# Per-column count of missing values.
print(f"Missing values:\n{df.isnull().sum()}")

# B. Visualizing the Price History

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the full price history against calendar dates.
# When this cell runs top-to-bottom, 'Date' has not yet been parsed or set as the
# index (that happens in the cleaning cell further down), so df.index is still the
# default row counter from read_csv. Derive the dates locally instead of relying
# on the index; fall back to the index if 'Date' has already been moved there.
if 'Date' in df.columns:
    price_dates = pd.to_datetime(df['Date'], format='mixed')
else:
    price_dates = df.index

# Pair prices with their dates and order chronologically so the line is drawn
# left-to-right in time. df itself is left untouched.
price_history = pd.Series(df['Price'].to_numpy(), index=pd.Index(price_dates)).sort_index()

plt.figure(figsize=(15, 6))
plt.plot(price_history.index, price_history.to_numpy(), label='Brent Oil Price', color='teal')
plt.title('Historical Brent Oil Prices (1987-2022)')
plt.xlabel('Year')
plt.ylabel('USD per Barrel')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

# C. Identifying Outliers and Volatility

In [None]:
# Two side-by-side panels: price distribution (left) and a boxplot (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
dist_ax, box_ax = axes

# Distribution of Prices (histogram with a KDE overlay)
sns.histplot(df['Price'], kde=True, ax=dist_ax, color='orange')
dist_ax.set_title('Distribution of Oil Prices')

# Boxplot to see outliers
sns.boxplot(x=df['Price'], ax=box_ax, color='lightgreen')
box_ax.set_title('Visualizing Price Outliers')

# Finish the figure the same way the other plotting cells do: without an explicit
# show(), the return value of the last set_title() call is echoed as cell output.
plt.tight_layout()
plt.show()

# D. Checking for "Seasonality"

In [None]:
# Derive calendar columns for the seasonality view.
# When the notebook runs top-to-bottom, 'Date' is still a plain column here and
# df.index is the default row counter, so df.index.year would raise
# AttributeError. Read the dates from the column when it exists, and only fall
# back to the index when 'Date' has already been moved there.
if 'Date' in df.columns:
    parsed_dates = pd.to_datetime(df['Date'], format='mixed')
    df['Year'] = parsed_dates.dt.year
    df['Month'] = parsed_dates.dt.month
else:
    df['Year'] = df.index.year
    df['Month'] = df.index.month
# NOTE(review): 'Year' and 'Month' are stored on df itself, so any later cell that
# resamples/interpolates df will process these columns too — confirm that is wanted.

# One box per calendar month, pooling all years.
plt.figure(figsize=(12, 6))
sns.boxplot(x='Month', y='Price', data=df)
plt.title('Price Distribution by Month (Seasonality Check)')
plt.show()

In [None]:
import os

# Clean Dates
# format='mixed' lets pandas infer the format of each value individually, so the
# column may contain more than one date layout (e.g. '20-May-87').
# The guard makes the cell safe to re-run: after the first run 'Date' is the
# index rather than a column, and parsing it again would raise KeyError.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], format='mixed')
    df = df.sort_values('Date').set_index('Date')
elif not isinstance(df.index, pd.DatetimeIndex):
    raise KeyError("Date")

# Handle missing days (filling weekend gaps)
# Upsample to one row per calendar day; days absent from the source are filled by
# linear interpolation between the surrounding observations. This applies to
# every column present in df at this point.
df = df.resample('D').interpolate(method='linear')

#  Create Log Returns (This is your "Processed" feature)
# formula: log(Price_today / Price_yesterday)
# The first row has no previous day, so its Log_Return is NaN.
# NOTE(review): returns are computed on the daily-filled series, so interpolated
# days contribute returns as well — confirm this is intended for modeling.
df['Log_Return'] = np.log(df['Price'] / df['Price'].shift(1))

#  Save for later tasks
# Create the output directory first so the export does not fail on a fresh checkout.
processed_csv_path = '../data/processed/brent_processed.csv'
os.makedirs(os.path.dirname(processed_csv_path), exist_ok=True)
df.to_csv(processed_csv_path)
print("Processing Complete. Data is now ready for PyMC modeling.")