In [None]:

# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# One seed value, reused for NumPy's global RNG here and passed as
# `random_state` to the split and the estimator in later cells.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)


In [None]:

# Read the semicolon-separated CSV into `df`, failing fast when it is absent
import os

file_path = 'PB_All_2000_2021.csv'

# Guard clause: stop the notebook here with a clear error if the file is missing
if not os.path.exists(file_path):
    raise FileNotFoundError(f"❌ File not found: {file_path}")

df = pd.read_csv(file_path, sep=';')
print("✅ Dataset loaded successfully.")
display(df.head())


In [None]:

# Quick structural summary of `df`: dtypes/non-null counts, shape, and
# the number of missing values per column.
print("Data Info:")
df.info()

# Header and value are emitted on separate lines via sep="\n"
print("\nData Shape:", df.shape, sep="\n")

missing_per_column = df.isna().sum()
print("\nMissing values:", missing_per_column, sep="\n")


In [None]:

# Impute missing values with per-column means (numeric columns only).
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split performed later in the notebook — confirm this is acceptable.
column_means = df.mean(numeric_only=True)
df = df.fillna(value=column_means)
print("✅ Missing values handled (filled with column means).")


In [None]:

# Annotated heatmap of the pairwise correlations between numeric columns
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:

# Define features and targets.
# Assumption kept from the original cell: the last N_TARGETS columns of `df`
# are the regression targets and every column before them is a candidate feature.
N_TARGETS = 3
feature_candidates = df.iloc[:, :-N_TARGETS]
y = df.iloc[:, -N_TARGETS:]

# StandardScaler cannot fit on non-numeric input, so keep only numeric/bool
# candidates rather than letting `pipeline.fit` fail on e.g. a text or date
# column that was read as `object`.
X = feature_candidates.select_dtypes(include=["number", "bool"])
dropped_columns = [col for col in feature_candidates.columns if col not in X.columns]
if dropped_columns:
    print(f"Ignoring non-numeric feature columns: {dropped_columns}")
if X.shape[1] == 0:
    raise ValueError("No numeric feature columns available for training.")

# Split the data (80% train / 20% test, seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Pipeline: standardize the features, then fit one random forest per target
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', MultiOutputRegressor(RandomForestRegressor(random_state=RANDOM_STATE)))
])

# Train model
pipeline.fit(X_train, y_train)
print("✅ Model training completed.")


In [None]:

# Predict on the held-out split
y_pred = pipeline.predict(X_test)

# Evaluation metrics. No `multioutput` argument is passed, so each score is a
# single number aggregated by scikit-learn's default over all target columns.
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# One actual-vs-predicted scatter plot per target column
for target_idx, target_name in enumerate(y.columns):
    actual = y_test.iloc[:, target_idx]
    predicted = y_pred[:, target_idx]

    fig, ax = plt.subplots()
    ax.scatter(actual, predicted, alpha=0.6)
    ax.set_xlabel("Actual")
    ax.set_ylabel("Predicted")
    ax.set_title(f"{target_name} - Actual vs Predicted")
    ax.grid(True)
    plt.show()
