In [35]:
# Import necessary libraries
import numpy as np
import pymc as pm
import pandas as pd
from sklearn.model_selection import train_test_split

In [36]:
data = pd.read_csv('data/train.csv')


In [60]:
# Import necessary libraries
import numpy as np
import pymc as pm
import pandas as pd


# Select relevant features
selected_features = ['LotArea', 'Neighborhood', 'OverallQual', 'YearBuilt',
                     'GrLivArea', 'GarageCars', 'FullBath', 'Fireplaces', 'SalePrice']


# Encoding 'Neighborhood' as a categorical variable
data['Neighborhood'] = pd.Categorical(data['Neighborhood']).codes

# Normalize numerical predictors for better numerical stability
data['LotArea'] = data['LotArea'] / 1000  # Scale to 1000s of square feet
data['YearBuilt'] = (data['YearBuilt'] - data['YearBuilt'].mean()) / data['YearBuilt'].std()
data['GrLivArea'] = data['GrLivArea'] / 1000  # Scale to 1000s of square feet


X_train, X_test, y_train, y_test = train_test_split(data[[i for i in data.columns if i not in ['SalePrice']]], data.SalePrice, test_size=0.2, random_state=42)
X_train['SalePrice'] = list(y_train)

# Start the Bayesian Model
with pm.Model() as housing_model:
    # Priors for coefficients
    beta_0 = pm.Normal('Intercept', mu=0, sigma=10)  # Prior for the intercept
    beta_LotArea = pm.Normal('LotAreaEffect', mu=0, sigma=10)
    beta_Neighborhood = pm.Normal('NeighborhoodEffect', mu=0, sigma=10)
    beta_OverallQual = pm.Normal('OverallQualEffect', mu=0, sigma=10)
    beta_YearBuilt = pm.Normal('YearBuiltEffect', mu=0, sigma=10)
    beta_GrLivArea = pm.Normal('GrLivAreaEffect', mu=0, sigma=10)
    beta_GarageCars = pm.Normal('GarageCarsEffect', mu=0, sigma=10)
    beta_FullBath = pm.Normal('FullBathEffect', mu=0, sigma=10)
    beta_Fireplaces = pm.Normal('FireplacesEffect', mu=0, sigma=10)

    # Likelihood (data-generating process)
    mu = (
        beta_0 +
        beta_LotArea * data['LotArea'] +
        beta_Neighborhood * data['Neighborhood'] +
        beta_OverallQual * data['OverallQual'] +
        beta_YearBuilt * data['YearBuilt'] +
        beta_GrLivArea * data['GrLivArea'] +
        beta_GarageCars * data['GarageCars'] +
        beta_FullBath * data['FullBath'] +
        beta_Fireplaces * data['Fireplaces']
    )

    sigma = pm.HalfNormal('sigma', sigma=10)  # Prior for residual noise
    price_obs = pm.Normal('Price', mu=mu, sigma=sigma, observed=data['SalePrice'])

    # Explicitly create a compatible random number generator
    rng = np.random.default_rng(42)  # Random seed for reproducibility
    
    # Sampling using the random number generator - Markov Chain Monte Carlo
    trace = pm.sample(1000, tune=1000, random_seed=rng)


# Summarize posterior
print(pm.summary(trace))

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [Intercept, LotAreaEffect, NeighborhoodEffect, OverallQualEffect, YearBuiltEffect, GrLivAreaEffect, GarageCarsEffect, FullBathEffect, FireplacesEffect, sigma]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 9 seconds.


                        mean      sd    hdi_3%   hdi_97%  mcse_mean  mcse_sd  \
Intercept            322.861   9.932   304.611   342.503      0.143    0.101   
LotAreaEffect          3.928   9.959   -14.562    23.363      0.134    0.151   
NeighborhoodEffect  3839.083   9.039  3822.001  3855.394      0.119    0.084   
OverallQualEffect   2175.798   9.800  2158.185  2194.269      0.132    0.093   
YearBuiltEffect      100.973   9.931    81.689   119.075      0.127    0.091   
GrLivAreaEffect        0.504  10.058   -18.145    19.275      0.129    0.169   
GarageCarsEffect     660.089  10.151   639.708   678.018      0.132    0.093   
FullBathEffect       564.147  10.028   545.250   582.416      0.121    0.085   
FireplacesEffect     257.942   9.851   239.859   276.440      0.131    0.093   
sigma               7307.290   5.759  7296.496  7318.231      0.077    0.054   

                    ess_bulk  ess_tail  r_hat  
Intercept             4848.0    3115.0    1.0  
LotAreaEffect         5

In [62]:
# Extract posterior means for each feature and intercept
posterior_dist_feats = {}
for feature in selected_features + ['Intercept']:
    try:
        # Attempt to access the posterior for the feature
        if feature == 'Intercept':
            # Intercept has no "Effect" suffix
            arr = float(np.mean(trace.posterior['Intercept']))
        else:
            # Feature coefficients with "Effect" suffix
            arr = float(np.mean(trace.posterior[f'{feature}Effect']))
        
        # Store the mean of the posterior distribution
        posterior_dist_feats[feature] = arr

    except KeyError:
        # Handle cases where the posterior key does not exist
        print(f"Feature {feature} not found in posterior trace.")
        continue

# Sort features by the magnitude of their coefficients (descending)
train_features_coefficients = {k: v for k, v in sorted(posterior_dist_feats.items(), key=lambda i: abs(i[1]), reverse=True)}

# Make predictions
predictions = []
for i in range(X_test.shape[0]):
    loc_arr = X_test.iloc[i]
    prediction = 0
    for k, v in train_features_coefficients.items():
        if k == 'Intercept':
            # Add intercept to the prediction
            prediction += v
        else:
            # Add the weighted contribution of each feature
            prediction += loc_arr[k] * v

    predictions.append(prediction)

# Evaluate using RMSE
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Root Mean Squared Error (RMSE): {rmse}")


Feature SalePrice not found in posterior trace.
Root Mean Squared Error (RMSE): 142167.9037710385


## Why use normal distribution for the priors:
Rationale: Normal priors are used here because they encode the belief that the parameter values are most likely to cluster around a central value (e.g., 0) but with some spread (controlled by the standard deviation, sigma).
- #### Why It's Relevant:
        - Most effects in real-world data tend to cluster around a central value (e.g., the effect of neighborhood or room size). The Normal distribution is flexible, allowing you to encode both uncertainty (using large sigma) and prior knowledge (using a specific mean, mu).