<a href="https://colab.research.google.com/github/runnithan03/Dissertation/blob/main/MRR_Selection_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
import os

uploaded = files.upload()

uploaded_filename = list(uploaded.keys())[0]
os.rename(uploaded_filename, 'clean.csv')

Saving clean.csv to clean.csv


In [None]:
%reload_ext rpy2.ipython

In [None]:
%%R

data <- read.csv('clean.csv')

# Frequency encode the categorical variables
rating_counts <- table(data$rating)
data$rating_encoded <- as.numeric(rating_counts[data$rating])

risk_rating_counts <- table(data$risk_rating)
data$risk_rating_encoded <- as.numeric(risk_rating_counts[data$risk_rating])

equity_category_counts <- table(data$equity_category)
data$equity_category_encoded <- as.numeric(equity_category_counts[data$equity_category])

data <- data[, setdiff(names(data), c("rating", "risk_rating", "equity_category"))]
dim(data)

[1] 1389   14


In [None]:
%%R

# function to evaluate model fit
calculate_nrmse <- function(model, data) {
  residuals_matrix <- residuals(model)

  if (is.null(residuals_matrix) || ncol(as.matrix(residuals_matrix)) == 0) {
    stop("Residuals matrix is NULL or has zero dimensions.")
  }

  # calculate RMSE for each response
  rmse_per_response <- apply(as.matrix(residuals_matrix), 2, function(residuals) {
    sqrt(mean(residuals^2))
  })

  # Normalise RMSE by the standard deviation of each response variable
  response_vars <- c("roe", "sustainability_score")
  std_devs <- apply(data[, response_vars], 2, function(column) {
    sd(column, na.rm = TRUE)
  })

  nrmse <- sum(rmse_per_response / std_devs) / length(std_devs)  # average across responses
  return(nrmse)
}
multi_model <- lm(cbind(roe, sustainability_score)~., data = data)
print(calculate_nrmse(multi_model, data))

[1] 0.7751806


Add Predictors

In [None]:
%%R
add_predictors <- function(data, response_vars, selected_predictors, remaining_predictors) {
  metrics <- c()
  candidate_models <- list()

  for (predictor in remaining_predictors) {
    formula <- as.formula(
      paste0("cbind(", paste(response_vars, collapse = ", "), ") ~ ",
             paste(c(selected_predictors, predictor), collapse = " + "))
    )
    tryCatch({
      model <- lm(formula, data = data)
      metric <- calculate_nrmse(model, data)
      metrics <- c(metrics, metric)
      candidate_models[[predictor]] <- model
    }, error = function(e) {
      metrics <- c(metrics, NA)
      candidate_models[[predictor]] <- NULL
    })
  }

  return(list(metrics = metrics, candidate_models = candidate_models))
}

Remove Predictors

In [None]:
%%R
remove_predictors <- function(data, response_vars, selected_predictors) {
  metrics <- c()
  candidate_models <- list()

  for (predictor in selected_predictors) {
    remaining_predictors <- setdiff(selected_predictors, predictor)

    if (length(remaining_predictors) > 0) {
      formula <- as.formula(
        paste0("cbind(", paste(response_vars, collapse = ", "), ") ~ ",
               paste(remaining_predictors, collapse = " + "))
      )
      tryCatch({
        model <- lm(formula, data = data)
        metric <- calculate_nrmse(model, data)
        metrics <- c(metrics, metric)
        candidate_models[[predictor]] <- model
      }, error = function(e) {
        metrics <- c(metrics, NA)
        candidate_models[[predictor]] <- NULL
      })
    }
  }

  return(list(metrics = metrics, candidate_models = candidate_models))
}


Multi-variate Stepwise Selection

In [None]:
%%R

stepwise_multivariate <- function(data, method = "stepwise") {
  response_vars <- c("roe", "sustainability_score")
  predictors <- setdiff(names(data), response_vars)
  selected_predictors <- if (method == "forward") c() else predictors
  remaining_predictors <- if (method == "forward") predictors else c()
  best_model <- NULL
  best_metric <- Inf  # Initialize best nRMSE (lower is better)

  repeat {
    metrics <- c()
    candidate_models <- list()

    # Forward Step: add predictors
    if (method == "forward" || method == "stepwise") {
      forward_results <- add_predictors(data, response_vars, selected_predictors, remaining_predictors)
      metrics <- c(metrics, forward_results$metrics)
      candidate_models <- c(candidate_models, forward_results$candidate_models)
    }

    # Backward Step: remove predictors
    if (method == "backward" || method == "stepwise") {
      backward_results <- remove_predictors(data, response_vars, selected_predictors)
      metrics <- c(metrics, backward_results$metrics)
      candidate_models <- c(candidate_models, backward_results$candidate_models)
    }

    # For cases where no valid models are generated
    if (length(metrics) == 0 || all(is.na(metrics))) {
      cat("No valid predictors to add or remove. Stopping selection.\n")
      break
    }

    # Select the best candidate model
    valid_metrics <- which(!is.na(metrics) & is.finite(metrics))
    if (length(valid_metrics) == 0) {
      cat("No valid models due to NA or Inf values. Stopping selection.\n")
      break
    }

    best_candidate_index <- valid_metrics[which.min(metrics[valid_metrics])]
    best_candidate_metric <- metrics[best_candidate_index]
    best_candidate_predictor <- names(candidate_models)[best_candidate_index]
    best_candidate_model <- candidate_models[[best_candidate_predictor]]

    # Stop if no improvement
    if (best_candidate_metric >= best_metric) {
      cat("No improvement. Stopping selection.\n")
      break
    }

    # Update the best model and metric
    best_model <- best_candidate_model
    best_metric <- best_candidate_metric

    # Update selected/remaining predictors
    if (method == "forward" || (method == "stepwise" && best_candidate_predictor %in% remaining_predictors)) {
      selected_predictors <- c(selected_predictors, best_candidate_predictor)
      remaining_predictors <- setdiff(remaining_predictors, best_candidate_predictor)
    } else if (method == "backward" || (method == "stepwise" && best_candidate_predictor %in% selected_predictors)) {
      selected_predictors <- setdiff(selected_predictors, best_candidate_predictor)
      remaining_predictors <- c(remaining_predictors, best_candidate_predictor)
    }

    # Output for testing:
    cat("Selected Predictors:", paste(selected_predictors, collapse = ", "), "\n")
    cat("Current nRMSE:", best_metric, "\n")
  }

  return(list(model = best_model, metric = best_metric, predictors = selected_predictors))
}


In [None]:
%%R
backward_result <- stepwise_multivariate(data, method = "backward")
nrmse_values <- calculate_nrmse(backward_result$model, data)
print(nrmse_values)

Selected Predictors: equity_size_score, price_cash_flow_ratio, dividend_yield_factor, historical_earnings_growth, sales_growth, asset_cash, holdings_n_stock, ongoing_cost, fund_size, rating_encoded, equity_category_encoded 
Current nRMSE: 0.775553 
No improvement. Stopping selection.
[1] 0.775553


In [None]:
%%R
null_model <- lm(cbind(roe, sustainability_score) ~ 1, data = data)

forward_result <- stepwise_multivariate(data, method = "forward")
forward_result
summary(forward_result$model)
cat("Best Metric:", forward_result$metric, "\n")

Selected Predictors: price_cash_flow_ratio 
Current nRMSE: 0.9165175 
Selected Predictors: price_cash_flow_ratio, ongoing_cost 
Current nRMSE: 0.8797057 
Selected Predictors: price_cash_flow_ratio, ongoing_cost, equity_size_score 
Current nRMSE: 0.8546746 
Selected Predictors: price_cash_flow_ratio, ongoing_cost, equity_size_score, historical_earnings_growth 
Current nRMSE: 0.8291099 
Selected Predictors: price_cash_flow_ratio, ongoing_cost, equity_size_score, historical_earnings_growth, equity_category_encoded 
Current nRMSE: 0.8163122 
Selected Predictors: price_cash_flow_ratio, ongoing_cost, equity_size_score, historical_earnings_growth, equity_category_encoded, sales_growth 
Current nRMSE: 0.804155 
Selected Predictors: price_cash_flow_ratio, ongoing_cost, equity_size_score, historical_earnings_growth, equity_category_encoded, sales_growth, dividend_yield_factor 
Current nRMSE: 0.7959827 
Selected Predictors: price_cash_flow_ratio, ongoing_cost, equity_size_score, historical_earnin

In [None]:
%%R
bidirectional_result <- stepwise_multivariate(data, method = "stepwise")
nrmse_values <- calculate_nrmse(bidirectional_result$model, data)

print(nrmse_values)

Selected Predictors: equity_size_score, price_cash_flow_ratio, dividend_yield_factor, historical_earnings_growth, sales_growth, asset_cash, holdings_n_stock, ongoing_cost, fund_size, rating_encoded, equity_category_encoded 
Current nRMSE: 0.775553 
Selected Predictors: equity_size_score, price_cash_flow_ratio, dividend_yield_factor, historical_earnings_growth, sales_growth, asset_cash, holdings_n_stock, ongoing_cost, fund_size, rating_encoded, equity_category_encoded, risk_rating_encoded 
Current nRMSE: 0.7751806 
No improvement. Stopping selection.
[1] 0.7751806


Including Interaction Terms

In [None]:
%%R

generate_interaction_terms <- function(data, predictors) {
  for (i in 1:(length(predictors) - 1)) {
    for (j in (i + 1):length(predictors)) {
      interaction_name <- paste0(predictors[i], "_x_", predictors[j])
      data[[interaction_name]] <- data[[predictors[i]]] * data[[predictors[j]]]
    }
  }
  return(data)
}

# Add interaction terms to the dataset
predictors <- setdiff(names(data), c("roe", "sustainability_score"))
data_interaction <- generate_interaction_terms(data, predictors)

In [None]:
%%R

bidirectional_result <- stepwise_multivariate(data_interaction, method = "stepwise")
nrmse_values <- calculate_nrmse(bidirectional_result$model, data_interaction)

print(nrmse_values)

Selected Predictors: equity_size_score, price_cash_flow_ratio, dividend_yield_factor, historical_earnings_growth, sales_growth, asset_cash, holdings_n_stock, ongoing_cost, rating_encoded, risk_rating_encoded, equity_category_encoded, equity_size_score_x_price_cash_flow_ratio, equity_size_score_x_dividend_yield_factor, equity_size_score_x_historical_earnings_growth, equity_size_score_x_sales_growth, equity_size_score_x_asset_cash, equity_size_score_x_holdings_n_stock, equity_size_score_x_ongoing_cost, equity_size_score_x_fund_size, equity_size_score_x_rating_encoded, equity_size_score_x_risk_rating_encoded, equity_size_score_x_equity_category_encoded, price_cash_flow_ratio_x_dividend_yield_factor, price_cash_flow_ratio_x_historical_earnings_growth, price_cash_flow_ratio_x_sales_growth, price_cash_flow_ratio_x_asset_cash, price_cash_flow_ratio_x_holdings_n_stock, price_cash_flow_ratio_x_ongoing_cost, price_cash_flow_ratio_x_fund_size, price_cash_flow_ratio_x_rating_encoded, price_cash_fl

Non-linear Terms

In [None]:
%%R

generate_polynomial_terms <- function(data, predictors, degree = 2) {
  for (predictor in predictors) {
    for (d in 2:degree) {
      term_name <- paste0(predictor, "_degree_", d)
      data[[term_name]] <- data[[predictor]]^d
    }
  }
  return(data)
}

# Add the Polynomial Terms to the dataset
predictors <- setdiff(names(data), c("roe", "sustainability_score"))
data_nonlinear <- generate_polynomial_terms(data, predictors, degree = 3)

# Perform Stepwise Selection
result <- stepwise_multivariate(data_nonlinear, method = "stepwise")
nrmse_values <- calculate_nrmse(result$model, data_nonlinear)
print(nrmse_values)
print("Selected Predictors:")
print(result$predictors)

Selected Predictors: equity_size_score, price_cash_flow_ratio, dividend_yield_factor, historical_earnings_growth, sales_growth, asset_cash, holdings_n_stock, ongoing_cost, fund_size, rating_encoded, risk_rating_encoded, equity_category_encoded, equity_size_score_degree_2, equity_size_score_degree_3, price_cash_flow_ratio_degree_2, price_cash_flow_ratio_degree_3, dividend_yield_factor_degree_2, dividend_yield_factor_degree_3, historical_earnings_growth_degree_2, historical_earnings_growth_degree_3, sales_growth_degree_2, sales_growth_degree_3, asset_cash_degree_2, asset_cash_degree_3, holdings_n_stock_degree_2, holdings_n_stock_degree_3, ongoing_cost_degree_2, ongoing_cost_degree_3, fund_size_degree_2, rating_encoded_degree_2, rating_encoded_degree_3, risk_rating_encoded_degree_2, risk_rating_encoded_degree_3, equity_category_encoded_degree_2, equity_category_encoded_degree_3 
Current nRMSE: 0.6565547 
Selected Predictors: equity_size_score, price_cash_flow_ratio, dividend_yield_factor,

In [None]:
%%R

forward_result$model


Call:
lm(formula = formula, data = data)

Coefficients:
                            roe         sustainability_score
(Intercept)                  2.585e+00   2.239e+01          
price_cash_flow_ratio        5.513e-01  -2.225e-01          
ongoing_cost                -4.723e+00   4.456e+00          
equity_size_score            3.535e-02  -4.943e-03          
historical_earnings_growth   4.880e-02   1.109e-01          
equity_category_encoded      2.187e-02  -1.045e-02          
sales_growth                 1.581e-01   1.799e-01          
dividend_yield_factor       -1.985e-01   5.301e-01          
rating_encoded              -6.288e-03   1.266e-03          
holdings_n_stock             1.249e-03   1.057e-03          
fund_size                   -2.533e-11   6.086e-12          
asset_cash                  -1.846e-01   1.967e-02          
risk_rating_encoded          3.762e-04   7.032e-04          



In [None]:
%%R

forward_result
# Predict values for the same dataset (or a new dataset)
predicted_values <- predict(forward_result$model, newdata = data)