<a href="https://colab.research.google.com/github/runnithan03/Dissertation/blob/main/Filling_Missing_Values_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
from google.colab import files
import os

# Upload the file
uploaded = files.upload()

# Rename the uploaded file to 'data.csv'
uploaded_filename = list(uploaded.keys())[0]  # Get the uploaded file's original name
os.rename(uploaded_filename, 'Profit ESG dataset.csv')  # Rename it to 'data.csv'

Saving Profit ESG dataset.csv to Profit ESG dataset (1).csv


In [32]:
# Import rpy2's magic command
%reload_ext rpy2.ipython


In [33]:
%%R
# Load the renamed CSV file in R
data <- read.csv('Profit ESG dataset.csv')
data <- data[rowSums(is.na(data)) <= 2, ]

data$equity_category <- as.factor(data$equity_category)
data$rating <- as.factor(data$rating)
data$risk_rating <- as.factor(data$risk_rating)

head(data)

    roe sustainability_score                         equity_category rating
1 22.69                22.44                      Switzerland Equity      3
2 16.14                20.52                           Sweden Equity      2
3 14.88                18.91 Sector Equity Consumer Goods & Services      3
4 10.04                20.73               Eurozone Large-Cap Equity      4
5  8.75                23.96                  Japan Large-Cap Equity      3
6 31.07                22.84           Global Large-Cap Blend Equity      5
  risk_rating equity_size_score price_cash_flow_ratio dividend_yield_factor
1           3            328.15                 14.80                  3.23
2           2            248.09                 10.26                  1.45
3           3            308.52                 14.41                  2.59
4           3            275.97                  7.89                  2.60
5           2            286.08                  8.40                  2.75
6           

In [36]:
%%R
# Load necessary libraries
install.packages("ranger")  # Install ranger package if not already installed
library(ranger)
library(dplyr)

# Function to remove equity categories where any column is completely missing
remove_categories_with_any_missing_column <- function(data, equity_column) {
  # Get a list of all columns except the equity column
  target_columns <- setdiff(names(data), equity_column)

  # Identify equity categories where any column is completely missing
  missing_categories <- data %>%
    group_by(!!sym(equity_column)) %>%
    summarize(across(all_of(target_columns), ~ all(is.na(.)))) %>%
    rowwise() %>%
    mutate(any_missing = any(c_across(-!!sym(equity_column)))) %>%
    filter(any_missing) %>%
    pull(!!sym(equity_column))

  # Print the categories being removed for debugging
  print(paste("Removing categories with any completely missing columns:", paste(missing_categories, collapse = ", ")))

  # Remove these categories from the dataset
  data <- data %>% filter(!(!!sym(equity_column) %in% missing_categories))

  return(data)
}

# Function to impute missing numerical variables using dynamic linear regression
impute_continuous_lm <- function(data, column, response_vars) {
  # Identify all other columns
  other_columns <- setdiff(names(data), column)

  # If the column is a response variable, include all predictors and other responses
  if (column %in% response_vars) {
    predictors <- paste(other_columns, collapse = " + ")
  } else {
    # If the column is a predictor, exclude response variables
    predictors <- paste(setdiff(other_columns, response_vars), collapse = " + ")
  }

  # Train a linear model excluding rows with NA in the column
  lm_model <- lm(as.formula(paste(column, "~", predictors)), data = data, na.action = na.exclude)

  # Identify missing indices
  missing_indices <- which(is.na(data[[column]]))

  # Predict and replace missing values
  if (length(missing_indices) > 0) {
    predictions <- predict(lm_model, newdata = data[missing_indices, ])
    data[[column]][missing_indices] <- predictions
  }

  return(data)
}

# Function to impute missing categorical variables using random forests
impute_categorical_rf <- function(data, column) {
  # Train a Random Forest classification model excluding rows with NA in the column
  rf_model <- ranger(
    formula = as.formula(paste(column, "~ .")),
    data = data[!is.na(data[[column]]), ],
    na.action = "na.omit",
    classification = TRUE
  )

  # Identify missing indices
  missing_indices <- which(is.na(data[[column]]))

  # Predict and replace missing values
  if (length(missing_indices) > 0) {
    predictions <- predict(rf_model, data = data[missing_indices, ])$predictions
    data[[column]][missing_indices] <- predictions
  }

  return(data)
}

# Combined function to apply imputation
impute_missing_values <- function(data, response_vars) {
  # Identify numerical and categorical columns
  numerical_columns <- names(data)[sapply(data, is.numeric)]
  categorical_columns <- names(data)[sapply(data, is.factor)]

  # Impute missing values for numerical columns dynamically
  for (col in numerical_columns) {
    if (any(is.na(data[[col]]))) {
      data <- impute_continuous_lm(data, col, response_vars)
    }
  }

  # Impute missing values for categorical columns
  for (col in categorical_columns) {
    if (any(is.na(data[[col]]))) {
      data <- impute_categorical_rf(data, col)
    }
  }

  return(data)
}

# Specify response variables
response_vars <- c("roe", "sustainability_score")

# Remove equity categories where any column is completely missing
data <- remove_categories_with_any_missing_column(data, "equity_category")

# Drop unused factor levels
data$equity_category <- droplevels(data$equity_category)

# Apply the combined imputation method to your dataset
clean_data <- impute_missing_values(data, response_vars)

# Verify that no missing values remain
if (any(is.na(clean_data))) {
  stop("There are still missing values in the data!")
} else {
  print("All missing values have been successfully imputed!")
}

any(is.na(clean_data)) # Should return FALSE


(as ‘lib’ is unspecified)







	‘/tmp/RtmpdU37b2/downloaded_packages’



[1] "Removing categories with any completely missing columns: Alt - Market Neutral - Equity, Emerging Europe ex-Russia Equity, India Equity, Islamic Equity - Other, Other Equity, Sector Equity Private Equity, Vietnam Equity"






Error in withVisible({ : There are still missing values in the data!


In [37]:
%%R
clean_data$holdings_n_stock <- round(clean_data$holdings_n_stock) # holdings_n_stock is an integer value
clean_data$price_cash_flow_ratio <- signif(clean_data$price_cash_flow_ratio, digits = 3)


# Save the manipulated dataset as a CSV file
write.csv(clean_data, "clean_data.csv", row.names = FALSE)

In [38]:
from google.colab import files
files.download("clean_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>