# MSOA admissions regression feature selection

In [1]:
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

## Load data

In [2]:
data = pd.read_csv('./data/msoa_collated.csv', index_col='MSOA')

# Limit to England data
mask = data['country'] == 'E'
data = data[mask]
data = data.drop('country', axis=1)
data = data.drop('All persons', axis=1)

data.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/msoa_collated.csv'

## Split into X and y, and set up K_folds

In [3]:
X = data.drop('admissions', axis=1)
y = data['admissions']

In [4]:
# Set up splits
number_of_splits = 6
skf = KFold(n_splits = number_of_splits, shuffle=True, random_state=42)
X_fields = list(X)
X = X.values
y = y.values
skf.get_n_splits(X, y)

# Set up lists for sets
X_train = []
X_test = []
y_train = []
y_test = []

# Get splits and add back data field names
# Loop through the k-fold splits
for train_index, test_index in skf.split(X, y):

    # Get X and Y train/test
    X_train_set, X_test_set = X[train_index], X[test_index]
    y_train_set, y_test_set = y[train_index], y[test_index]

    # Create dataframe and store
    X_train.append(pd.DataFrame(X_train_set, columns=X_fields))
    X_test.append(pd.DataFrame(X_test_set, columns=X_fields))
    y_train.append(pd.DataFrame(y_train_set, columns=['admissions']))
    y_test.append(pd.DataFrame(y_test_set, columns=['admissions']))

## Feature selection

In [5]:
# set max features 
max_features = 25
max_features = min(len(X_fields), max_features) 
 
chosen_features = []
r_squared_by_feature = []

# Initialise chosen features list and run tracker
available_features = list(X_fields)

# Loop through features
for i in range(max_features):

    # Reset scores
    best_result = 0
    best_feature = ''

    # Loop through available features
    for feature in available_features:

        # Create copy of already chosen features to avoid original being changed
        features_to_use = chosen_features.copy()
        # Create a list of features from features already chosen + 1 new feature
        features_to_use.append(feature)

        # Set up a list to hold r-squared results for this feature for each kfold
        feature_r_squared_kfold = []

        # Loop through k folds

        r_square_results = []

        for k_fold in range(5):
            
            # Get selected data
            X_train_selected = X_train[k_fold][features_to_use]
            X_test_selected = X_test[k_fold][features_to_use]

            # Fit model
            model = LinearRegression().fit(X_train_selected, y_train[k_fold])

            # get predictions
            y_pred = model.predict(X_test_selected)
            r_square = metrics.r2_score(y_test[k_fold], y_pred)
            r_square_results.append(r_square)

        # Update chosen feature and result if this feature is a new best
        mean_r_square = np.mean(r_square_results)
        if mean_r_square > best_result:
            best_result = mean_r_square
            best_feature = feature

    # Best feature is selected
    chosen_features.append(best_feature)
    available_features.remove(best_feature)
    r_squared_by_feature.append(best_result)

    print (f'Feature {i+1:2.0f}: {best_feature}, r-squared: {best_result:0.3f}')

Feature  1: 80+, r-squared: 0.460
Feature  2: fair health, r-squared: 0.621
Feature  3: bad health, r-squared: 0.629
Feature  4: 65-79, r-squared: 0.634
Feature  5: good_health, r-squared: 0.634
Feature  6: IMD2019Score, r-squared: 0.634
Feature  7: 0-64, r-squared: 0.634
