## 4. Feature Selection

### Import Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [None]:
# Load datasets
# Forward slashes are used so the relative paths resolve on Windows, macOS and
# Linux alike (a backslash is only treated as a path separator on Windows).
df_train = pd.read_csv('datasets/train_dataset.csv')
df_test = pd.read_csv('datasets/test_dataset.csv')

We now have a training and a test dataset ready to build a model. However, we have a lot of features, so the model is likely to overfit as it stands. Therefore, we will need to reduce our number of features.

We can first build and evaluate a basic model to act as our baseline:

In [None]:
# Hold out 20% of the training data as a validation set, so the real test set
# is not needed while we experiment with feature selection (avoids data leakage)
df_train_val, df_test_val = train_test_split(df_train, test_size=0.2, random_state=42)

# Target (y) for each split
y_train_val = df_train_val['Price']
y_test_val = df_test_val['Price']

# Features (X) for each split: every column except the target
X_train_val = df_train_val.drop(columns=['Price'])
X_test_val = df_test_val.drop(columns=['Price'])

In [None]:
# Create and fit model
# random_state is fixed (same seed as the train/validation split) so the
# baseline score is reproducible; with an unseeded forest the R2 changes on
# every run and cannot be compared fairly with later models.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_val, y_train_val)

# Make predictions on the held-out validation features
y_pred = rf.predict(X_test_val)

# Evaluate against the validation targets
r2 = r2_score(y_test_val, y_pred)
print('R2 Score:', r2)

R2 Score: 0.5182466122875551


This is a fairly poor score, but we should be able to improve it by reducing our number of features. We will initially do this by removing any feature that doesn't correlate sufficiently with our target variable, Price.

In [None]:
# Absolute correlation of every column with Price, strongest first
price_corr = df_train_val.corr()['Price']
top_corr_vals = price_corr.abs().sort_values(ascending=False)

# Show the 20 strongest (Price itself is included, with a correlation of 1)
print(top_corr_vals.head(20))

Price                 1.000000
Rating                0.481858
Dribbling             0.334290
Base Stats            0.312157
In Game Stats         0.309657
Passing               0.288767
Team_paris sg         0.285507
Pace                  0.262575
Skills                0.258870
Team_Other            0.254120
Shooting              0.246417
Weak Foot             0.230601
Team_fut icons        0.218905
League_icons          0.218905
Card Type_if          0.207927
Card Type_icon        0.160509
Popularity            0.148952
League_rem_eur_div    0.101327
WR_Att_OE             0.099537
LW                    0.093860
Name: Price, dtype: float64


Since all of our features have fairly low correlation values with our target variable, we will set our limit fairly low:

In [None]:
# Minimum absolute correlation with Price for a feature to be kept
corr_limit = 0.2

# Names of the columns whose correlation with Price exceeds corr_limit;
# Price passes the filter too, so it is removed from the list afterwards
passes_limit = top_corr_vals > corr_limit
corr_over_lim = list(top_corr_vals.index[passes_limit])
corr_over_lim.remove('Price')

# Restrict the training and validation features to the selected columns
X_train_val2 = X_train_val.loc[:, corr_over_lim]
X_test_val2 = X_test_val.loc[:, corr_over_lim]

# Report how many features we had before and after the filter
print('Previous number of features:', len(X_train_val.columns))
print('New number of features:', len(X_train_val2.columns))

Previous number of features: 81
New number of features: 14


We will also remove one of any two features whose correlation with one another is over 0.8.

In [None]:
# Maximum absolute correlation allowed between two features
colin_limit = 0.8

# Absolute pairwise correlations between the remaining features
corr_matrix = X_train_val2.corr().abs()

# Keep only the entries strictly above the diagonal so each pair appears once
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# A feature is dropped when it exceeds colin_limit against any feature that
# comes before it in the column order
to_drop = [column for column in upper.columns if (upper[column] > colin_limit).any()]

# Remove those features from both the training and validation features
X_train_val3 = X_train_val2.drop(columns=to_drop)
X_test_val3 = X_test_val2.drop(columns=to_drop)

# Report how many features we had before and after the filter
print('Previous number of features:', len(X_train_val2.columns))
print('New number of features:', len(X_train_val3.columns))

# Show dropped columns
print('Dropped features:', to_drop)

Previous number of features: 14
New number of features: 13
Dropped features: ['League_icons']


We can rerun our baseline model to see if we get better results:

In [None]:
# Create and fit model on the reduced feature set
# random_state is fixed so the score is reproducible; with an unseeded forest
# a change in R2 could simply be run-to-run noise rather than an effect of
# the feature selection.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_val3, y_train_val)

# Make predictions on the reduced validation features
y_pred = rf.predict(X_test_val3)

# Evaluate against the validation targets
r2 = r2_score(y_test_val, y_pred)
print('R2 Score:', r2)

R2 Score: 0.5714809669861545


We can see that this has marginally improved the score, but we can build a function that will allow us to compare performance scores across different correlation and colinearity limits:

In [None]:
def test_colin_limits(df_train_val, df_test_val, corr_limit, colin_limit,
                      target='Price', random_state=None):
    """Score a random forest trained on a correlation-filtered feature set.

    Features are selected in two steps, using the training frame only:

    1. keep the features whose absolute correlation with ``target`` is
       greater than ``corr_limit``;
    2. among those, drop any feature whose absolute correlation with a
       feature ranked before it (stronger correlation with the target) is
       greater than ``colin_limit``.

    Parameters
    ----------
    df_train_val : pandas.DataFrame
        Frame used to select features and fit the model. Must contain
        ``target``.
    df_test_val : pandas.DataFrame
        Frame used to score the model. Must contain ``target`` and every
        selected feature.
    corr_limit : float
        Minimum absolute correlation with the target for a feature to be kept.
    colin_limit : float
        Maximum absolute correlation allowed between two kept features.
    target : str, default 'Price'
        Name of the target column.
    random_state : int or None, default None
        Seed passed to ``RandomForestRegressor``. Pass an integer to make the
        returned score reproducible across calls.

    Returns
    -------
    float
        R2 score of the fitted model on ``df_test_val``.

    Raises
    ------
    ValueError
        If no feature has a correlation with the target above ``corr_limit``.
    """
    # Rank every column by absolute correlation with the target, strongest first
    target_corr = df_train_val.corr()[target].abs().sort_values(ascending=False)

    # Features over corr_limit. The target is excluded here so that it never
    # enters the colinearity matrix below: otherwise a feature that correlates
    # strongly with the target would be discarded as "collinear" with it.
    over_limit = target_corr[target_corr > corr_limit].index
    features = [column for column in over_limit if column != target]

    if not features:
        raise ValueError(
            f'No feature has a correlation with {target!r} above {corr_limit}'
        )

    # Absolute feature-to-feature correlations
    corr_matrix = df_train_val[features].corr().abs()

    # Select upper triangle of correlation matrix so each pair is checked once
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

    # Find features correlated above colin_limit with a higher-ranked feature
    to_drop = {column for column in upper.columns if (upper[column] > colin_limit).any()}
    kept = [column for column in features if column not in to_drop]

    # Define X and y
    X_train = df_train_val[kept]
    X_test = df_test_val[kept]

    y_train = df_train_val[target]
    y_test = df_test_val[target]

    # Create and fit model
    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X_train, y_train)

    # Make predictions and evaluate
    y_pred = rf.predict(X_test)

    return r2_score(y_test, y_pred)

We then run this on a range of correlation and colinearity thresholds and find the best performing combination:

In [None]:
# Create a range of correlation and colinearity limits:
corr_lims = np.linspace(0.1, 0.3, 21)
colin_lims = np.linspace(0.5, 1, 11)

# Create an empty dictionary to store values
# (key: correlation limit, value: list of R2 scores, one per colinearity limit)
r2_dict = {}

# Loop through limits and add values to the dictionary.
# - The loop variable `corr` is passed, not the global `corr_limit`; passing
#   the global would evaluate every column of the grid at the same limit.
# - The limits are tuned on the train/validation split rather than on
#   df_train/df_test, so the test set does not influence the choice.
for corr in corr_lims:
    r2_scores = [
        test_colin_limits(df_train_val, df_test_val, corr, colin)
        for colin in colin_lims
    ]
    r2_dict[corr] = r2_scores


In [None]:
# Tabulate the scores: one row per colinearity limit, one column per correlation limit
r2_df = pd.DataFrame(r2_dict, index=colin_lims)

# Best score and its (row, column) position in the table
score_grid = r2_df.values
highest_r2 = score_grid.max()
highest_r2_loc = np.unravel_index(score_grid.argmax(), r2_df.shape)

# Translate the position back into the limits that produced it:
# the row selects the colinearity limit, the column the correlation limit
best_row, best_col = highest_r2_loc
best_colin_lim = colin_lims[best_row]
best_corr_lim = corr_lims[best_col]

print(f'The highest R2 value is {highest_r2:.3f} when the correlation limit is {best_corr_lim:.2f} and colinearity limit is {best_colin_lim:.2f}')


The highest R2 value is 0.538 when the correlation limit is 0.13 and colinearity limit is 0.65


This is a good increase in performance, so we will apply these limits before looking into building and tuning some additional models.

In [None]:
# Load datasets again:
# Forward slashes are used so the relative paths resolve on Windows, macOS and
# Linux alike (a backslash is only treated as a path separator on Windows).
df_train = pd.read_csv('datasets/train_dataset.csv')
df_test = pd.read_csv('datasets/test_dataset.csv')

In [None]:
# Set correlation and colin limits:
corr_limit = best_corr_lim
colin_limit = best_colin_lim

# Compute the correlations with Price on df_train itself, so this cell does
# not rely on `top_corr_vals` left over from the earlier cell that ranked the
# validation split
final_corr_vals = df_train.corr()['Price'].abs().sort_values(ascending=False)

# Features with a correlation over corr_limit. Price is kept out of this list
# so it never enters the colinearity matrix below: otherwise a feature that
# correlates strongly with Price would be dropped as "collinear" with it.
over_limit = final_corr_vals[final_corr_vals > corr_limit].index
feature_cols = [column for column in over_limit if column != 'Price']

# Columns to keep at this stage: the target followed by the selected features
corr_over_lim = ['Price'] + feature_cols

# Reduce features of df_train and df_test
df_train2 = df_train[corr_over_lim]
df_test2 = df_test[corr_over_lim]

# Create a feature-to-feature correlation matrix
corr_matrix = df_train2[feature_cols].corr().abs()

# Select upper triangle of correlation matrix so each pair is checked once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Find features correlated above colin_limit with a higher-ranked feature
to_drop = [column for column in upper.columns if (upper[column] > colin_limit).any()]

# Drop the highly correlated features
df_train3 = df_train2.drop(to_drop, axis=1)
df_test3 = df_test2.drop(to_drop, axis=1)

# Show previous and new number of features (the -1 discounts the Price column)
print('Previous number of features:', df_train.shape[1]-1)
print('New number of features:', df_train3.shape[1]-1)

# Show remaining features
rem_features = [column for column in df_train3.columns if column != 'Price']
print('\n')
print('Remaining features:', rem_features)


Previous number of features: 81
New number of features: 11


Remaining features: ['Rating', 'Dribbling', 'Base Stats', 'Team_paris sg', 'Pace', 'Skills', 'Team_Other', 'Weak Foot', 'Team_fut icons', 'Card Type_if', 'Popularity']


In [None]:
# Save the reduced datasets for the modelling notebooks.
# Forward slashes are used so the files land inside the datasets folder on
# every OS (a backslash is only treated as a path separator on Windows).
df_train3.to_csv('datasets/train_dataset_reduced_features.csv', index=False)
df_test3.to_csv('datasets/test_dataset_reduced_features.csv', index=False)

We have removed quite a lot of features, so we can always go back and relax our correlation and colinearity limits, but for now we can try building different models.