#  AutoFeatureSelector Tool

### Using different feature selector methods to build an Automatic Feature Selection tool
- Pearson Correlation
- Chi-Square
- RFE
- Embedded
- Tree (Random Forest)
- Tree (Light GBM)

### Dataset: FIFA 19 Player Skills
#### Attributes: FIFA 2019 players attributes like Age, Nationality, Overall, Potential, Club, Value, Wage, Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight, LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, Crossing, Finishing, Heading, Accuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.

In [177]:
%matplotlib inline
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
from collections import Counter
import math
from scipy import stats
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier


In [178]:
player_df = pd.read_csv("fifa19.csv")

In [179]:
# Select relevant columns
numcols = ['Overall', 'Crossing','Finishing', 'ShortPassing', 'Dribbling','LongPassing', 'BallControl', 'Acceleration',
           'SprintSpeed', 'Agility', 'Stamina','Volleys','FKAccuracy','Reactions','Balance','ShotPower','Strength',
           'LongShots','Aggression','Interceptions']
catcols = ['Preferred Foot','Position','Body Type','Nationality','Weak Foot']
player_df = player_df[numcols+catcols]
traindf = pd.concat([player_df[numcols], pd.get_dummies(player_df[catcols])],axis=1)
# Exclude features with 'Nationality'
nationality_columns = [col for col in traindf.columns if 'Nationality' in col]
traindf = traindf.drop(nationality_columns, axis=1)

In [180]:
player_df = player_df[numcols+catcols]

In [181]:

features = traindf.columns
traindf = traindf.dropna()
traindf = pd.DataFrame(traindf, columns=features)

y = traindf['Overall'] >= 87
X = traindf.copy()
del X['Overall']

In [182]:
X.head()

Unnamed: 0,Crossing,Finishing,ShortPassing,Dribbling,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Stamina,...,Body Type_Akinfenwa,Body Type_C. Ronaldo,Body Type_Courtois,Body Type_Lean,Body Type_Messi,Body Type_Neymar,Body Type_Normal,Body Type_PLAYER_BODY_TYPE_25,Body Type_Shaqiri,Body Type_Stocky
0,84.0,95.0,90.0,97.0,87.0,96.0,91.0,86.0,91.0,72.0,...,False,False,False,False,True,False,False,False,False,False
1,84.0,94.0,81.0,88.0,77.0,94.0,89.0,91.0,87.0,88.0,...,False,True,False,False,False,False,False,False,False,False
2,79.0,87.0,84.0,96.0,78.0,95.0,94.0,90.0,96.0,81.0,...,False,False,False,False,False,True,False,False,False,False
3,17.0,13.0,50.0,18.0,51.0,42.0,57.0,58.0,60.0,43.0,...,False,False,False,True,False,False,False,False,False,False
4,93.0,82.0,92.0,86.0,91.0,91.0,78.0,76.0,79.0,90.0,...,False,False,False,False,False,False,True,False,False,False


In [183]:
len(X.columns)

59

### Set some fixed set of features

In [184]:
feature_name = list(X.columns)
# no of maximum features we need to select
num_feats=30

## Filter Feature Selection - Pearson Correlation

### Pearson Correlation function

In [185]:
def cor_selector(X, y, num_feats):
    cor_support = X.corrwith(y)
    cor_feature = cor_support.abs().sort_values(ascending=False).index
    return cor_support, cor_feature

In [186]:
cor_support, cor_feature = cor_selector(X, y,num_feats)
print(str(len(cor_feature)), 'selected features')

59 selected features


### List the selected features from Pearson Correlation

In [187]:
print(cor_feature)

Index(['Reactions', 'Body Type_C. Ronaldo', 'Body Type_Messi',
       'Body Type_Neymar', 'Body Type_Courtois',
       'Body Type_PLAYER_BODY_TYPE_25', 'Position_LF', 'Position_RF',
       'ShortPassing', 'Volleys', 'LongPassing', 'FKAccuracy', 'BallControl',
       'Finishing', 'LongShots', 'ShotPower', 'Dribbling', 'Crossing',
       'Agility', 'Weak Foot', 'Stamina', 'Strength', 'SprintSpeed',
       'Acceleration', 'Position_LAM', 'Aggression', 'Balance', 'Position_LW',
       'Interceptions', 'Position_CM', 'Body Type_Lean', 'Position_RB',
       'Position_LS', 'Position_RCM', 'Position_LCM', 'Position_CB',
       'Position_LCB', 'Position_LM', 'Position_GK', 'Position_RW',
       'Body Type_Normal', 'Position_LB', 'Body Type_Stocky', 'Position_RM',
       'Preferred Foot_Left', 'Preferred Foot_Right', 'Position_RCB',
       'Position_ST', 'Position_CAM', 'Position_CDM', 'Position_RWB',
       'Position_RS', 'Position_LWB', 'Position_CF', 'Position_LDM',
       'Position_RDM', 'Po

## Filter Feature Selection - Chi-Sqaure

In [215]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_regression


### Chi-Squared Selector function

In [259]:
def chi_squared_selector(X, y, num_feats):
    chi_support = SelectKBest(score_func=f_regression, k=num_feats)
    chi_support.fit(X, y)
    chi_feature = X.columns[chi_support.get_support()]
    return chi_feature

from sklearn.feature_selection import mutual_info_regression

def mutual_info_selector(X, y, num_feats):
    mi_support = SelectKBest(score_func=mutual_info_regression, k=num_feats)
    mi_support.fit(X, y)
    mi_feature = X.columns[mi_support.get_support()]
    return mi_feature


In [260]:
chi_feature = chi_squared_selector(X, y, num_feats)
print(str(len(chi_feature)), 'selected features')

30 selected features


### List the selected features from Chi-Square 

In [239]:
chi_feature

Index(['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing',
       'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina',
       'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower',
       'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Weak Foot',
       'Position_CM', 'Position_LAM', 'Position_LF', 'Position_LW',
       'Position_RF', 'Body Type_C. Ronaldo', 'Body Type_Courtois',
       'Body Type_Messi', 'Body Type_Neymar', 'Body Type_PLAYER_BODY_TYPE_25'],
      dtype='object')

## Wrapper Feature Selection - Recursive Feature Elimination

In [240]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor


### RFE Selector function

In [241]:
def rfe_selector(X, y, num_feats):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X, y)
    feature_importance = model.feature_importances_

    # Get the indices of the top k features
    top_k_indices = feature_importance.argsort()[-num_feats:][::-1]

    rf_feature = X.columns[top_k_indices]
    return rfe_support, rfe_feature

In [242]:
rfe_support, rfe_feature = rfe_selector(X, y,num_feats)
print(str(len(rfe_feature)), 'selected features')

30 selected features


### List the selected features from RFE

In [243]:
rfe_feature

['Reactions',
 'Balance',
 'Strength',
 'Weak Foot',
 'Preferred Foot_Left',
 'Preferred Foot_Right',
 'Position_CAM',
 'Position_CM',
 'Position_GK',
 'Position_LAM',
 'Position_LCB',
 'Position_LF',
 'Position_LM',
 'Position_LS',
 'Position_LW',
 'Position_RAM',
 'Position_RB',
 'Position_RCB',
 'Position_RCM',
 'Position_RF',
 'Position_RM',
 'Position_RW',
 'Position_ST',
 'Body Type_C. Ronaldo',
 'Body Type_Courtois',
 'Body Type_Lean',
 'Body Type_Messi',
 'Body Type_Neymar',
 'Body Type_PLAYER_BODY_TYPE_25',
 'Body Type_Stocky']

## Embedded Selection - Lasso: SelectFromModel

In [244]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [245]:
def embedded_log_reg_selector(X, y, num_feats):
    # Your code goes here (Multiple lines)
    # Standardize features using StandardScaler
    print("total number of features set by us is :",num_feats)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Choose Logistic Regression with L1 penalty (Lasso) as the estimator
    estimator = LogisticRegression(penalty='l1', solver='liblinear')

    # Use SelectFromModel to perform feature selection
    embedded_lr_selector = SelectFromModel(estimator, max_features=num_feats)
    embedded_lr_selector.fit(X_scaled, y)

    # Get the selected features
    embedded_lr_support = embedded_lr_selector.get_support()
    embedded_lr_feature = X.columns[embedded_lr_support].tolist()

    # Print the selected features
    print("Selected features:")
    print(embedded_lr_feature)

    return embedded_lr_support, embedded_lr_feature

In [246]:
embedded_lr_support, embedded_lr_feature = embedded_log_reg_selector(X, y, num_feats)
print(str(len(embedded_lr_feature)), 'selected features')

total number of features set by us is : 30
Selected features:
['LongPassing', 'Reactions', 'Balance', 'Aggression', 'Preferred Foot_Right', 'Position_CAM', 'Position_CM', 'Position_GK', 'Position_LCB', 'Position_LM', 'Position_LW', 'Position_RB', 'Position_RCB', 'Position_RW', 'Body Type_Lean', 'Body Type_Stocky']
16 selected features


In [247]:
embedded_lr_feature

['LongPassing',
 'Reactions',
 'Balance',
 'Aggression',
 'Preferred Foot_Right',
 'Position_CAM',
 'Position_CM',
 'Position_GK',
 'Position_LCB',
 'Position_LM',
 'Position_LW',
 'Position_RB',
 'Position_RCB',
 'Position_RW',
 'Body Type_Lean',
 'Body Type_Stocky']

## Tree based(Random Forest): SelectFromModel

In [248]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [249]:
def embedded_rf_selector(X, y, num_feats):
    # Your code goes here (Multiple lines)
    
    # Choosing RandomForestClassifier as the estimator
    estimator = RandomForestClassifier()

    # Using SelectFromModel to perform feature selection
    embedded_rf_selector = SelectFromModel(estimator, max_features=num_feats)
    embedded_rf_selector.fit(X, y)

    # Get the selected features
    embedded_rf_support = embedded_rf_selector.get_support()
    embedded_rf_feature = X.columns[embedded_rf_support].tolist()

    # Print the selected features
    print("Selected features:")
    print(embedded_rf_feature)
    # Your code ends here
    return embedded_rf_support, embedded_rf_feature

In [250]:
embedded_rf_support, embedded_rf_feature = embedded_rf_selector(X, y, num_feats)
print(str(len(embedded_rf_feature)), 'selected features')


Selected features:
['Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Stamina', 'Volleys', 'FKAccuracy', 'Reactions', 'Balance', 'ShotPower', 'Strength', 'LongShots', 'Aggression', 'Interceptions']
19 selected features


In [251]:
embedded_rf_feature

['Crossing',
 'Finishing',
 'ShortPassing',
 'Dribbling',
 'LongPassing',
 'BallControl',
 'Acceleration',
 'SprintSpeed',
 'Agility',
 'Stamina',
 'Volleys',
 'FKAccuracy',
 'Reactions',
 'Balance',
 'ShotPower',
 'Strength',
 'LongShots',
 'Aggression',
 'Interceptions']

## Tree based(Light GBM): SelectFromModel

In [252]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

In [253]:
def embedded_lgbm_selector(X, y, num_feats):
    embedded_lgbm_selector = SelectFromModel(LGBMClassifier(n_estimators=100), max_features=num_feats)
    embedded_lgbm_selector.fit(X, y)
    embedded_lgbm_support = embedded_lgbm_selector.get_support()
    embedded_lgbm_feature = X.loc[:, embedded_lgbm_support].columns.tolist()
    return embedded_lgbm_support, embedded_lgbm_feature

In [254]:
embedded_lgbm_support, embedded_lgbm_feature = embedded_lgbm_selector(X, y, num_feats)
print(str(len(embedded_lgbm_feature)), 'selected features')


[LightGBM] [Info] Number of positive: 55, number of negative: 18104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003605 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1664
[LightGBM] [Info] Number of data points in the train set: 18159, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003029 -> initscore=-5.796555
[LightGBM] [Info] Start training from score -5.796555
19 selected features


In [255]:
embedded_lgbm_feature

['Crossing',
 'Finishing',
 'ShortPassing',
 'Dribbling',
 'LongPassing',
 'BallControl',
 'Acceleration',
 'SprintSpeed',
 'Agility',
 'Stamina',
 'Volleys',
 'FKAccuracy',
 'Reactions',
 'Balance',
 'ShotPower',
 'Strength',
 'LongShots',
 'Aggression',
 'Interceptions']

## Putting all of it together: AutoFeatureSelector Tool

In [256]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embedded_lr_support,
                                    'Random Forest':embedded_rf_support, 'LightGBM':embedded_lgbm_support})
# count the selected times for each feature


# Count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df.iloc[:, 1:], axis=1)

# Display the top features
top_features_df = feature_selection_df.sort_values(['Total', 'Feature'], ascending=False).head(num_feats)
top_features_df.index = range(1, len(top_features_df) + 1)

# Display the top features
print(top_features_df)


                          Feature   Pearson  Chi-2    RFE  Logistics  \
1                       Reactions  0.150929   True   True       True   
2                         Balance  0.020264   True   True       True   
3                     LongPassing  0.062561   True  False       True   
4                        Strength  0.032390   True   True      False   
5                      Aggression  0.021708   True  False       True   
6                    ShortPassing  0.063966   True  False      False   
7                         Volleys  0.063605   True  False      False   
8                      FKAccuracy  0.058284   True  False      False   
9                     BallControl  0.058178   True  False      False   
10                      Finishing  0.056083   True  False      False   
11                      LongShots  0.054059   True  False      False   
12                      ShotPower  0.051085   True  False      False   
13                      Dribbling  0.045555   True  False      F

## Python script to select the best one

In [262]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.feature_selection import f_classif
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestRegressor 

def preprocess_dataset(dataset_path):
    df = pd.read_csv(dataset_path)

    # Separate numeric and non-numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    non_numeric_cols = df.select_dtypes(exclude=['number']).columns

    # Impute missing values for numeric columns with the mean
    numeric_imputer = SimpleImputer(strategy='mean')
    df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

    # Impute missing values for non-numeric columns with the most frequent value
    non_numeric_imputer = SimpleImputer(strategy='most_frequent')
    df[non_numeric_cols] = non_numeric_imputer.fit_transform(df[non_numeric_cols])

    # Exclude non-numeric columns
    df_numeric = df[numeric_cols]

    X = df_numeric.iloc[:, :-1]
    y = df_numeric.iloc[:, -1]
    num_feats = len(X.columns)
    return X, y, num_feats

def rf_feature_importance_selector(X, y, num_feats):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X, y)
    feature_importance = model.feature_importances_

    # Get the indices of the top k features
    top_k_indices = feature_importance.argsort()[-num_feats:][::-1]

    rf_feature = X.columns[top_k_indices]
    return rf_feature

def cor_selector(X, y, num_feats):
    cor_support = SelectKBest(score_func=f_classif, k=num_feats)
    cor_support.fit(X, y)
    cor_feature = X.columns[cor_support.get_support()]
    return cor_feature


def chi_squared_selector(X, y, num_feats):
    chi_support = SelectKBest(score_func=chi2, k=num_feats)
    chi_support.fit(X, y)
    chi_feature = X.columns[chi_support.get_support()]
    return chi_feature


def rfe_selector(X, y, num_feats):
    model = LogisticRegression(solver='lbfgs', max_iter=1000)
    rfe_selector = RFE(model, n_features_to_select=num_feats)
    rfe_selector = rfe_selector.fit(X, y)
    rfe_feature = X.columns[rfe_selector.support_]
    return rfe_feature


def embedded_log_reg_selector(X, y, num_feats):
    embedded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1", solver='liblinear'), max_features=num_feats)
    embedded_lr_selector.fit(X, y)
    embedded_lr_feature = X.columns[embedded_lr_selector.get_support()]
    return embedded_lr_feature


def embedded_rf_selector(X, y, num_feats):
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X, y)
    embedded_rf_selector = SelectFromModel(model, max_features=num_feats)
    embedded_rf_selector.fit(X, y)
    embedded_rf_feature = X.columns[embedded_rf_selector.get_support()]
    return embedded_rf_feature


def embedded_lgbm_selector(X, y, num_feats):
    model = LGBMClassifier(n_estimators=100, random_state=0)
    model.fit(X, y)
    embedded_lgbm_selector = SelectFromModel(model, max_features=num_feats)
    embedded_lgbm_selector.fit(X, y)
    embedded_lgbm_feature = X.columns[embedded_lgbm_selector.get_support()]
    return embedded_lgbm_feature


def auto_feature_selector(dataset_path, methods=[]):
    X, y, num_feats = preprocess_dataset(dataset_path)
    selected_features = {}

    is_regression = y.dtype.kind == 'f'

    if 'pearson' in methods:
        selected_features['pearson'] = cor_selector(X, y, num_feats)

    if 'chi-square' in methods and not is_regression:
        chi_feature = chi_squared_selector(X, y, num_feats)
        selected_features['chi-square'] = chi_feature

    if 'rfe' in methods and not is_regression:
        selected_features['rfe'] = rf_feature_importance_selector(X, y, num_feats)

    if 'log-reg' in methods and not is_regression:
        log_reg_model = LogisticRegression(solver='liblinear', penalty="l1")
        selected_features['log-reg'] = embedded_selector(X, y, log_reg_model, num_feats)

    if 'rf' in methods:
        selected_features['rf'] = rf_feature_importance_selector(X, y, num_feats)

    if 'lgbm' in methods:
        selected_features['lgbm'] = embedded_lgbm_selector(X, y, num_feats)

    # Convert the selected features to DataFrame for consistent output
    result_df = pd.DataFrame(selected_features)
    return result_df




In [None]:
dataset_path = "fifa19.csv"
methods_to_try = ['pearson', 'chi-square', 'rfe', 'log-reg', 'rf', 'lgbm']
best_features = auto_feature_selector(dataset_path, methods=methods_to_try)
print(best_features)
