In [1]:
# ----------------------------------------------------------------
# Univariate feature selection with scikit-learn's Select* transformers
# ----------------------------------------------------------------

In [2]:
# Load the student dataset with pandas and separate predictors from target:
# every column except the last goes to X, the last column becomes Y.
import pandas as pd

f = pd.read_csv('Students2.csv')
X, Y = f.iloc[:, :-1], f.iloc[:, -1]

In [3]:
# Import various select transforms along with the f_regression mode
from sklearn.feature_selection import SelectKBest,             \
                                      SelectPercentile,        \
                                      GenericUnivariateSelect, \
                                      f_regression

In [4]:
# SelectKBest: keep the k highest-scoring features (k=3 here).
# f_regression is the scoring function for regression targets;
# f_classif would be the counterpart for classification.

selectorK = SelectKBest(k=3, score_func=f_regression)
selectorK.fit(X, Y)            # score every column of X against Y
x_k = selectorK.transform(X)   # reduce X to the 3 retained columns
x_k

array([[   0,    6, 2491],
       [   1,    7, 2303],
       [   1,    6, 2475],
       [   1,    8, 2282],
       [   1,    8, 2359],
       [   2,    8, 2354],
       [   2,    6, 2943],
       [   3,    6, 2119],
       [   3,    7, 2511],
       [   3,    7, 2666],
       [   3,    8, 2934],
       [   4,    8, 2838],
       [   4,    7, 2102],
       [   4,    7, 2560],
       [   5,    8, 2068],
       [   5,    6, 2541],
       [   6,    6, 2690],
       [   6,    7, 2956],
       [   7,    8, 2239],
       [   7,    8, 2703],
       [   7,    6, 2603],
       [   7,    6, 2031],
       [   8,    7, 2885],
       [   8,    8, 2153],
       [   9,    8, 2384],
       [   9,    5, 2882],
       [   9,    8, 2271],
       [  10,    8, 2264],
       [  10,    8, 2522],
       [  11,    7, 2279]])

In [5]:
# Pull the F-scores and p-values computed by the fitted selector.
# These arrays hold one entry per input feature (not only the selected ones).
f_score, p_values = selectorK.scores_, selectorK.pvalues_

In [6]:
# Tabulate feature name, F-score and p-value for every column of X,
# then print the names of the columns that SelectKBest retained.
columns = list(X.columns)

# Three spacer lines ahead of the table
for _ in range(3):
    print (" ")

print ("    Features     ", "F-Score    ", "P-Values")
print ("    -----------  ---------    ---------")

# One row per feature: name left-aligned to 12 chars, both statistics
# right-aligned in 8-char fields (2 and 6 decimal places respectively).
for name, score, pval in zip(columns, f_score, p_values):
    print("    ", name.ljust(12), f"{score:8.2f}", "    ", f"{pval:8.6f}")

# Positional indices of the retained features -> their column names
cols = selectorK.get_support(indices=True)
selectedCols = X.columns[cols].tolist()

print(selectedCols)

 
 
 
    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086
['Hours', 'sHours', 'calories']


In [7]:
# SelectPercentile: same idea as SelectKBest, but the cut-off is given as a
# percentage of features to keep (here the top 50%).

selectorP = SelectPercentile(percentile=50, score_func=f_regression)
selectorP.fit(X, Y)
x_p = selectorP.transform(X)
x_p

array([[   0,    6, 2491],
       [   1,    7, 2303],
       [   1,    6, 2475],
       [   1,    8, 2282],
       [   1,    8, 2359],
       [   2,    8, 2354],
       [   2,    6, 2943],
       [   3,    6, 2119],
       [   3,    7, 2511],
       [   3,    7, 2666],
       [   3,    8, 2934],
       [   4,    8, 2838],
       [   4,    7, 2102],
       [   4,    7, 2560],
       [   5,    8, 2068],
       [   5,    6, 2541],
       [   6,    6, 2690],
       [   6,    7, 2956],
       [   7,    8, 2239],
       [   7,    8, 2703],
       [   7,    6, 2603],
       [   7,    6, 2031],
       [   8,    7, 2885],
       [   8,    8, 2153],
       [   9,    8, 2384],
       [   9,    5, 2882],
       [   9,    8, 2271],
       [  10,    8, 2264],
       [  10,    8, 2522],
       [  11,    7, 2279]])

In [8]:
# GenericUnivariateSelect in 'k_best' mode: `param` plays the role of k,
# so this asks for the 3 best-scoring features.
selectorG1 = GenericUnivariateSelect(param=3,
                                     mode='k_best',
                                     score_func=f_regression)
selectorG1.fit(X, Y)
x_g1 = selectorG1.transform(X)
x_g1

array([[   0,    6, 2491],
       [   1,    7, 2303],
       [   1,    6, 2475],
       [   1,    8, 2282],
       [   1,    8, 2359],
       [   2,    8, 2354],
       [   2,    6, 2943],
       [   3,    6, 2119],
       [   3,    7, 2511],
       [   3,    7, 2666],
       [   3,    8, 2934],
       [   4,    8, 2838],
       [   4,    7, 2102],
       [   4,    7, 2560],
       [   5,    8, 2068],
       [   5,    6, 2541],
       [   6,    6, 2690],
       [   6,    7, 2956],
       [   7,    8, 2239],
       [   7,    8, 2703],
       [   7,    6, 2603],
       [   7,    6, 2031],
       [   8,    7, 2885],
       [   8,    8, 2153],
       [   9,    8, 2384],
       [   9,    5, 2882],
       [   9,    8, 2271],
       [  10,    8, 2264],
       [  10,    8, 2522],
       [  11,    7, 2279]])

In [None]:
# Implement GenericUnivariateSelect with percentile
# In 'percentile' mode `param` is the percentage of features to keep,
# so param=50 requests the top 50% of features by f_regression score.
selectorG2 = GenericUnivariateSelect(score_func=f_regression,
                                     mode='percentile',
                                     param=50)
x_g2 = selectorG2.fit_transform(X,Y)
# Display the transformed array; the name must match the assignment above
# (the previous `x_g` was never defined and raised NameError).
x_g2