# Feature selection with F-Test

In [1]:
# --------------------------------------------------------
# Apply Feature Selection with F-Test on Linear Regression
# Compare the result with selected features
# --------------------------------------------------------

In [2]:
# Import libraries
import pandas as pd

In [3]:
# Read the file
f = pd.read_csv('Students2.csv')

In [4]:
f

Unnamed: 0,Hours,sHours,hoursplayed,income,distance,calories,Marks
0,0,6,6,146,9,2491,34
1,1,7,2,112,5,2303,36
2,1,6,1,84,7,2475,33
3,1,8,5,134,0,2282,39
4,1,8,5,104,8,2359,42
5,2,8,4,80,1,2354,45
6,2,6,3,93,0,2943,38
7,3,6,3,83,7,2119,45
8,3,7,6,139,1,2511,53
9,3,7,5,138,10,2666,46


In [5]:
# Split the columns into independent (X) and dependent (Y) features
X = f.iloc[:,:-1]
Y = f.iloc[:, -1]

In [6]:
# Perform Linear Regression using original dataset
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

In [7]:
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = \
    train_test_split(X, Y, test_size = 0.4, random_state = 1234)

lr.fit(X_train, Y_train)

Y_predict = lr.predict(X_test)
Y_predict

array([37.4304362 , 56.8479958 , 38.85467354, 38.975673  , 82.86947672,
       59.67634562, 52.29968093, 81.17290759, 67.77562938, 63.7577854 ,
       82.18209854, 86.65908659])

In [8]:
# Calculate the RMSE error for the regression
from sklearn.metrics import mean_squared_error
import math

rmse = math.sqrt(mean_squared_error(Y_test, Y_predict))
rmse

6.982206715357434

In [9]:
# import and perform the f_regression to get the F-Score and P-Values
from sklearn.feature_selection import f_regression as fr
result = fr(X,Y)
result

(array([1.41905913e+02, 4.57019756e+00, 1.44882087e-03, 1.59990513e-01,
        3.16606568e-03, 4.04208927e-01]),
 array([1.77038466e-12, 4.14028344e-02, 9.69907241e-01, 6.92200477e-01,
        9.55528076e-01, 5.30086171e-01]))

In [10]:
# Split the result tuple into F_Score and P_Values
f_score = result[0]
p_values = result[1]

print(f_score)
print(p_values)

[1.41905913e+02 4.57019756e+00 1.44882087e-03 1.59990513e-01
 3.16606568e-03 4.04208927e-01]
[1.77038466e-12 4.14028344e-02 9.69907241e-01 6.92200477e-01
 9.55528076e-01 5.30086171e-01]


In [11]:
# Print the table of Features, F-Score and P-values
columns = list(X.columns)

print (" ")
print (" ")
print (" ")

print ("    Features     ", "F-Score    ", "P-Values")
print ("    -----------  ---------    ---------")

for i in range(0, len(columns)):
    f1 = "%4.2f" % f_score[i]
    p1 = "%2.6f" % p_values[i]
    print("    ", columns[i].ljust(12), f1.rjust(8),"    ", p1.rjust(8))

 
 
 
    Features      F-Score     P-Values
    -----------  ---------    ---------
     Hours          141.91      0.000000
     sHours           4.57      0.041403
     hoursplayed      0.00      0.969907
     income           0.16      0.692200
     distance         0.00      0.955528
     calories         0.40      0.530086


In [12]:
# Perform the Linear Regression with reduced features
# In this case, we kept Hours and sHours because both are below 0.05 (alpha)
# we discarded the rest
X_train_n = X_train[['Hours', 'sHours']]
X_test_n = X_test[['Hours', 'sHours']]

lr1 = LinearRegression()
lr1.fit(X_train_n, Y_train)

Y_predict_n = lr1.predict(X_test_n)

In [13]:
# Calculate the RMSE with reduced features
rmse_n = math.sqrt(mean_squared_error(Y_test, Y_predict_n))
rmse_n

5.09721728108113

In [14]:
print(rmse)
print(rmse_n)

6.982206715357434
5.09721728108113


In [None]:
# rmse was improved with only 2 features - the ones we selected due to their p values being below 0