***ANOVA: F-VALUE for FEATURE SELECTION***

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
import yfinance as yf

In [7]:
symbol = 'AAPL'
start = '2014-01-01'
end = '2024-01-01'

dataset = yf.download(symbol, start, end)
dataset.columns = dataset.columns.get_level_values(0)
dataset.head()

[*********************100%***********************]  1 of 1 completed




Price,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-02,17.192822,17.314045,17.15832,17.272083,234684800
2014-01-03,16.81517,17.210543,16.798074,17.184434,392467600
2014-01-06,16.90686,16.996067,16.585775,16.705444,412610800
2014-01-07,16.785954,16.969963,16.720058,16.918988,317209200
2014-01-08,16.892258,16.957531,16.743993,16.747721,258529600


In [9]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Close'].shift(-1) > dataset['Close'],1,0)
dataset['Returns'] = dataset['Close'].pct_change()
dataset = dataset.dropna()

dataset.shape

(2514, 9)

In [11]:
x = dataset[['Open', 'High', 'Low', 'Volume', 'Increase_Decrease', 'Buy_Sell_on_Open', 'Buy_Sell', 'Returns']]
y = dataset['Close']

In [12]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [14]:
#create on selectKBest object to select features with two best ANOVA F-values
fvalue_selector = SelectKBest(f_classif, k = 2)

#Apply the selectionKBest objectto the features and target
x_kbest = fvalue_selector.fit_transform(x, y)

In [16]:
print('Original number of features:', x.shape[1])
print('Reduced number of features:', x_kbest.shape[1])

Original number of features: 8
Reduced number of features: 2


In [17]:
fvalue_selector.scores_

array([2.21177075e+03, 5.33419390e+03, 6.79230922e+03, 5.35879916e+00,
       9.72666761e-01, 9.19922967e-01, 8.81724063e-01, 2.28137780e+00])

In [18]:
fvalue_selector.pvalues_

array([1.81824835e-58, 6.40338090e-66, 5.75757961e-68, 5.63925235e-09,
       5.77535882e-01, 6.71453079e-01, 7.37575878e-01, 9.36134475e-04])

In [21]:
names = x.columns.values[fvalue_selector.get_support()]
scores = fvalue_selector.scores_[fvalue_selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'F_scores'])

#sort the dataframe for better visualization
ns_df_sorted = ns_df.sort_values(['F_scores', 'Feat_names'], ascending = [False, True])
print(ns_df_sorted)

  Feat_names     F_scores
1        Low  6792.309223
0       High  5334.193902
