# Filter Based Feature Selection

## 1. ❌ Duplicate Features

In [5]:
import pandas as pd
import numpy as np

In [6]:
feature = {
    'id' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'gpa': [3.5, 3.67, 3.5, 3.90, 3.25, 3.67, 3.25, 3.33, 3.8, 3.16],
    'cgpa': [3.5, 3.67, 3.5, 3.90, 3.25, 3.67, 3.25, 3.33, 3.8, 3.16]
}

In [7]:
data = pd.DataFrame(data = feature)

In [8]:
data.head()

Unnamed: 0,id,gpa,cgpa
0,1,3.5,3.5
1,2,3.67,3.67
2,3,3.5,3.5
3,4,3.9,3.9
4,5,3.25,3.25


In [9]:
data.nunique()

id      10
gpa      7
cgpa     7
dtype: int64

In [10]:
duplicates = data.T.duplicated()
duplicates

id      False
gpa     False
cgpa     True
dtype: bool

In [11]:
new_data = data.loc[:, ~duplicates]
new_data

Unnamed: 0,id,gpa
0,1,3.5
1,2,3.67
2,3,3.5
3,4,3.9
4,5,3.25
5,6,3.67
6,7,3.25
7,8,3.33
8,9,3.8
9,10,3.16


## 2. 📉 Variance Threshold

In [13]:
from sklearn.feature_selection import VarianceThreshold

In [23]:
df = pd.DataFrame({
    'gpa': [3.50, 3.67, 3.50, 3.90, 3.25],
    'study_hours': [2, 2.5, 2.1, 2.3, 2],
    'same_score': [0.99, 0.99, 0.99, 0.99, 0.99]
})

In [35]:
selector = VarianceThreshold(threshold = 0.01)
selector

In [27]:
selector.fit_transform(df)

array([[3.5 , 2.  ],
       [3.67, 2.5 ],
       [3.5 , 2.1 ],
       [3.9 , 2.3 ],
       [3.25, 2.  ]])

In [33]:
print(selector.get_support())

[ True  True False]


## 3. 🔗 Correlation

In [41]:
corr_matrix = data.corr().abs()
corr_matrix

Unnamed: 0,id,gpa,cgpa
id,1.0,0.328227,0.328227
gpa,0.328227,1.0,1.0
cgpa,0.328227,1.0,1.0


In [43]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))
upper

Unnamed: 0,id,gpa,cgpa
id,,0.328227,0.328227
gpa,,,1.0
cgpa,,,


In [45]:
to_drop = [column for column in upper.columns if any (upper[column] > 0.9)]

In [47]:
data = data.drop(columns = to_drop)
data

Unnamed: 0,id,gpa
0,1,3.5
1,2,3.67
2,3,3.5
3,4,3.9
4,5,3.25
5,6,3.67
6,7,3.25
7,8,3.33
8,9,3.8
9,10,3.16


## 4. 🧪 ANOVA (Analysis of Variance)

In [50]:
exm_data = pd.DataFrame({
    'study_hours': [2, 3, 5, 6, 8, 9],
    'exam_score': [55, 60, 70, 72, 85, 88],
    'category': ['low', 'low', 'medium', 'medium', 'high', 'high']
})

In [72]:
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.preprocessing import LabelEncoder

In [56]:
le = LabelEncoder()

In [60]:
X = exm_data[['study_hours', 'exam_score']]
y = exm_data[['category']]

In [66]:
exm_data['category_encoded'] = le.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [68]:
X_new = SelectKBest(score_func = f_classif, k = 5).fit_transform(X, y)

  y = column_or_1d(y, warn=True)


In [82]:
X_new

array([[ 2, 55],
       [ 3, 60],
       [ 5, 70],
       [ 6, 72],
       [ 8, 85],
       [ 9, 88]], dtype=int64)

## 5. 🧮 Chi-Square Test

In [85]:
from sklearn.feature_selection import chi2

In [89]:
chi2_df = pd.DataFrame({
    'study_group': ['yes', 'no', 'yes', 'no', 'yes'],
    'study_hours': [1, 2, 3, 1, 2],
    'result': ['pass', 'fail', 'pass', 'fail', 'pass']
})

In [91]:
encoder = LabelEncoder()

In [95]:
chi2_df['study_group_encoded'] = encoder.fit_transform(chi2_df['study_group'])
chi2_df['result_encoded'] = encoder.fit_transform(chi2_df['result']) 

In [99]:
X = chi2_df[['study_group_encoded', 'study_hours']]
y = chi2_df['result_encoded']

In [105]:
selector = SelectKBest(score_func=chi2, k='all')
X_new = selector.fit_transform(X, y)

In [109]:
scores = pd.DataFrame({
    'Feature': X.columns,
    'Chi2 Score': selector.scores_,
    'p-value': selector.pvalues_
})
scores

Unnamed: 0,Feature,Chi2 Score,p-value
0,study_group_encoded,2.0,0.157299
1,study_hours,0.166667,0.683091
