In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

df = pd.DataFrame(X, columns=iris.feature_names)
print("Original Dataset:")
print(df.head())


Original Dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [2]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.2)
X_high_variance = selector.fit_transform(X)
print("\nDataset after Variance Thresholding:")
print(pd.DataFrame(X_high_variance, columns=df.columns[selector.get_support(indices=True)]))


Dataset after Variance Thresholding:
     sepal length (cm)  petal length (cm)  petal width (cm)
0                  5.1                1.4               0.2
1                  4.9                1.4               0.2
2                  4.7                1.3               0.2
3                  4.6                1.5               0.2
4                  5.0                1.4               0.2
..                 ...                ...               ...
145                6.7                5.2               2.3
146                6.3                5.0               1.9
147                6.5                5.2               2.0
148                6.2                5.4               2.3
149                5.9                5.1               1.8

[150 rows x 3 columns]


In [4]:
import numpy as np

corr_matrix = pd.DataFrame(X_high_variance, columns=df.columns[selector.get_support(indices=True)]).corr()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))


to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

X_low_correlation = pd.DataFrame(X_high_variance, columns=df.columns[selector.get_support()])
print("\nDataset after removing highly correlated features:")
print(X_low_correlation.head())


Dataset after removing highly correlated features:
   sepal length (cm)  petal length (cm)  petal width (cm)
0                5.1                1.4               0.2
1                4.9                1.4               0.2
2                4.7                1.3               0.2
3                4.6                1.5               0.2
4                5.0                1.4               0.2


In [5]:
from sklearn.feature_selection import SelectKBest, f_classif
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_low_correlation, y, test_size=0.2, random_state=42)
# Configure to select the top 2 features
k_best_features = 2
selector_kbest = SelectKBest(score_func=f_classif, k=k_best_features)
# Apply the SelectKBest object to the dataset
X_train_kbest = selector_kbest.fit_transform(X_train, y_train)
X_test_kbest = selector_kbest.transform(X_test)
print(f"\nDataset after SelectKBest Feature Selection (Top {k_best_features} features): ")
print(pd.DataFrame(X_train_kbest, columns=X_train.columns[selector_kbest.get_support()]))


Dataset after SelectKBest Feature Selection (Top 2 features): 
     petal length (cm)  petal width (cm)
0                  1.0               0.2
1                  1.5               0.4
2                  4.4               1.4
3                  1.6               0.2
4                  1.3               0.2
..                 ...               ...
115                4.0               1.3
116                4.5               1.7
117                1.2               0.2
118                4.0               1.2
119                5.9               2.1

[120 rows x 2 columns]
