Exploring Feature Selection

Classification Model Development

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Read the glass dataset and discard the 'idno' identifier column in one step.
glass = pd.read_csv('glass.csv').drop(columns=['idno'])

# Report the number of missing entries per column, followed by a blank line.
print("Missing values in each column:")
print(glass.isna().sum(), end="\n\n")

# 'type' is the class label; every remaining column is used as a feature.
y = glass['type']
X = glass.drop(columns=['type'])

# Hold out 20% of the rows for testing, stratified on the label, seed 10.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=10,
)

# Ranking 1: univariate ANOVA F-test, fitted on the training split only.
selector = SelectKBest(score_func=f_classif, k=5).fit(X_train, y_train)
top5_uni = [
    column
    for column, is_selected in zip(X.columns, selector.get_support())
    if is_selected
]
print('Top 5 Univariate Features:', top5_uni)

# Ranking 2: impurity-based importances from an extra-trees ensemble.
# NOTE: despite its name, `rf` holds an ExtraTreesClassifier; the name is left
# as-is in case other cells refer to it.
rf = ExtraTreesClassifier(random_state=10).fit(X_train, y_train)
importances = pd.Series(rf.feature_importances_, index=X.columns)
top5_imp = importances.sort_values(ascending=False).index[:5].tolist()
print('Top 5 Important Features:', top5_imp)

# Ranking 3: absolute correlation between each training feature and the label
# (the label is treated as a plain number here).
correlations = X_train.corrwith(y_train).abs()
top5_corr = correlations.sort_values(ascending=False).index[:5].tolist()
print('Top 5 Correlated Features:', top5_corr)
print()

def evaluate_model(X_tr, X_te, y_tr, y_te, features, label,
                   n_splits=8, random_state=10):
    """Fit Gaussian Naive Bayes on a feature subset and print its scores.

    The classifier is trained on ``X_tr[features]`` and scored on
    ``X_te[features]``. A shuffled K-fold cross-validation accuracy is also
    computed on the training data only. Everything is printed; nothing is
    returned.

    Parameters
    ----------
    X_tr, X_te : DataFrame
        Training and test feature tables; both must contain ``features``.
    y_tr, y_te : Series or array-like
        Class labels matching ``X_tr`` and ``X_te``.
    features : list-like of column labels
        Columns to use for fitting and prediction.
    label : str
        Heading printed above the metrics for this run.
    n_splits : int, default 8
        Number of cross-validation folds.
    random_state : int, default 10
        Seed for the fold shuffling.

    Notes
    -----
    If ``features`` was chosen using ``y_tr`` before this call, the
    cross-validation accuracy printed here is likely optimistic, because the
    selection step is not repeated inside each fold.
    """
    # Slice once so fitting, prediction and cross-validation share the same view.
    train_subset = X_tr[features]
    test_subset = X_te[features]

    gnb = GaussianNB()
    gnb.fit(train_subset, y_tr)
    y_pred = gnb.predict(test_subset)

    # NOTE(review): plain KFold does not preserve class proportions per fold;
    # StratifiedKFold may be preferable for imbalanced labels. Left unchanged
    # so the reported numbers stay comparable.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(gnb, train_subset, y_tr, cv=cv, scoring='accuracy')

    print(f'--- {label} ---')
    print('Test Accuracy:', accuracy_score(y_te, y_pred))
    # Weighted averages account for class support; zero_division=0 scores a
    # class with no predicted samples as 0 instead of emitting a warning.
    print('Test Precision:', precision_score(y_te, y_pred, average="weighted", zero_division=0))
    print('Test Recall:', recall_score(y_te, y_pred, average="weighted", zero_division=0))
    print('Test F1:', f1_score(y_te, y_pred, average="weighted", zero_division=0))
    print('CV Mean Accuracy:', np.mean(cv_scores))
    print('Classification Report:\n', classification_report(y_te, y_pred))

# Score the classifier on the full feature set first, then on each of the
# three five-feature subsets.
for feature_set, set_label in (
    (X.columns, 'All Features'),
    (top5_uni, 'Univariate Selection'),
    (top5_imp, 'Feature Importance'),
    (top5_corr, 'Correlation'),
):
    evaluate_model(X_train, X_test, y_train, y_test, feature_set, set_label)


Missing values in each column:
RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
type    0
dtype: int64

Top 5 Univariate Features: ['Na', 'Mg', 'Al', 'K', 'Ba']
Top 5 Important Features: ['Mg', 'Al', 'RI', 'Ca', 'K']
Top 5 Correlated Features: ['Mg', 'Al', 'Ba', 'Na', 'Fe']

--- All Features ---
Test Accuracy: 0.32558139534883723
Test Precision: 0.37522361359570666
Test Recall: 0.32558139534883723
Test F1: 0.3303813327069141
CV Mean Accuracy: 0.450487012987013
Classification Report:
               precision    recall  f1-score   support

           1       0.38      0.36      0.37        14
           2       0.20      0.07      0.10        15
           3       0.08      0.33      0.13         3
           5       0.17      0.33      0.22         3
           6       0.50      0.50      0.50         2
           7       1.00      0.83      0.91         6

    accuracy                           0.33        43
   macro avg       0.39      0.40   