In [8]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

data_path = 'E:/Research data/MS/All-based-Sensory.csv'
data_df = pd.read_csv(data_path)

features = data_df.drop(columns=['Motor System', 'Image'])
labels = data_df['Motor System']

features.replace([np.inf, -np.inf], np.nan, inplace=True)

imputer = SimpleImputer(strategy='median')
features = imputer.fit_transform(features)

scaler = StandardScaler()
features = scaler.fit_transform(features)

features_non_negative = features - features.min(axis=0)

if labels.dtype == 'object':
    encoder = LabelEncoder()
    labels = encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
X_train_non_negative, X_test_non_negative = train_test_split(features_non_negative, test_size=0.3, random_state=42)

In [11]:
def univariate_selection(X, y):
    selector = SelectKBest(score_func=f_classif, k='all')
    selector.fit(X, y)
    scores = selector.scores_
    return scores

def random_forest_importance(X, y):
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    importances = model.feature_importances_
    return importances

def lasso_regularization(X, y):
    lasso = LassoCV(cv=5, random_state=42).fit(X, y)
    importances = np.abs(lasso.coef_)
    return importances

def mutual_info_gain(X, y):
    selector = SelectKBest(score_func=mutual_info_classif, k='all')
    selector.fit(X, y)
    scores = selector.scores_
    return scores

def chi_square(X, y):
    selector = SelectKBest(score_func=chi2, k='all')
    selector.fit(X, y)
    scores = selector.scores_
    return scores

def correlation_with_target(X, y):
    corrs = [np.corrcoef(X[:, i], y)[0, 1] for i in range(X.shape[1])]
    corrs = np.abs(corrs)
    return corrs

def anova_f_value(X, y):
    selector = SelectKBest(score_func=f_classif, k='all')
    selector.fit(X, y)
    scores = selector.scores_
    return scores

def recursive_feature_elimination(X, y):
    model = LogisticRegression(max_iter=1000)
    selector = RFE(model, n_features_to_select=1)
    selector.fit(X, y)
    rankings = selector.ranking_
    return rankings

In [12]:
ranking_methods = {
    'Univariate Selection': univariate_selection,
    'Random Forest': random_forest_importance,
    'Lasso Regularization': lasso_regularization,
    'Mutual Information Gain': mutual_info_gain,
    'Chi-Square': chi_square,
    'Correlation with Target': correlation_with_target,
    'ANOVA F-value': anova_f_value,
    'RFE': recursive_feature_elimination
}

feature_scores = {}
for method_name, method_func in ranking_methods.items():
    if method_name == 'Chi-Square':
        scores = method_func(X_train_non_negative, y_train)
    else:
        scores = method_func(X_train, y_train)
    feature_scores[method_name] = scores

feature_names = data_df.drop(columns=['Sensory', 'Image']).columns

ranking_dfs = {}
for method_name, scores in feature_scores.items():
    if method_name == 'RFE':
        ranking = scores  
    else:
        ranking = scores.argsort()[::-1].argsort() + 1  

    ranking_df = pd.DataFrame({'Feature': feature_names, 'Rank': ranking})
    ranking_df = ranking_df.sort_values(by='Rank')
    ranking_dfs[method_name] = ranking_df
    print(f"\n\n\n=============== Feature ranking using {method_name} ==========================\n")
    print(ranking_df)

def train_and_evaluate_model(X_train, X_test, y_train, y_test, selected_features, feature_names, num_features):
    # Select top features
    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_selected, y_train)

    y_pred = model.predict(X_test_selected)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy using top {num_features} features: {accuracy}")
    print(f"Classification Report using top {num_features} features:")
    print(classification_report(y_test, y_pred))

top_features_counts = [10, 15, 20, 25]

for method_name, method_func in ranking_methods.items():
    print(f"\n\n\n=============== Feature ranking using {method_name} ==========================\n")
    for num_features in top_features_counts:
        selected_features = np.argsort(-feature_scores[method_name])[:num_features]
        train_and_evaluate_model(X_train, X_test, y_train, y_test, selected_features, feature_names, num_features)





               Feature  Rank
23              Visual     1
15       gabor_entropy     2
14        gabor_energy     3
19            contrast     4
17       dissimilarity     5
22        Motor System     6
18         homogeneity     7
11  Standard Deviation     8
12          lbp_energy     9
21          brightness    10
10                Mean    11
13         lbp_entropy    12
20              energy    13
1             PA_ratio    14
5           ConvexArea    15
7           FilledArea    16
0                 Area    17
3          Circularity    18
4        EquivDiameter    19
9    Minor axis length    20
16         correlation    21
2             Solidity    22
6               Extent    23
8    Major axis length    24




               Feature  Rank
23              Visual     1
15       gabor_entropy     2
19            contrast     3
14        gabor_energy     4
22        Motor System     5
17       dissimilarity     6
16         correlation     7
21          brightness     8
11  St