In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Load the dataset
parkinsons_data = pd.read_csv('/content/parkinsons.csv')

# Prepare data: Drop irrelevant columns and separate features (X) and target (y)
X = parkinsons_data.drop(columns=['name', 'status'])
y = parkinsons_data['status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Initialize the SVC model
svc = SVC(kernel="linear")

# Initialize RFE with the SVC estimator, specifying the number of features to select
rfe = RFE(estimator=svc, n_features_to_select=10)
rfe.fit(X_train, y_train)

# Create a DataFrame with feature names and their ranks
feature_ranking = pd.DataFrame({
    'Feature': X.columns,
    'Rank': rfe.ranking_
}).sort_values(by='Rank')

# Display the ranked features
print("Feature Ranking:\n", feature_ranking)

Feature Ranking:
              Feature  Rank
21               PPE     1
19           spread2     1
18           spread1     1
16              RPDE     1
14               NHR     1
12          MDVP:APQ     1
11      Shimmer:APQ5     1
20                D2     1
8       MDVP:Shimmer     1
9   MDVP:Shimmer(dB)     1
0        MDVP:Fo(Hz)     2
15               HNR     3
1       MDVP:Fhi(Hz)     4
13       Shimmer:DDA     5
17               DFA     6
10      Shimmer:APQ3     7
3     MDVP:Jitter(%)     8
2       MDVP:Flo(Hz)     9
7         Jitter:DDP    10
6           MDVP:PPQ    11
5           MDVP:RAP    12
4   MDVP:Jitter(Abs)    13


In [5]:
# Variables to store results
train_accuracies = []
train_f1_scores = []
test_accuracies = []
test_f1_scores = []
num_features = []
removed_feature_names = []
removed_feature_ranks = []

# Set minimum features to avoid empty feature set
min_features = 9

# Start with original dataset
X_train_reduced, X_test_reduced = X_train.copy(), X_test.copy()

for i in range(len(feature_ranking)):
    # Train the model with the current set of features
    svc.fit(X_train_reduced, y_train)

    # Make predictions on train and test sets
    y_train_pred = svc.predict(X_train_reduced)
    y_test_pred = svc.predict(X_test_reduced)

    # Store performance metrics
    train_accuracies.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred))
    test_accuracies.append(accuracy_score(y_test, y_test_pred))
    test_f1_scores.append(f1_score(y_test, y_test_pred))
    num_features.append(X_train_reduced.shape[1])

    # Stop if reaching minimum feature count
    if X_train_reduced.shape[1] <= min_features:
        removed_feature_names.append("None")
        removed_feature_ranks.append("N/A")
        break

    # Drop the least important feature
    least_important_feature = feature_ranking.iloc[i]['Feature']
    feature_rank = feature_ranking.iloc[i]['Rank']
    removed_feature_names.append(least_important_feature)
    removed_feature_ranks.append(feature_rank)

    X_train_reduced = X_train_reduced.drop(columns=[least_important_feature])
    X_test_reduced = X_test_reduced.drop(columns=[least_important_feature])

# Create performance DataFrame
performance_df = pd.DataFrame({
    'Num_Features': num_features,
    'Removed_Feature': removed_feature_names,
    'Feature_Rank': removed_feature_ranks,
    'Train_Accuracy': train_accuracies,
    'Test_Accuracy': test_accuracies,
    'Train_F1_Score': train_f1_scores,
    'Test_F1_Score': test_f1_scores
})

print("Performance Metrics:\n", performance_df)

Performance Metrics:
     Num_Features   Removed_Feature Feature_Rank  Train_Accuracy  \
0             22               PPE            1        0.882353   
1             21           spread2            1        0.889706   
2             20           spread1            1        0.882353   
3             19              RPDE            1        0.875000   
4             18               NHR            1        0.882353   
5             17          MDVP:APQ            1        0.882353   
6             16      Shimmer:APQ5            1        0.882353   
7             15                D2            1        0.882353   
8             14      MDVP:Shimmer            1        0.816176   
9             13  MDVP:Shimmer(dB)            1        0.816176   
10            12       MDVP:Fo(Hz)            2        0.816176   
11            11               HNR            3        0.808824   
12            10      MDVP:Fhi(Hz)            4        0.845588   
13             9              None      

In [6]:
# Calculate the mean, median, and mode for each feature in the dataset
feature_stats = pd.DataFrame({
    'Feature': X.columns,
    'Mean': X.mean(),
    'Median': X.median(),
    'Mode': X.mode().iloc[0],  # Mode returns a DataFrame; we take the first row for each feature
    'Rank': rfe.ranking_  # Use the RFE rankings obtained earlier
})

# Determine the appropriate default value for each feature
def recommend_default(row):
    # For symmetric distributions, mean is a good default
    if abs(row['Mean'] - row['Median']) < 0.1 * row['Mean']:
        return row['Mean']
    # For skewed distributions, use median
    elif abs(row['Mean'] - row['Median']) >= 0.1 * row['Mean']:
        return row['Median']
    # For categorical-like features, use mode
    else:
        return row['Mode']

# Apply the recommendation function to determine default values
feature_stats['Recommended_Default'] = feature_stats.apply(recommend_default, axis=1)

# Sort by rank for readability
feature_stats = feature_stats.sort_values(by='Rank')

# Display the resulting DataFrame
print("Feature Statistics with Recommended Defaults:\n",
      feature_stats[['Feature', 'Mean', 'Median', 'Mode', 'Rank', 'Recommended_Default']])


Feature Statistics with Recommended Defaults:
                            Feature        Mean      Median        Mode  Rank  \
PPE                            PPE    0.206552    0.194052    0.044539     1   
spread2                    spread2    0.226510    0.218885    0.210279     1   
spread1                    spread1   -5.684397   -5.720868   -7.964984     1   
RPDE                          RPDE    0.498536    0.495954    0.256570     1   
NHR                            NHR    0.024847    0.011660    0.002310     1   
MDVP:APQ                  MDVP:APQ    0.024081    0.018260    0.009030     1   
Shimmer:APQ5          Shimmer:APQ5    0.017878    0.013470    0.007470     1   
D2                              D2    2.381826    2.361532    1.423287     1   
MDVP:Shimmer          MDVP:Shimmer    0.029709    0.022970    0.014500     1   
MDVP:Shimmer(dB)  MDVP:Shimmer(dB)    0.282251    0.221000    0.154000     1   
MDVP:Fo(Hz)            MDVP:Fo(Hz)  154.228641  148.790000   88.333000   