In [None]:
import os
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

def compute_feature_importance(file_path, task='regression', k='all'):
    """Rank the numeric features of a CSV by univariate F-score against the target.

    The target is the last column of the file. Every other numeric column
    (except 'PCOS (Y/N)', which is always excluded) is standardised, scored
    with ``f_regression`` and rescaled so that the best feature scores 50.
    The ranking is printed and written next to the input file as
    ``<input stem>_feature_importances.csv``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to analyse.
    task : str, default 'regression'
        Accepted for call compatibility only; it is not used, ``f_regression``
        is always applied.
    k : int or 'all', default 'all'
        Forwarded to ``SelectKBest``. Scores are reported for every feature
        regardless of ``k``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature', 'Importance Score' and 'Rank', best feature first.

    Raises
    ------
    ValueError
        If the file has no numeric feature column besides the target.
    """
    df = pd.read_csv(file_path)

    # The target is the last column. It must be removed from the features as
    # well, otherwise it is scored against itself whenever it is not
    # 'PCOS (Y/N)'.
    target_column = df.columns[-1]
    y = df.iloc[:, -1]
    X = df.drop(columns=['PCOS (Y/N)', target_column], errors='ignore')

    # Only numeric columns can be scored
    X_numeric = X.select_dtypes(include=['number'])
    if X_numeric.shape[1] == 0:
        raise ValueError(f"No numeric feature columns found in {file_path}.")

    # Scale the numeric features
    scaler = StandardScaler()
    X_numeric_scaled = scaler.fit_transform(X_numeric)

    # Apply SelectKBest for regression to score every feature
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X_numeric_scaled, y)

    scores = pd.Series(selector.scores_, dtype=float)

    # Rescale so the best feature scores 50. Series.max() ignores NaN, so one
    # undefined score no longer turns every score into NaN; if there is no
    # positive maximum the raw scores are kept instead of dividing by zero.
    max_score = 50
    top_score = scores.max()
    if pd.notna(top_score) and top_score > 0:
        scores = (scores / top_score) * max_score

    # Create a DataFrame with features and their scaled importance scores
    feature_importance_df = pd.DataFrame({
        'Feature': X_numeric.columns,
        'Importance Score': scores.to_numpy()
    }).sort_values(by='Importance Score', ascending=False)

    # Rank 1 is the most important feature (rows are already sorted)
    feature_importance_df['Rank'] = range(1, len(feature_importance_df) + 1)

    # Display the feature importance scores and ranks
    print(f"\nFeature importance scores and ranks for {file_path}:")
    print(feature_importance_df[['Feature', 'Importance Score', 'Rank']])

    # splitext strips only the final extension, so './data/x.csv' and
    # 'a.b.csv' keep their directory and stem.
    output_file = os.path.splitext(file_path)[0] + '_feature_importances.csv'
    feature_importance_df.to_csv(output_file, index=False)
    print(f"Feature importance scores saved to: {output_file}\n")

    return feature_importance_df

def process_multiple_csvs(csv_files, task='regression'):
    """Run compute_feature_importance on every file in ``csv_files``.

    Paths that do not exist are reported and skipped. An error raised while
    processing one file is printed and does not stop the remaining files.
    """
    for file_path in csv_files:
        # Guard clause: nothing to do for a path that is not on disk
        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist or was deleted, skipping it.")
            continue

        try:
            compute_feature_importance(file_path, task)
        except ValueError as e:
            print(f"Error processing {file_path}: {e}")
        except Exception as e:
            print(f"Unexpected error with {file_path}: {e}")

# Ensemble feature-selection outputs whose features should be scored
csv_files = ['ensemble_output.csv', 'ensemble_output1.csv', 'ensemble_output2.csv']

# Score every listed file that exists on disk; missing files are skipped
process_multiple_csvs(csv_files, task='regression')



Feature importance scores and ranks for ensemble_output.csv:
                   Feature  Importance Score  Rank
5         Follicle No. (R)         50.000000     1
21        Follicle No. (L)         39.469209     2
1     Skin darkening (Y/N)         20.171147     3
7         hair growth(Y/N)         18.988215     4
12        Weight gain(Y/N)         16.652569     5
2               Cycle(R/I)         13.263216     6
19         Fast food (Y/N)         11.546189     7
0             Pimples(Y/N)          6.146258     8
8              Weight (Kg)          3.242956     9
17      Cycle length(days)          2.268837    10
14          Hair loss(Y/N)          2.124340    11
20               Hip(inch)          1.865429    12
16        Endometrium (mm)          0.793304    13
4           Vit D3 (ng/mL)          0.507721    14
9                   FSH/LH          0.283905    15
3               LH(mIU/mL)          0.282524    16
18              PRG(ng/mL)          0.132747    17
15             FSH(m

In [None]:
import pandas as pd
from scipy.stats import pearsonr
import numpy as np

def scale_scores(scores, new_min=1, new_max=50):
    """Linearly rescale ``scores`` to the range [new_min, new_max].

    Parameters
    ----------
    scores : iterable of numbers
        Values to rescale (a list or a pandas Series, for example).
    new_min, new_max : number
        Bounds of the target range.

    Returns
    -------
    list
        Rescaled values in the input order. An empty input gives an empty
        list. If all inputs are equal there is no spread to map, so every
        value is set to ``new_min`` instead of dividing by zero.
    """
    values = list(scores)
    if not values:
        return []

    min_score = min(values)
    max_score = max(values)
    span = max_score - min_score

    # Constant input: a min-max mapping is undefined (0 / 0)
    if span == 0:
        return [new_min for _ in values]

    return [
        new_min + (value - min_score) * (new_max - new_min) / span
        for value in values
    ]

def compute_pearson_scores(file_path, target_column=None):
    """Rank the numeric features of a CSV by Pearson correlation with the target.

    Rows containing NaN or +/-inf are dropped, the correlation of every
    numeric feature with the target is computed, and the features are ranked
    by absolute correlation. The reported 'Pearson Score' is that absolute
    correlation rescaled with ``scale_scores``, so score and rank always
    agree. The result is printed and written next to the input file as
    ``<input stem>_pearson_scores_and_ranks.csv``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to analyse.
    target_column : str, optional
        Name of the target column; defaults to the last column of the file.

    Raises
    ------
    ValueError
        If no correlation can be computed (no numeric feature, or the target
        is not numeric).
    """
    # Local import: this cell does not import os at the top
    import os

    df = pd.read_csv(file_path)

    # If no target column is specified, assume the last column is the target
    if target_column is None:
        target_column = df.columns[-1]

    # Separate features (X) and the target (y)
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Treat +/-inf as missing, then drop every row with a missing value
    X = X.replace([np.inf, -np.inf], np.nan)
    df_clean = X.join(y).dropna()

    X_clean = df_clean.drop(target_column, axis=1)
    y_clean = df_clean[target_column]

    # Correlate each numeric feature with the (numeric) target
    target_is_numeric = pd.api.types.is_numeric_dtype(y_clean)
    pearson_scores = []
    for col in X_clean.columns:
        if target_is_numeric and pd.api.types.is_numeric_dtype(X_clean[col]):
            corr, _ = pearsonr(X_clean[col], y_clean)
            pearson_scores.append((col, corr))

    if not pearson_scores:
        raise ValueError(
            f"No numeric feature/target pair to correlate in {file_path}."
        )

    pearson_df = pd.DataFrame(pearson_scores, columns=['Feature', 'Pearson Score'])

    # Strength of association is |r|: sort by it, strongest first
    pearson_df['Abs Pearson Score'] = pearson_df['Pearson Score'].abs()
    pearson_df = pearson_df.sort_values(by='Abs Pearson Score', ascending=False)

    # Ranks run from 1 (strongest correlation) to the number of features
    pearson_df['Rank'] = range(1, len(pearson_df) + 1)

    # Rescale the absolute correlation, the same quantity the rank is based
    # on. Scaling the signed value gave strongly negative correlations the
    # lowest score despite a high rank.
    pearson_df['Pearson Score'] = scale_scores(pearson_df['Abs Pearson Score'])

    # Keep only the reported columns (the helper |r| column is left out)
    pearson_results = pearson_df[['Feature', 'Pearson Score', 'Rank']]

    # Display the ranked features with assigned ranks and scores
    print(f"Feature scores and ranks for {file_path}:")
    print(pearson_results)

    # splitext strips only the final extension, so './data/x.csv' and
    # 'a.b.csv' keep their directory and stem.
    output_file = os.path.splitext(file_path)[0] + '_pearson_scores_and_ranks.csv'
    pearson_results.to_csv(output_file, index=False)
    print(f"Pearson scores and ranks saved to: {output_file}")

# Rank the features of a single CSV; its last column is used as the target.
# Point file_path at the CSV to analyse.
file_path = 'pearson_output.csv'
compute_pearson_scores(file_path=file_path)


Feature scores and ranks for pearson_output.csv:
                  Feature  Pearson Score  Rank
13         Hair loss(Y/N)      50.000000     1
15        Fast food (Y/N)      45.308889     2
0              PCOS (Y/N)      40.864995     3
6   Marraige Status (Yrs)       1.000000     4
9              AMH(ng/mL)      40.676697     5
1               Age (yrs)       1.569185     6
12   Skin darkening (Y/N)      40.063230     7
17       Follicle No. (R)      36.058567     8
16       Follicle No. (L)      35.828561     9
11       hair growth(Y/N)      31.309036    10
4              Cycle(R/I)      28.915568    11
10       Weight gain(Y/N)      27.896623    12
14           Pimples(Y/N)      27.808605    13
7               Hip(inch)      26.185352    14
3                     BMI      16.215744    15
5      Cycle length(days)      17.783585    16
18   Avg. F size (L) (mm)      18.226026    17
8             Waist(inch)      23.345590    18
2             Weight (Kg)      18.841190    19
Pearson sco

In [None]:
import os
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np

def compute_feature_importance(file_path, task='regression', k='all'):
    """Rank the numeric features of a CSV by raw univariate F-score.

    The target is the last column of the file. Every other numeric column
    (except 'PCOS (Y/N)', which is always excluded) is scored with
    ``f_regression``. The ranking is printed with one decimal place and
    written next to the input file as ``<input stem>_feature_importances.csv``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to analyse.
    task : str, default 'regression'
        Accepted for call compatibility only; it is not used, ``f_regression``
        is always applied.
    k : int or 'all', default 'all'
        Forwarded to ``SelectKBest``. Scores are reported for every feature
        regardless of ``k``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature', 'Importance Score' and 'Rank', best feature first.

    Raises
    ------
    ValueError
        If the file has no numeric feature column besides the target.
    """
    df = pd.read_csv(file_path)

    # The target is the last column. It must be removed from the features as
    # well, otherwise it is scored against itself whenever it is not
    # 'PCOS (Y/N)'.
    target_column = df.columns[-1]
    y = df.iloc[:, -1]
    X = df.drop(columns=['PCOS (Y/N)', target_column], errors='ignore')

    # Only numeric columns can be scored
    X_numeric = X.select_dtypes(include=['number'])
    if X_numeric.shape[1] == 0:
        raise ValueError(f"No numeric feature columns found in {file_path}.")

    # Apply SelectKBest for regression to score every feature
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X_numeric, y)

    scores = selector.scores_

    # Create a DataFrame with features and their raw importance scores
    feature_importance_df = pd.DataFrame({
        'Feature': X_numeric.columns,
        'Importance Score': scores
    }).sort_values(by='Importance Score', ascending=False)

    # Cap the two best scores: the first becomes 60 and the second 55, each
    # only when its raw score exceeds 60.
    # NOTE(review): only the top two rows are capped, so a third feature with
    # a raw score above 60 ends up with a higher reported score than rank 1.
    # Confirm whether this cap is intended before relying on the saved scores.
    for label, capped_value in zip(feature_importance_df.index[:2], (60, 55)):
        if feature_importance_df.at[label, 'Importance Score'] > 60:
            feature_importance_df.at[label, 'Importance Score'] = capped_value

    # Rank 1 is the most important feature (rows are already sorted)
    feature_importance_df['Rank'] = range(1, len(feature_importance_df) + 1)

    # Format floats for this table only. Assigning pd.options.display.float_format
    # here would silently change how every later cell prints its numbers.
    print(f"\nFeature importance scores and ranks for {file_path}:")
    print(
        feature_importance_df[['Feature', 'Importance Score', 'Rank']]
        .to_string(index=False, float_format='{:.1f}'.format)
    )

    # splitext strips only the final extension, so './data/x.csv' and
    # 'a.b.csv' keep their directory and stem.
    output_file = os.path.splitext(file_path)[0] + '_feature_importances.csv'
    feature_importance_df.to_csv(output_file, index=False)
    print(f"Feature importance scores saved to: {output_file}\n")

    return feature_importance_df

def process_multiple_csvs(csv_files, task='regression'):
    """Run compute_feature_importance on every file in ``csv_files``.

    Paths that do not exist are reported and skipped. An error raised while
    processing one file is printed and does not stop the remaining files.
    """
    for file_path in csv_files:
        # Guard clause: nothing to do for a path that is not on disk
        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist or was deleted, skipping it.")
            continue

        try:
            compute_feature_importance(file_path, task)
        except ValueError as e:
            print(f"Error processing {file_path}: {e}")
        except Exception as e:
            print(f"Unexpected error with {file_path}: {e}")

# Outputs of the individual feature-selection methods to be scored
csv_files = [
    'chi_square_columns.csv',
    'mrmr_filtered_output.csv',
    'relief_output.csv',
]

# Score every listed file that exists on disk; missing files are skipped
process_multiple_csvs(csv_files, task='regression')



Feature importance scores and ranks for chi_square_columns.csv:
             Feature  Importance Score  Rank
    Follicle No. (R)              60.0     1
    Follicle No. (L)              55.0     2
Skin darkening (Y/N)              62.1     3
    hair growth(Y/N)              43.5     4
    Weight gain(Y/N)              37.7     5
          Cycle(R/I)              36.4     6
     Fast food (Y/N)              35.4     7
        Pimples(Y/N)              22.7     8
  Cycle length(days)              14.4     9
           Age (yrs)              14.1    10
           Hip(inch)               4.9    11
         Waist(inch)               4.8    12
      Hair loss(Y/N)               1.9    13
Feature importance scores saved to: chi_square_columns_feature_importances.csv


Feature importance scores and ranks for mrmr_filtered_output.csv:
               Feature  Importance Score  Rank
      Follicle No. (L)              60.0     1
      Follicle No. (R)              55.0     2
  Skin darkening 

In [None]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
import numpy as np

def stability_assessment(file_path, score_column=None, n_splits=10,
                         n_features_to_select=5, random_state=42):
    """Run RFE with a random forest on K folds and report per-feature scores.

    The CSV is one-hot encoded, the score column is used as the regression
    target, and for every fold RFE selects ``n_features_to_select`` columns.
    The per-fold scores are printed and their mean per feature is returned.

    Parameters
    ----------
    file_path : str
        Path of a CSV produced by one of the scoring cells.
    score_column : str, optional
        Name of the target column. When omitted, 'Importance Score' is used
        if present, otherwise 'Pearson Score' (the column written by the
        Pearson scoring cell).
    n_splits : int, default 10
        Number of folds; reduced to the number of rows when the file has
        fewer rows than folds.
    n_features_to_select : int, default 5
        Number of columns RFE keeps in each fold.
    random_state : int, default 42
        Seed for both the fold shuffle and the random forest.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Stability Score' (mean over folds).

    Raises
    ------
    ValueError
        If the file contains no usable score column.
    """
    df = pd.read_csv(file_path)

    # Convert categorical columns to numeric if necessary
    df = pd.get_dummies(df, drop_first=True)

    # Resolve the target column; the Pearson output names it differently
    if score_column is None:
        candidates = ('Importance Score', 'Pearson Score')
    else:
        candidates = (score_column,)
    target_name = next((name for name in candidates if name in df.columns), None)
    if target_name is None:
        expected = " or ".join(repr(name) for name in candidates)
        raise ValueError(f"The file {file_path} must contain an {expected} column.")

    # Split features and target
    X = df.drop(columns=[target_name])
    y = df[target_name]

    # Filter out unwanted features (like those starting with "Unnamed")
    X = X.loc[:, ~X.columns.str.contains('^Unnamed')]

    # KFold cannot make more folds than there are rows
    kf = KFold(n_splits=min(n_splits, len(X)), shuffle=True, random_state=random_state)

    stability_scores_list = []

    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        model = RandomForestRegressor(n_estimators=200, random_state=random_state)
        rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
        rfe.fit(X_train, y_train)

        # Boolean mask of the columns RFE kept in this fold
        fold_scores = pd.DataFrame({
            'Feature': X.columns,
            'Selected': rfe.support_
        })

        # NOTE(review): the selected flag is divided by the number of training
        # rows, not by the number of folds, so the value is not a selection
        # frequency and depends on the file's row count. Kept unchanged;
        # confirm the intended definition.
        fold_scores['Stability Score'] = fold_scores['Selected'] / len(train_index)

        stability_scores_list.append(fold_scores[['Feature', 'Stability Score']])

        # Display fold scores
        print(f"Stability Scores for Fold {fold + 1} for {file_path}:")
        print(fold_scores[['Feature', 'Stability Score']])  # Show all scores
        print("\n")

    # Stack all folds, then average per feature
    stability_scores_df = pd.concat(stability_scores_list, ignore_index=True)
    overall_stability_scores = stability_scores_df.groupby('Feature', as_index=False)['Stability Score'].mean()

    return overall_stability_scores

def evaluate_stability(csv_files):
    """Compare files by the spread of their per-feature stability scores.

    ``stability_assessment`` is run on every file, the standard deviation of
    the resulting 'Stability Score' column is recorded, and the file with the
    smallest deviation is reported as the most stable.

    A file that is missing or is rejected with ``ValueError`` (for example
    because it has no score column) is reported and skipped, so one bad file
    does not abort the comparison. If no file can be assessed, a message is
    printed and nothing is compared.
    """
    stability_variations = {}

    for file_path in csv_files:
        try:
            overall_stability_scores = stability_assessment(file_path)
        except (FileNotFoundError, ValueError) as e:
            print(f"Error processing {file_path}: {e}")
            continue
        stability_variations[file_path] = overall_stability_scores['Stability Score'].std()

    # min() raises on an empty mapping, so stop early when nothing was assessed
    if not stability_variations:
        print("No files could be assessed; nothing to compare.")
        return

    # Find the file with the least variation in stability scores
    most_stable_file = min(stability_variations, key=stability_variations.get)

    print("\nStability variations for each file:")
    for file, variation in stability_variations.items():
        print(f"{file}: {variation:.4f}")

    print(f"\nThe most stable file is: {most_stable_file}")

# Score files written by the earlier cells, one per feature-selection method
csv_files = [
    'chi_square_columns_feature_importances.csv',
    'mrmr_filtered_output_feature_importances.csv',
    'pearson_output_pearson_scores_and_ranks.csv',
    'relief_output_feature_importances.csv',
    'ensemble_output_feature_importances.csv',
    'ensemble_output1_feature_importances.csv',
    'ensemble_output2_feature_importances.csv',
]

# Compare the stability of all listed files and report the most stable one
evaluate_stability(csv_files)


Stability Scores for Fold 1 for chi_square_columns_feature_importances.csv:
                         Feature  Stability Score
0                           Rank             0.09
1     Feature_Cycle length(days)             0.00
2             Feature_Cycle(R/I)             0.00
3        Feature_Fast food (Y/N)             0.00
4       Feature_Follicle No. (L)             0.00
5       Feature_Follicle No. (R)             0.00
6         Feature_Hair loss(Y/N)             0.09
7              Feature_Hip(inch)             0.09
8           Feature_Pimples(Y/N)             0.09
9   Feature_Skin darkening (Y/N)             0.09
10           Feature_Waist(inch)             0.00
11      Feature_Weight gain(Y/N)             0.00
12      Feature_hair growth(Y/N)             0.00


Stability Scores for Fold 2 for chi_square_columns_feature_importances.csv:
                         Feature  Stability Score
0                           Rank             0.09
1     Feature_Cycle length(days)             0

In [None]:
!pip install pymrmr

Collecting pymrmr
  Downloading pymrmr-0.1.11.tar.gz (69 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/69.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.5/69.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pymrmr
  Building wheel for pymrmr (setup.py) ... [?25l[?25hdone
  Created wheel for pymrmr: filename=pymrmr-0.1.11-cp310-cp310-linux_x86_64.whl size=390762 sha256=93cbdd59a11a925492f778005937c93376e3fbb10bde73517a0b5ae763ea20c7
  Stored in directory: /root/.cache/pip/wheels/46/ae/55/4a2479c5f0de7eb363fe970cb18e4a750e03e4e63b1b5c2005
Successfully built pymrmr
Installing collected packages: pymrmr
Successfully installed pymrmr-0.1.11


In [None]:
import pandas as pd
import pymrmr

def mrmr_feature_filter(input_csv, target_column, num_features=10, output_csv='mrmr_filtered_output.csv'):
    """Keep the target plus the features selected by mRMR and save them to a CSV.

    Features are coerced to numeric, columns without a single numeric value
    are excluded from the selection, remaining gaps are filled with the
    column mean, and ``pymrmr.mRMR`` (method 'MIQ') picks ``num_features``
    features. A summary of kept and dropped columns is printed and the
    original data restricted to the kept columns is written to ``output_csv``.

    Parameters
    ----------
    input_csv : str
        Path of the dataset to filter.
    target_column : str
        Name of the target column; it is always kept in the output.
    num_features : int, default 10
        Number of features requested from mRMR.
    output_csv : str, default 'mrmr_filtered_output.csv'
        Path of the CSV file to write.

    Raises
    ------
    ValueError
        If ``target_column`` is missing or no feature has numeric values.
    RuntimeError
        If the mRMR selection itself fails (the original error is chained).
    """
    df = pd.read_csv(input_csv)

    # Check if the target column exists in the dataset
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' does not exist in the dataset.")

    # Separate the target column from the features and force features to numeric
    features = df.drop(columns=[target_column]).apply(pd.to_numeric, errors='coerce')
    target = df[target_column]

    # A column with no numeric value at all has a NaN mean, so mean imputation
    # cannot repair it; leave it out instead of passing NaNs to mRMR.
    features = features.dropna(axis=1, how='all')
    if features.shape[1] == 0:
        raise ValueError("No numeric feature columns are available for mRMR selection.")
    features = features.fillna(features.mean())

    # mRMR input: target first, then the features
    # NOTE(review): pymrmr presumably expects discretised inputs; the values
    # are passed as they are here. Confirm against the pymrmr documentation.
    df_combined = pd.concat([target, features], axis=1)

    # Perform mRMR feature selection
    try:
        selected_features = list(pymrmr.mRMR(df_combined, 'MIQ', num_features))
    except Exception as e:
        raise RuntimeError(f"Error during mRMR feature selection: {e}") from e

    # Ensure the target column is included in the selected features
    if target_column not in selected_features:
        selected_features.insert(0, target_column)

    # Everything that was not selected
    irrelevant_columns = [col for col in df.columns if col not in selected_features]

    # Display the results
    print("\n" + "="*50)
    print(f"Total columns in the dataset: {len(df.columns)}")
    print("\nRelevant columns:")
    for col in selected_features:
        print(f"  - {col}")
    print(f"\nNumber of relevant columns: {len(selected_features)}")
    print("\nIrrelevant columns:")
    for col in irrelevant_columns:
        print(f"  - {col}")
    print(f"\nNumber of irrelevant columns: {len(irrelevant_columns)}")
    print("="*50 + "\n")

    # Filter the original dataframe to keep only relevant columns
    filtered_df = df[selected_features]

    # Save the filtered dataframe to a new CSV file
    filtered_df.to_csv(output_csv, index=False)
    print(f"Filtered data has been saved to '{output_csv}'")


# Select 22 features for the PCOS target and write the reduced dataset
mrmr_feature_filter(
    'ss.csv',
    'PCOS (Y/N)',
    num_features=22,
    output_csv='mrmr_features2.csv',
)



Total columns in the dataset: 42

Relevant columns:
  - PCOS (Y/N)
  -   I   beta-HCG(mIU/mL)
  - FSH/LH
  - hair growth(Y/N)
  - Skin darkening (Y/N)
  - Weight gain(Y/N)
  - Cycle(R/I)
  - Follicle No. (R)
  - Fast food (Y/N)
  - Pimples(Y/N)
  - Follicle No. (L)
  - Cycle length(days)
  - II    beta-HCG(mIU/mL)
  - Hair loss(Y/N)
  - AMH(ng/mL)
  - PRG(ng/mL)
  - Waist(inch)
  - Vit D3 (ng/mL)
  - Weight (Kg)
  - PRL(ng/mL)
  - Hip(inch)
  -  Age (yrs)
  - Avg. F size (L) (mm)

Number of relevant columns: 23

Irrelevant columns:
  - Height(Cm) 
  - BMI
  - Blood Group
  - Pulse rate(bpm) 
  - RR (breaths/min)
  - Hb(g/dl)
  - Marraige Status (Yrs)
  - Pregnant(Y/N)
  - No. of aborptions
  - FSH(mIU/mL)
  - LH(mIU/mL)
  - Waist:Hip Ratio
  - TSH (mIU/L)
  - RBS(mg/dl)
  - Reg.Exercise(Y/N)
  - BP _Systolic (mmHg)
  - BP _Diastolic (mmHg)
  - Avg. F size (R) (mm)
  - Endometrium (mm)

Number of irrelevant columns: 19

Filtered data has been saved to 'mrmr_features2.csv'
