<a href="https://colab.research.google.com/github/rayasrujanareddy/ML-FS-DIABETES-/blob/main/Feature_Selection(Diabetes)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Missing Values Ratio

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the diabetes dataset
file_path = '/content/diabetes.csv'
data = pd.read_csv(file_path)

# Step 1: Percentage of missing (NaN) entries in every column.
# isnull().mean() is the fraction of rows that are NaN; * 100 turns it into a percentage
# so it is on the same scale as `threshold` below.
# NOTE(review): only NaN counts as missing here. If this CSV encodes missing
# measurements as 0, they are not detected by this step - confirm against the data.
missing_percentage = data.isnull().mean() * 100

# Step 2: Identify features with more than 30% missing values
threshold = 30
features_to_remove = missing_percentage[missing_percentage > threshold].index

# Step 3: Drop those features (DataFrame.drop takes the labels via `columns=`)
reduced_data = data.drop(columns=features_to_remove)

print("\nFeatures Removed (more than 30% missing):", features_to_remove)
print("Dataset Shape after Features Removal:", reduced_data.shape)


Features Removed (more than 30% missing): Index([], dtype='object')
Dataset Shape after Features Removal: (768, 9)


## High Correlation Filter

In [34]:
import pandas as pd
# Import numpy
import numpy as np # This line imports the numpy library and assigns it to the alias 'np'

# Step 1: Compute the correlation matrix
correlation_matrix = data.corr()

# Only predictor-vs-predictor correlations matter for this filter, so the
# 'Outcome' target is excluded. Absolute values are used so that strong
# negative correlations are caught as well as strong positive ones.
feature_correlation = correlation_matrix.drop(index='Outcome', columns='Outcome').abs()

# Step 2: Identify highly correlated feature pairs (|correlation| > 0.8)
threshold = 0.8
row_idx, col_idx = np.where(feature_correlation.to_numpy() > threshold)

# The matrix is symmetric with ones on the diagonal. Keeping only positions
# above the diagonal (row < column) skips self-correlations and counts each
# pair exactly once, while still catching perfectly correlated duplicates.
above_diagonal = row_idx < col_idx
# Positions index into `feature_correlation`, not into `correlation_matrix`.
highly_correlated_pairs = (row_idx[above_diagonal], col_idx[above_diagonal])

# Create a set to store features to remove
features_to_remove = set()

for i, j in zip(*highly_correlated_pairs):
    # Keep the first feature of the pair and drop the second one
    features_to_remove.add(feature_correlation.columns[j])

# Sorted so the printed list is stable from run to run
correlated_features = sorted(features_to_remove)
print("Highly Correlated Feature Pairs (correlation > 0.8):", correlated_features)

Highly Correlated Feature Pairs (correlation > 0.8): []


## Low Variance Filter


In [35]:
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Step 1: Set the threshold for low variance (e.g., < 0.01)
threshold = 0.01
var_filter = VarianceThreshold(threshold=threshold)

# Step 2: Fit and transform the feature columns only (the 'Outcome' target is excluded)
feature_data = data.drop('Outcome', axis=1)
X_filtered = var_filter.fit_transform(feature_data)

# Variance of each feature as computed by the fitted filter
variance = var_filter.variances_

# Step 3: Take the removed features from the filter's own keep-mask rather than
# re-deriving them with a separate comparison, so the reported names always
# match the columns that were actually dropped from X_filtered.
low_variance_features = feature_data.columns[~var_filter.get_support()]

# Drop from the original dataframe; 'Outcome' is kept in reduced_data
reduced_data = data.drop(columns=low_variance_features)

print(f"Features removed due to low variance: {low_variance_features}")


Features removed due to low variance: Index([], dtype='object')


## Forward Feature Selection

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector

# Read the diabetes data from disk
data = pd.read_csv('/content/diabetes.csv')

# 'Outcome' is the target; every other column is used as a predictor
y = data['Outcome']
X = data.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Logistic Regression is the estimator evaluated at every selection step
model = LogisticRegression(max_iter=1000)

# Forward selection: features are added one at a time, scored by 5-fold
# cross-validated accuracy on the training split. 'auto' leaves the number
# of features to keep up to the selector.
selector = SequentialFeatureSelector(
    model,
    direction='forward',
    n_features_to_select='auto',
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Names of the columns the selector kept
selected_features = X_train.columns[selector.get_support()]

print("Selected Features From Forward Selection:", selected_features)

# Reduce both splits to the selected columns
X_train_selected, X_test_selected = (
    selector.transform(split) for split in (X_train, X_test)
)

# Refit on the reduced training set, then score on the held-out rows
y_pred = model.fit(X_train_selected, y_train).predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)

print("Model Accuracy after Forward Feature Selection:", accuracy)



Selected Features From Forward Selection: Index(['Pregnancies', 'Glucose', 'BMI', 'DiabetesPedigreeFunction'], dtype='object')
Model Accuracy after Forward Feature Selection: 0.7727272727272727


## Backward Feature Elimination

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE

# Load the diabetes dataset
data = pd.read_csv('/content/diabetes.csv')

# Separate features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Step 1: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Create a Decision Tree classifier
# random_state is fixed (same seed as the train/test split) so that the
# eliminated features and the reported accuracy are reproducible on re-run.
model = DecisionTreeClassifier(random_state=42)

# Step 3: Apply Backward Feature Elimination using Recursive Feature Elimination (RFE)
# n_features_to_select=1 with step=1 drops one feature per iteration until a
# single feature is left, which also yields a complete ranking of all features.
selector = RFE(model, n_features_to_select=1, step=1)
selector = selector.fit(X_train, y_train)

# Step 4: Get the ranking of features (rank 1 marks the selected feature)
ranking = selector.ranking_
selected_features = X_train.columns[selector.support_]

print("Selected Features after Backward Elimination:", selected_features)

# Step 5: Train the model using only the selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

model.fit(X_train_selected, y_train)

# Step 6: Predict and calculate accuracy
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)

print("Model Accuracy after Backward Feature Elimination:", accuracy)


Selected Features after Backward Elimination: Index(['Glucose'], dtype='object')
Model Accuracy after Backward Feature Elimination: 0.6688311688311688


## Random Forest

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the diabetes data from disk
data = pd.read_csv('/content/diabetes.csv')

# 'Outcome' is the target; all remaining columns are predictors
y = data['Outcome']
X = data.drop(columns='Outcome')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a seeded Random Forest on every feature to obtain importance scores
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
feature_importances = rf_model.feature_importances_

# Label each importance with its column name and order from most to least important
feature_ranking = (
    pd.Series(feature_importances, index=X.columns)
    .sort_values(ascending=False)
)
print("Feature importance ranking:\n", feature_ranking)

# Keep the five highest-ranked features in both splits
top_5_features = feature_ranking.head(5).index
X_train_reduced = X_train.loc[:, top_5_features]
X_test_reduced = X_test.loc[:, top_5_features]

# Train a second seeded forest that only sees the reduced feature set
rf_model_reduced = RandomForestClassifier(random_state=42).fit(X_train_reduced, y_train)

# Score the reduced model on the held-out rows
y_pred_reduced = rf_model_reduced.predict(X_test_reduced)
accuracy_reduced = accuracy_score(y_test, y_pred_reduced)

print("Model accuracy with top 5 features:", accuracy_reduced)


Feature importance ranking:
 Glucose                     0.258864
BMI                         0.169984
Age                         0.140931
DiabetesPedigreeFunction    0.123768
BloodPressure               0.088134
Pregnancies                 0.076551
Insulin                     0.076122
SkinThickness               0.065646
dtype: float64
Model accuracy with top 5 features: 0.7792207792207793
