<a href="https://colab.research.google.com/github/rayasrujanareddy/ML-DIABETETES-AND-MELBOURNE-/blob/main/Feature_Selection(Melbourne_Housing).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Missing Values Ratio

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold
import numpy as np

# Missing Values Ratio: drop every feature whose share of missing values
# exceeds a threshold, then fit a baseline Linear Regression on what remains
# and report its test-set MSE.

# Load the Melbourne housing dataset
file_path = '/content/melbourne_housing_raw.csv'
data = pd.read_csv(file_path)

# Step 1: Percentage of missing values for each column
missing_percentage = data.isnull().mean() * 100

# Step 2: Remove features with more than 20% missing values.
# 'Price' is the target, so it is never a candidate for removal even when its
# own missing ratio exceeds the threshold.
threshold = 20
Features_to_remove = missing_percentage[
    (missing_percentage > threshold) & (missing_percentage.index != 'Price')
].index
reduced_data = data.drop(columns=Features_to_remove)

print("\nFeatures Removed (more than 20% missing):", Features_to_remove)
print("Dataset Shape after Feature Removal:", reduced_data.shape)

# Step 3: Train a Linear Regression model to predict 'Price'.
# Rows that still contain any missing value (including a missing 'Price') are
# dropped. Reassigning instead of using inplace=True keeps the cell re-runnable.
reduced_data = reduced_data.dropna()
X = reduced_data.drop(columns=['Price'])
y = reduced_data['Price']

# One-hot encode the remaining categorical columns so the model gets numbers.
# NOTE(review): any high-cardinality text column still present here expands
# into one dummy per distinct value -- confirm the resulting width is intended.
X = pd.get_dummies(X, drop_first=True)

# Split dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate; the score is printed so this section can be compared
# with the other feature-selection sections.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error after removing features with high missing ratio:", mse)




Features Removed (more than 20% missing): Index(['Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt',
       'Lattitude', 'Longtitude'],
      dtype='object')
Dataset Shape after Feature Removal: (34857, 12)


## High Correlation Filter

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# High Correlation Filter: among numeric features, drop one feature out of
# every pair whose absolute Pearson correlation exceeds 0.85, then fit a
# Linear Regression on the remaining features and report its test-set MSE.

# Load the Melbourne housing dataset
file_path = '/content/melbourne_housing_raw.csv'
data = pd.read_csv(file_path)

# Step 1: Display the initial (unfiltered) dataset shape
print(f"Initial dataset shape: {data.shape}")

# Step 2: Drop rows with a missing target value ('Price' is the target)
data = data.dropna(subset=['Price'])

# Step 3: Keep only numeric columns for the correlation analysis
numeric_data = data.select_dtypes(include=[np.number])

# Step 4: Correlation matrix over the FEATURES only.
# The target is left out on purpose: a feature that is strongly correlated
# with 'Price' is valuable, not redundant, and must not be filtered out --
# and 'Price' itself must never end up in the drop list.
corr_matrix = numeric_data.drop(columns=['Price']).corr().abs()

# Step 5: Identify features with correlation greater than 0.85.
# Only the strict upper triangle is inspected, so each pair is seen once and
# the later column of a correlated pair is the one that gets dropped.
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_triangle.columns if (upper_triangle[column] > 0.85).any()]

print("\nFeatures with high correlation (> 0.85):", to_drop)

# Step 6: Remove highly correlated features
reduced_data = numeric_data.drop(columns=to_drop)

# Step 7: Separate features and target (Price)
X = reduced_data.drop(columns=['Price'])  # Features
y = reduced_data['Price']  # Target

# Drop rows with missing feature values, then realign the target by label so
# X and y keep exactly the same rows.
X = X.dropna()
y = y.loc[X.index]

# Step 8: Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 9: Train a Linear Regression model on the reduced dataset
model = LinearRegression()
model.fit(X_train, y_train)

# Step 10: Predict and evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error after removing features with high correlation: {mse}")

Reduced dataset shape: (34857, 20)

Features with high correlation (> 0.85): ['Bedroom2']
Mean Squared Error after removing features with high correlation: 139263362178.95044


## Low Variance Filter

In [19]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Low Variance Filter: drop features whose variance is below a threshold,
# then fit a Linear Regression and report its test-set MSE.
# This cell does not load data itself: it reuses `X` and `y` (and `pd`) as
# left in the kernel by the preceding High Correlation Filter cell.

# Step 1: One-hot encode categorical variables, if any are present
X = pd.get_dummies(X, drop_first=True)

# Step 2: Split BEFORE selecting features, so the held-out rows never
# influence which columns are kept (avoids leaking test data into selection).
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Remove features with low variance.
# The selector learns the variances from the training rows only; the same
# column mask is then applied to the test rows.
threshold = 0.01  # Variance threshold (depends on each feature's scale)
selector = VarianceThreshold(threshold=threshold)
X_train = selector.fit_transform(X_train_full)
X_test = selector.transform(X_test_full)

# Step 4: Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 5: Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error after removing low-variance features:", mse)


Mean Squared Error after removing low-variance features: 143982083160.5457


## Forward Feature Selection

In [20]:
from sklearn.feature_selection import VarianceThreshold, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Forward Feature Selection: after a low-variance pre-filter, greedily add
# features with SequentialFeatureSelector (forward direction), then fit a
# Linear Regression on the chosen features and report its test-set MSE.
# This cell does not load data itself: it reuses `X` and `y` (and `pd`) as
# left in the kernel by the earlier cells.

# Step 1: One-hot encode categorical variables, if any are present
X = pd.get_dummies(X, drop_first=True)

# Step 2: Split BEFORE any feature selection so the held-out rows never
# influence which columns are kept.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 3: Remove features with low variance (variances learned on train only)
threshold = 0.01  # Variance threshold (depends on each feature's scale)
selector = VarianceThreshold(threshold=threshold)
selector.fit(X_train_full)

# Names of the features that survive the variance filter
original_features = X.columns
selected_features = original_features[selector.get_support()]

# Select by column label instead of converting to a NumPy array and rebuilding
# DataFrames: this keeps the column names and the original row index intact.
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

# Step 4: Forward selection wrapped around a Linear Regression.
# With n_features_to_select='auto' and no `tol`, half of the features are kept.
model = LinearRegression()
sfs = SequentialFeatureSelector(model, n_features_to_select='auto', direction='forward')
sfs.fit(X_train, y_train)

# Get the selected features
selected_features_sfs = X_train.columns[sfs.get_support()]
print("\nSelected features from forward feature selection:", selected_features_sfs)

# Step 5: Train and evaluate with the selected features only
X_train_selected = X_train[selected_features_sfs]
X_test_selected = X_test[selected_features_sfs]

model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
mse = mean_squared_error(y_test, y_pred)
print("\nMean Squared Error with forward selected features:", mse)


Selected features from forward feature selection: Index(['Distance', 'Bathroom', 'BuildingArea', 'YearBuilt', 'Longtitude'], dtype='object')

Mean Squared Error with forward selected features: 161311987518.01767


## Backward Feature Elimination

In [21]:
# Importing necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Backward Feature Elimination driven by Random Forest importances:
# repeatedly discard the least important feature, refit, and print the
# test-set MSE, until a single feature is left.

# Read the raw CSV and keep only rows that have a target value
data = pd.read_csv('/content/melbourne_housing_raw.csv')
data = data.dropna(axis=0, subset=['Price'])

# Target is 'Price'; features are all remaining non-object (numeric) columns
y = data['Price']
X = data.drop(columns=['Price']).select_dtypes(exclude=['object'])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initial fit on every numeric feature
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
feature_importances = model.feature_importances_

# Eliminate one feature per pass; the loop ends once only one column remains,
# so the training frame is never emptied.
while len(X_train.columns) > 1:
    # Importances of the most recently fitted model pick the next victim
    feature_importances = model.feature_importances_
    least_important = X_train.columns[feature_importances.argmin()]

    print(f"Removing feature: {least_important}")
    X_train = X_train.drop(columns=[least_important])
    X_test = X_test.drop(columns=[least_important])

    # Refit on the reduced feature set and score it on the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")

    # Announce the survivor on the final pass
    if len(X_train.columns) == 1:
        print(f"Remaining features: {list(X_train.columns)}")



Removing feature: Bedroom2
Mean Squared Error: 97867133109.50856
Removing feature: Car
Mean Squared Error: 97972656014.63878
Removing feature: Bathroom
Mean Squared Error: 99640140583.48747
Removing feature: YearBuilt
Mean Squared Error: 101623748900.94626
Removing feature: Propertycount
Mean Squared Error: 103897503319.53635
Removing feature: Lattitude
Mean Squared Error: 105570006035.0283
Removing feature: BuildingArea
Mean Squared Error: 115233044858.10635
Removing feature: Longtitude
Mean Squared Error: 122347047183.65295
Removing feature: Landsize
Mean Squared Error: 144426955105.20114
Removing feature: Rooms
Mean Squared Error: 296076986588.5735
Removing feature: Distance
Mean Squared Error: 296509137061.96814
Remaining features: ['Postcode']


## Random Forest


In [22]:
# Importing necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error # Import mean_squared_error

# Random Forest feature importance: rank the numeric features by importance,
# drop the single least important one, refit, and print the resulting MSE.

# Read the raw CSV and keep only rows that have a target value
data = pd.read_csv('/content/melbourne_housing_raw.csv')
data = data.dropna(axis=0, subset=['Price'])

# Target is 'Price'; features are all remaining non-object (numeric) columns
y = data['Price']
X = data.drop(columns=['Price']).select_dtypes(exclude=['object'])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the forest on the full numeric feature set
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# Pair each feature name with its importance score
features = X_train.columns
importances = model.feature_importances_
feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})

# Show the ranking, most important first
print(feature_importance.sort_values(by='Importance', ascending=False))

# The first row of the ascending ranking is the least important feature
ascending_rank = feature_importance.sort_values(by='Importance')
least_important_feature = ascending_rank['Feature'].iloc[0]

# Remove that feature from both splits
columns_to_drop = [least_important_feature]
X_train_reduced = X_train.drop(columns=columns_to_drop)
X_test_reduced = X_test.drop(columns=columns_to_drop)

# Fit a fresh forest (same hyperparameters and seed) on the reduced features
model_reduced = RandomForestRegressor(n_estimators=100, random_state=0)
model_reduced.fit(X_train_reduced, y_train)

# Score the reduced model on the held-out rows
y_pred_reduced = model_reduced.predict(X_test_reduced)
mse_reduced = mean_squared_error(y_test, y_pred_reduced)
print(f"Removed feature: {least_important_feature}, MSE after removal: {mse_reduced}")

          Feature  Importance
1        Distance    0.270842
0           Rooms    0.216239
2        Postcode    0.181843
6        Landsize    0.087393
10     Longtitude    0.057649
7    BuildingArea    0.051040
9       Lattitude    0.039213
11  Propertycount    0.033881
8       YearBuilt    0.027321
4        Bathroom    0.016745
5             Car    0.013324
3        Bedroom2    0.004510
Removed feature: Bedroom2, MSE after removal: 97867133109.50856
