In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, f_regression
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/content/sample_data/melbourne_housing_raw.csv')
threshold = 0.2
missing_ratio = df.isnull().mean()
columns_to_keep = missing_ratio[missing_ratio <= threshold].index
df = df[columns_to_keep]
df = df.dropna()
X = df.drop('Price', axis=1)
y = df['Price']
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
mse_before = mean_squared_error(y_test, y_pred)
print(f'MSE before feature selection: {mse_before}')
corr_matrix = X_train.corr().abs()
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.85)]
X_train = X_train.drop(high_corr_features, axis=1)
X_test = X_test.drop(high_corr_features, axis=1)

lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
mse_after_corr_filter = mean_squared_error(y_test, y_pred)
print(f'MSE after removing highly correlated features: {mse_after_corr_filter}')
variance_threshold = VarianceThreshold(threshold=0.01)
X_train_var = variance_threshold.fit_transform(X_train)
X_test_var = variance_threshold.transform(X_test)

lin_reg.fit(X_train_var, y_train)
y_pred = lin_reg.predict(X_test_var)
mse_after_var_filter = mean_squared_error(y_test, y_pred)
print(f'MSE after removing low variance features: {mse_after_var_filter}')

k = 10
select_k_best = SelectKBest(score_func=f_regression, k=k)
X_train_k_best = select_k_best.fit_transform(X_train, y_train)
X_test_k_best = select_k_best.transform(X_test)


lin_reg.fit(X_train_k_best, y_train)
y_pred = lin_reg.predict(X_test_k_best)
mse_after_forward_selection = mean_squared_error(y_test, y_pred)
print(f'MSE after forward feature selection: {mse_after_forward_selection}')

random_forest = RandomForestRegressor(random_state=42)
rfe = RFE(random_forest, n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)
random_forest.fit(X_train_rfe, y_train)
y_pred = random_forest.predict(X_test_rfe)
mse_after_backward_elimination = mean_squared_error(y_test, y_pred)
print(f'MSE after backward feature elimination: {mse_after_backward_elimination}')
random_forest.fit(X_train, y_train)
feature_importances = random_forest.feature_importances_
top_features = np.argsort(feature_importances)[-10:]
X_train_top_features = X_train.iloc[:, top_features]
X_test_top_features = X_test.iloc[:, top_features]
random_forest.fit(X_train_top_features, y_train)
y_pred = random_forest.predict(X_test_top_features)
mse_after_top_features = mean_squared_error(y_test, y_pred)
print(f'MSE after keeping top 10 features: {mse_after_top_features}')
plt.figure(figsize=(10, 6))
sns.barplot(x=X_train.columns[top_features], y=feature_importances[top_features])
plt.xticks(rotation=45)
plt.title('Top 10 Feature Importances from Random Forest')
plt.show()


KeyError: "['Price'] not found in axis"