Missing Values Ratio

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [None]:
# Load the raw housing data from 'melbourne_housing_raw.csv'
# (path is resolved relative to the notebook's working directory).
data = pd.read_csv('melbourne_housing_raw.csv')
# Echo the frame's plain-text representation as a quick sanity check.
print(data)

           Suburb  Rooms Type      Price Method        SellerG     Date  \
0      Abbotsford      2    h        NaN     SS         Jellis   3/9/16   
1      Abbotsford      2    h  1480000.0      S         Biggin  3/12/16   
2      Abbotsford      2    h  1035000.0      S         Biggin   4/2/16   
3      Abbotsford      3    u        NaN     VB         Rounds   4/2/16   
4      Abbotsford      3    h  1465000.0     SP         Biggin   4/3/17   
...           ...    ...  ...        ...    ...            ...      ...   
34852  Yarraville      4    h  1480000.0     PI            Jas  24/2/18   
34853  Yarraville      2    h   888000.0     SP        Sweeney  24/2/18   
34854  Yarraville      2    t   705000.0      S            Jas  24/2/18   
34855  Yarraville      3    h  1140000.0     SP  hockingstuart  24/2/18   
34856  Yarraville      2    h  1020000.0     PI             RW  24/2/18   

       Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  BuildingArea  \
0           2.5    30

In [None]:
# Share of missing entries per column, as a percentage (0-100).
# The mean of the boolean NA mask is the fraction of NA rows in each column.
missing_percent = data.isna().mean() * 100

In [None]:
# Collect every column whose missing ratio exceeds 20 %, but never the
# target column 'Price' (its missing rows are handled separately).
columns_to_drop = [
    name
    for name, percent in missing_percent.items()
    if name != 'Price' and percent > 20
]
# Build the filtered frame without touching the original `data`.
data_filtered = data.drop(columns=columns_to_drop)

In [None]:
# Keep only the rows that actually have a target value ('Price' not missing).
data_filtered = data_filtered[data_filtered['Price'].notna()]

In [None]:
# Target vector: the sale price.
y = data_filtered['Price']
# Feature matrix: every remaining column except the target.
X = data_filtered.drop(columns=['Price'])

In [None]:
# Baseline: linear regression on the columns that survived the
# missing-values-ratio filter.

# One-hot encode the categorical (object) columns; other columns pass through.
X = pd.get_dummies(X)

# NOTE: the original cell ran `pd.to_numeric` on `X[column]` here, but `column`
# is never defined in this notebook, so a fresh "Restart & Run All" raised
# NameError. The line is dropped: after `get_dummies` on a frame read by
# `read_csv`, the columns are presumably all numeric/bool already, so the
# coercion had nothing left to convert.

# Impute remaining NaNs with each column's mean (computed over all rows,
# i.e. before the train/test split).
X = X.fillna(X.mean())

# 80/20 split with a fixed seed so the later cells can reproduce the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# RMSE = sqrt(MSE). The `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE with filtered columns: {rmse}")

RMSE with filtered columns: 388776.6040623026




High Correlation Filter

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# High Correlation Filter: drop one feature from every pair whose absolute
# Pearson correlation exceeds 0.85, then refit the linear model.

# Absolute pairwise correlations between all feature columns.
correlation_matrix = X.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is inspected once and a column is never compared with itself.
upper_triangle = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)

# A column is flagged when it correlates > 0.85 with any earlier column.
# NaN entries (the masked lower triangle) compare as False.
highly_correlated_features = [
    column
    for column in upper_triangle.columns
    if (upper_triangle[column] > 0.85).any()
]
X_reduced = X.drop(columns=highly_correlated_features)

# Same test_size / random_state as the baseline split, so the same rows end
# up in the train and test partitions.
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)

model_reduced = LinearRegression()
model_reduced.fit(X_train_reduced, y_train)
y_pred_reduced = model_reduced.predict(X_test_reduced)

# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; the explicit square root works on every version.
rmse_reduced = mean_squared_error(y_test, y_pred_reduced) ** 0.5
print(f"RMSE with reduced columns (correlation > 0.85 removed): {rmse_reduced}")

RMSE with reduced columns (correlation > 0.85 removed): 388783.1609091534




Low Variance Filter

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Low Variance Filter: drop features whose variance on the training set is
# at or below 0.1, then refit the linear model on the surviving columns.
selector = VarianceThreshold(threshold=0.1)

# Fit on the training partition only; the transformed array is kept for
# reference, while the models below use the column-labelled frames.
X_variance_filtered = selector.fit_transform(X_train)

# Boolean support mask -> names of the columns that passed the threshold.
selected_features = X_train.columns[selector.get_support()]
X_train_variance_filtered = X_train[selected_features]
X_test_variance_filtered = X_test[selected_features]

model_variance_filtered = LinearRegression()
model_variance_filtered.fit(X_train_variance_filtered, y_train)
y_pred_variance_filtered = model_variance_filtered.predict(X_test_variance_filtered)

# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; the explicit square root works on every version.
rmse_variance_filtered = mean_squared_error(y_test, y_pred_variance_filtered) ** 0.5
print(f"RMSE with variance filtered columns: {rmse_variance_filtered}")


RMSE with variance filtered columns: 450213.43166508543




Forward Feature Selection