In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [2]:
# Read the training CSV and separate the predictors from the target column.
train_data = pd.read_csv('housing_train_clean.csv')
X_train = train_data.drop(columns='SalePrice')  # every column except SalePrice
Y_train = train_data.loc[:, ['SalePrice']]  # one-column DataFrame with SalePrice

# Apply the same predictor/target split to the test CSV.
test_data = pd.read_csv('housing_test_clean.csv')
X_test = test_data.drop(columns='SalePrice')  # every column except SalePrice
Y_test = test_data.loc[:, ['SalePrice']]  # one-column DataFrame with SalePrice

In [3]:
# Convert the pandas objects into plain NumPy arrays.
# np.asarray is used instead of `.values` because these names are rebound to
# their own converted value: on a second run of this cell the inputs are
# already ndarrays, which have no `.values` attribute, whereas np.asarray
# accepts both DataFrames and ndarrays and so keeps the cell re-runnable.
X_train = np.asarray(X_train)
Y_train = np.asarray(Y_train).reshape(-1)  # flatten the single target column to 1-D

X_test = np.asarray(X_test)
Y_test = np.asarray(Y_test).reshape(-1)  # flatten the single target column to 1-D

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only, then apply that same
# fitted transformation to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Candidate hyper-parameter values shared by every grid search: 1, 11, ..., 91.
param_range = [1 + 10 * step for step in range(10)]

In [7]:
def perform_grid_search(estimator, param_grid, X, y, cv=5, scoring='neg_mean_squared_error'):
    """Run a cross-validated grid search and report the winning configuration.

    Parameters
    ----------
    estimator : estimator object
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid to search, forwarded unchanged.
    X, y : array-like
        Features and targets used to fit the search.
    cv : int or cross-validation splitter, default 5
        Cross-validation strategy forwarded to ``GridSearchCV``.
    scoring : str or callable, default 'neg_mean_squared_error'
        Scoring rule forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_params_, best_score_)`` taken from the fitted search. With the
        default scoring the score is a negated mean squared error, so values
        closer to zero are better.
    """
    grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=cv, scoring=scoring)
    grid_search.fit(X, y)
    return grid_search.best_params_, grid_search.best_score_


In [8]:
# KNeighborsRegressor: search over the number of neighbours.
knn_params = {'n_neighbors': param_range}
knn_best_params, knn_best_score = perform_grid_search(
    estimator=KNeighborsRegressor(),
    param_grid=knn_params,
    X=X_train_scaled,
    y=Y_train,
)

# DecisionTreeRegressor: search over the maximum tree depth (seeded).
dt_params = {'max_depth': param_range}
dt_best_params, dt_best_score = perform_grid_search(
    estimator=DecisionTreeRegressor(random_state=0),
    param_grid=dt_params,
    X=X_train_scaled,
    y=Y_train,
)

# RandomForestRegressor: search over the maximum tree depth (seeded).
rf_params = {'max_depth': param_range}
rf_best_params, rf_best_score = perform_grid_search(
    estimator=RandomForestRegressor(random_state=0),
    param_grid=rf_params,
    X=X_train_scaled,
    y=Y_train,
)


In [9]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score its test-set predictions.

    The model's own ``fit`` method is called, so the object passed in is the
    one that gets fitted.

    Returns
    -------
    tuple
        ``(mse, mae, mape)``: mean squared error, mean absolute error and mean
        absolute percentage error of the predictions against ``y_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Order matters: callers index the result as (MSE, MAE, MAPE).
    metric_functions = (
        mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
    )
    return tuple(metric(y_test, y_pred) for metric in metric_functions)

# Refit each regressor with its grid-search winner and score it on the test set.
knn_model = KNeighborsRegressor(n_neighbors=knn_best_params['n_neighbors'])
knn_metrics = evaluate_model(
    model=knn_model,
    X_train=X_train_scaled,
    y_train=Y_train,
    X_test=X_test_scaled,
    y_test=Y_test,
)

dt_model = DecisionTreeRegressor(
    max_depth=dt_best_params['max_depth'],
    random_state=0,
)
dt_metrics = evaluate_model(
    model=dt_model,
    X_train=X_train_scaled,
    y_train=Y_train,
    X_test=X_test_scaled,
    y_test=Y_test,
)

rf_model = RandomForestRegressor(
    max_depth=rf_best_params['max_depth'],
    random_state=0,
)
rf_metrics = evaluate_model(
    model=rf_model,
    X_train=X_train_scaled,
    y_train=Y_train,
    X_test=X_test_scaled,
    y_test=Y_test,
)

In [10]:
# One row per model; each metrics tuple is ordered (MSE, MAE, MAPE).
_metrics_by_model = [knn_metrics, dt_metrics, rf_metrics]
results_df = pd.DataFrame({
    '': ['KNeighborsRegressor', 'DecisionTreeRegressor', 'RandomForestRegressor'],
    'MSE': [metrics[0] for metrics in _metrics_by_model],
    'MAE': [metrics[1] for metrics in _metrics_by_model],
    'MAPE': [metrics[2] for metrics in _metrics_by_model],
})

In [11]:
# Use the model names as the index, then flip the table so that each model is
# a column and each metric is a row.
results_df_transposed = results_df.set_index('').transpose()
results_df_transposed

Unnamed: 0,KNeighborsRegressor,DecisionTreeRegressor,RandomForestRegressor
MSE,1362214000.0,1177341000.0,620376500.0
MAE,22525.75,19804.76,10814.93
MAPE,0.1270808,0.1100841,0.06108037
