In [2]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sqlalchemy import create_engine, text

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
data = pd.read_sql_query('select * from houseprices',con=engine)
# No need for an open connection, 
# because you're only doing a single query
engine.dispose()

# Step 2: Data Cleaning
# Handling missing values, encoding categorical variables, scaling numerical variables

# Identify columns with missing values
missing_values = data.isnull().sum()
missing_values.info()
# Impute missing values for numerical columns with median
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

# Impute missing values for categorical columns with mode
cat_cols = data.select_dtypes(include=['object']).columns
for col in cat_cols:
    if data[col].isnull().any():
        mode_value = data[col].mode()
        if not mode_value.empty:
            data[col].fillna(mode_value[0], inplace=True)

# Encoding categorical variables using OneHotEncoder
data = pd.get_dummies(data, columns=cat_cols, drop_first=True)

# Split the data into training and test sets
X = data.drop('saleprice', axis=1)
y = data['saleprice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a KNN regression model
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
knn_predictions = knn.predict(X_test_scaled)

# Train an OLS regression model
ols = LinearRegression()
ols.fit(X_train_scaled, y_train)
ols_predictions = ols.predict(X_test_scaled)

# Evaluate performance
knn_mse = mean_squared_error(y_test, knn_predictions)
knn_r2 = r2_score(y_test, knn_predictions)
ols_mse = mean_squared_error(y_test, ols_predictions)
ols_r2 = r2_score(y_test, ols_predictions)

print("KNN Regression - MSE:", knn_mse, "R2:", knn_r2)
print("OLS Regression - MSE:", ols_mse, "R2:", ols_r2)


<class 'pandas.core.series.Series'>
Index: 81 entries, id to saleprice
Series name: None
Non-Null Count  Dtype
--------------  -----
81 non-null     int64
dtypes: int64(1)
memory usage: 3.3+ KB
KNN Regression - MSE: 1871123205.240548 R2: 0.7480677508532432
OLS Regression - MSE: 979545526.2358037 R2: 0.8681117807341264


# KNN Regression:

Mean Squared Error (MSE): 1871123205.240548
R-squared (R2): 0.7480677508532432
OLS Regression:
Mean Squared Error (MSE): 979545526.2358037
R-squared (R2): 0.8681117807341264

# Analysis

The performance metrics indicate that the OLS regression model outperforms the KNN regression model on this dataset. The lower MSE and higher R2 score for the OLS regression model suggest it is more accurate in predicting the target variable.

# KNN Regression:

KNN regression uses the average of the nearest neighbors to make predictions. In this case, the KNN model has a higher MSE and lower R2 compared to the OLS model. This could be because the relationships between the features and the target variable are more linear, which makes OLS regression a better fit.

# OLS Regression:

OLS regression assumes a linear relationship between the predictors and the target variable. The lower MSE and higher R2 score suggest that this assumption holds reasonably well for the Diabetes dataset, making OLS regression more effective in this scenario.
Conclusion
Given the results, the OLS regression model is favored over the KNN regression model for this dataset. The linear nature of the relationships in the data likely contributes to the better performance of the OLS model.

