In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [26]:
# Load the cleaned dataset. If the CSV is missing, print a hint and re-raise
# so the notebook stops here instead of failing later on an undefined `df`.
try:
    file = pd.read_csv('Cleaned.csv')
except FileNotFoundError:
    print("The file 'Cleaned.csv' was not found.")
    raise
else:
    df = pd.DataFrame(file)

In [27]:
# Target is the 'price' column; the feature frame is every other column.
y = df['price']
X = df.drop(columns='price')

In [28]:
# Column groups consumed by the preprocessing step:
# categorical columns and numerical columns are listed separately so each
# group can be given its own transformer.
categorical_features = [
    'county',
    'propertyType',
]
numerical_features = [
    'bedrooms',
    'bathrooms',
    'squareFootage',
    'lotSize',
    'yearBuilt',
]


In [29]:
# Check for missing values and impute them: median for numerical columns and
# the target, most frequent value for categorical columns.
#
# The filled result is assigned back instead of calling
# `X[col].fillna(..., inplace=True)`. The in-place form mutates a temporary
# column selection: it triggers a chained-assignment FutureWarning in
# pandas 2.x and leaves `X` unchanged under Copy-on-Write.
#
# NOTE(review): the medians/modes are computed on the full dataset, before the
# train/test split performed later in this notebook, so held-out rows
# contribute to the imputation statistics. Consider moving imputation into
# the modelling pipeline.
if X.isnull().any().any() or y.isnull().any():
    print("Handling missing values...")
    # Impute missing values
    for col in numerical_features:
        X[col] = X[col].fillna(X[col].median())
    for col in categorical_features:
        # mode() returns a Series of the most frequent values; take the first.
        X[col] = X[col].fillna(X[col].mode()[0])
    y = y.fillna(y.median())

In [30]:
# Per-group transformers. No other cell in this notebook defines them, so on
# a fresh kernel this cell previously failed with NameError. The choices below
# match the fitted pipeline repr recorded in this notebook's output:
# StandardScaler() for numerical columns and
# OneHotEncoder(handle_unknown='ignore') for categorical columns.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps: each transformer is applied only to its own
# column group. No `remainder` is passed, so the ColumnTransformer default
# applies to any column of X not listed in either group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [31]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors keyed by display name. Each value is an
# (estimator, param_grid) pair. Grid keys use the 'model__' prefix so they
# address the pipeline step named 'model'.
# NOTE(review): no later cell visible in this notebook reads `models`.
models = {
    'Linear Regression': (
        LinearRegression(),
        {},
    ),
    'Ridge Regression': (
        Ridge(random_state=1),
        {'model__alpha': [0.1, 1.0, 10.0]},
    ),
    'Lasso Regression': (
        Lasso(random_state=1),
        {'model__alpha': [0.1, 1.0, 10.0]},
    ),
    'Elastic Net Regression': (
        ElasticNet(random_state=1),
        {
            'model__alpha': [0.1, 1.0, 10.0],
            'model__l1_ratio': [0.1, 0.5, 0.9],
        },
    ),
    'Random Forest Regression': (
        RandomForestRegressor(random_state=1),
        {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 10, 20],
        },
    ),
}

In [32]:
# Estimator for the final pipeline step. `model` was not defined in any cell,
# so this cell raised NameError on a fresh kernel. The recorded output of the
# results cell shows the fitted final step as
# GradientBoostingRegressor(..., random_state=1), so that is what is built
# here; the remaining hyperparameters are set by the grid search.
model = GradientBoostingRegressor(random_state=1)

# Create a pipeline: preprocessing first, then the regressor. The step name
# 'model' is what the 'model__*' keys in the parameter grid refer to.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', model)])

In [33]:
# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [35]:
# Hyperparameter search space: 3 x 3 x 3 = 27 candidate combinations.
# Keys follow the '<step>__<parameter>' convention and target the pipeline
# step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__learning_rate=[0.01, 0.05, 0.1],
    model__max_depth=[3, 4, 5],
)


In [36]:
# Exhaustive 5-fold cross-validated search over `param_grid`, fitted on the
# training split only. Scoring is negative MSE, so higher is better.
print("Starting Grid Search...")
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)
grid_search.fit(X_train, y_train)
print("Grid Search complete.")

Starting Grid Search...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100; total time=   9.5s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100; total time=   9.1s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100; total time=   8.4s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100; total time=   9.9s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=100; total time=   9.0s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=200; total time=  19.0s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=200; total time=  16.2s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=200; total time=  16.5s
[CV] END model__learning_rate=0.01, model__max_depth=3, model__n_estimators=200; total time=  21.3s
[CV] END model

In [38]:
# Pull the winning pipeline out of the search and report it.
best_model = grid_search.best_estimator_
# Class name of the final 'model' step, used as a label in messages and filenames.
model_name = best_model.named_steps['model'].__class__.__name__

print(f"Best Parameters for {model_name}:", grid_search.best_params_)
# The search scored with 'neg_mean_squared_error'; negate to report plain MSE.
print(f"Best Score for {model_name}:", -grid_search.best_score_)
print(f"Best Model for {model_name}:", best_model)
print()


Best Parameters for GradientBoostingRegressor: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 300}
Best Score for GradientBoostingRegressor: 141348165476.88568
Best Model for GradientBoostingRegressor: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['bedrooms', 'bathrooms',
                                                   'squareFootage', 'lotSize',
                                                   'yearBuilt']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['county',
                                                   'propertyType'])])),
                ('model',
                 GradientBoostingRegressor(max_depth=5, n_estimators=300,
                                           random_state=1))]

In [40]:
# Score the tuned pipeline on the held-out test split.
print(f"Evaluating {model_name} on the test set...")
y_pred = best_model.predict(X_test)

# Compute both metrics up front, then report them.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"Mean Squared Error for {model_name}:", mse)
print(f"R² Score for {model_name}:", r2)

Evaluating GradientBoostingRegressor on the test set...
Mean Squared Error for GradientBoostingRegressor: 130811946782.38727
R² Score for GradientBoostingRegressor: 0.8307493505977925


In [42]:
# Write actual vs. predicted prices for the test split to a CSV named after
# the model class (spaces in the name become underscores).
results_filename = f'regression_results_{model_name.replace(" ", "_")}.csv'
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results.to_csv(results_filename, index=False)
print(f"Results saved to {results_filename}.\n")

Results saved to regression_results_GradientBoostingRegressor.csv.



In [None]:
# Persist the tuned pipeline (preprocessing + regressor) to disk with joblib.
import joblib

joblib.dump(value=best_model, filename='best_model.pkl')
print("Best model saved as best_model.pkl.")
