In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# Read the housing data; the path is relative, so it resolves against the
# directory the notebook kernel is running in.
housing_csv = Path("datasets/housing/housing.csv")
df = pd.read_csv(housing_csv)




In [3]:
# Print each column's dtype and non-null count to spot missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [4]:
# List the column labels of the frame.
df.columns


Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the frame into the regression target and the feature matrix.
y = df['median_house_value']
X = df.drop(columns='median_house_value')

# Group the feature columns by dtype so each group can be preprocessed differently.
# 'number' selects every numeric dtype rather than only int64/float64, so a
# numeric column stored as e.g. int32 or float32 is not silently left out.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include='number').columns

print(cat_cols)
print(num_cols)


Index(['ocean_proximity'], dtype='object')
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split identical from run to run.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)



In [8]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns: fill gaps with the column median, then standardize.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
cat_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Send each column group through its own pipeline.
col_transformer = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
])


In [9]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Candidate regressors, each paired with a display name. The tree-based models
# are randomized, so they are seeded with the same value as the train/test
# split (42); without it the scores change from run to run.
models = [
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge(alpha=0.1)),
    ("Lasso", Lasso(alpha=0.1)),
    ("ElasticNet", ElasticNet(alpha=0.1, l1_ratio=0.5)),
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=42)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42)),
    ("SVR", SVR(kernel='linear')),
    ("GradientBoostingRegressor", GradientBoostingRegressor(random_state=42)),
]

# Fit each model behind the shared preprocessor and score it on the test split.
# NOTE: despite its name, `pipelines` collects (name, rmse, r2) result tuples,
# not the fitted Pipeline objects; the name is kept for compatibility.
pipelines = []
for name, model in models:
    pipe = Pipeline([
        ("preprocessor", col_transformer),
        ("regressor", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # RMSE is in the same units as the target; R2 is unitless.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    pipelines.append((name, rmse, r2))


# One row per model, in the order the models were evaluated.
results = pd.DataFrame(pipelines, columns=["Model", "RMSE", "Score R2"])

print(results)



                       Model           RMSE  Score R2
0           LinearRegression   69791.254856  0.648635
1                      Ridge   69791.342795  0.648634
2                      Lasso   69791.275312  0.648635
3                 ElasticNet   71080.243650  0.635536
4      DecisionTreeRegressor   68441.974628  0.662090
5      RandomForestRegressor   49373.310084  0.824151
6                        SVR  114626.241100  0.052183
7  GradientBoostingRegressor   56799.590145  0.767273


In [10]:
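# NOTE: this entire cell is commented out, so none of the hyperparameter search below runs.
# It relies on `models`, `col_transformer` and the train/test splits defined in earlier cells.
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score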
# from scipy.stats import randint, uniform
# import time

# # Choose search method: 'grid' or 'randomized'
# SEARCH_METHOD = 'randomized'  # Change to 'grid' for GridSearchCV
# CV_FOLDS = 5  # Number of cross-validation folds
# N_ITER = 50  # For RandomizedSearchCV - number of iterations

# # Define parameter grids for GridSearchCV
# param_grids = {
#     "Ridge": {
#         "regressor__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]
#     },
#     "Lasso": {
#         "regressor__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]
#     },
#     "ElasticNet": {
#         "regressor__alpha": [0.01, 0.1, 1.0, 10.0],
#         "regressor__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]
#     },
#     "DecisionTreeRegressor": {
#         "regressor__max_depth": [None, 10, 20, 30, 50],
#         "regressor__min_samples_split": [2, 5, 10],
#         "regressor__min_samples_leaf": [1, 2, 4]
#     },
#     "RandomForestRegressor": {
#         "regressor__n_estimators": [100, 200, 300],
#         "regressor__max_depth": [None, 10, 20, 30],
#         "regressor__min_samples_split": [2, 5, 10],
#         "regressor__min_samples_leaf": [1, 2, 4]
#     },
#     "SVR": {
#         "regressor__C": [0.1, 1, 10, 100],
#         "regressor__kernel": ['linear', 'rbf', 'poly'],
#         "regressor__gamma": ['scale', 'auto', 0.001, 0.01, 0.1]
#     },
#     "GradientBoostingRegressor": {
#         "regressor__n_estimators": [100, 200, 300],
#         "regressor__learning_rate": [0.01, 0.1, 0.2],
#         "regressor__max_depth": [3, 5, 7],
#         "regressor__min_samples_split": [2, 5, 10]
#     }
# }

# # Define parameter distributions for RandomizedSearchCV
# param_distributions = {
#     "Ridge": {
#         "regressor__alpha": uniform(0.01, 100)
#     },
#     "Lasso": {
#         "regressor__alpha": uniform(0.01, 100)
#     },
#     "ElasticNet": {
#         "regressor__alpha": uniform(0.01, 10),
#         "regressor__l1_ratio": uniform(0.1, 0.9)
#     },
#     "DecisionTreeRegressor": {
#         "regressor__max_depth": randint(5, 50),
#         "regressor__min_samples_split": randint(2, 20),
#         "regressor__min_samples_leaf": randint(1, 10)
#     },
#     "RandomForestRegressor": {
#         "regressor__n_estimators": randint(100, 500),
#         "regressor__max_depth": [None] + list(range(10, 50, 10)),
#         "regressor__min_samples_split": randint(2, 20),
#         "regressor__min_samples_leaf": randint(1, 10)
#     },
#     "SVR": {
#         "regressor__C": uniform(0.1, 100),
#         "regressor__kernel": ['linear', 'rbf', 'poly'],
#         "regressor__gamma": ['scale', 'auto'] + list(uniform(0.001, 0.1).rvs(3))
#     },
#     "GradientBoostingRegressor": {
#         "regressor__n_estimators": randint(100, 500),
#         "regressor__learning_rate": uniform(0.01, 0.3),
#         "regressor__max_depth": randint(3, 10),
#         "regressor__min_samples_split": randint(2, 20)
#     }
# }

# optimized_results = []

# for name, model in models:
#     if name == "LinearRegression":
#         # LinearRegression doesn't have hyperparameters to tune
#         pipe = Pipeline([
#             ("preprocessor", col_transformer),
#             ("regressor", model)
#         ])
#         cv_scores = cross_val_score(pipe, X_train, y_train, 
#                                    cv=CV_FOLDS, scoring='neg_mean_squared_error')
#         rmse_cv = np.sqrt(-cv_scores.mean())
        
#         pipe.fit(X_train, y_train)
#         y_pred = pipe.predict(X_test)
#         rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
#         r2 = r2_score(y_test, y_pred)
        
#         optimized_results.append({
#             "Model": name,
#             "RMSE_CV": rmse_cv,
#             "RMSE_Test": rmse_test,
#             "R2_Score": r2,
#             "Best_Params": "No hyperparameters"
#         })
#         continue
    
#     if name not in param_grids:
#         continue
    
#     pipe = Pipeline([
#         ("preprocessor", col_transformer),
#         ("regressor", model)
#     ])
    
#     print(f"\n{'='*60}")
#     print(f"Optimizing {name}...")
#     print(f"{'='*60}")
    
#     start_time = time.time()
    
#     if SEARCH_METHOD == 'grid':
#         search = GridSearchCV(
#             pipe,
#             param_grids[name],
#             cv=CV_FOLDS,
#             scoring='neg_mean_squared_error',
#             n_jobs=-1,
#             verbose=1
#         )
#     else:  # randomized
#         search = RandomizedSearchCV(
#             pipe,
#             param_distributions[name],
#             n_iter=N_ITER,
#             cv=CV_FOLDS,
#             scoring='neg_mean_squared_error',
#             n_jobs=-1,
#             verbose=1,
#             random_state=42
#         )
    
#     search.fit(X_train, y_train)
    
#     elapsed_time = time.time() - start_time
    
#     # Get best model and evaluate on test set
#     best_pipe = search.best_estimator_
#     y_pred = best_pipe.predict(X_test)
    
#     rmse_cv = np.sqrt(-search.best_score_)
#     rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
#     r2 = r2_score(y_test, y_pred)
    
#     optimized_results.append({
#         "Model": name,
#         "RMSE_CV": rmse_cv,
#         "RMSE_Test": rmse_test,
#         "R2_Score": r2,
#         "Best_Params": str(search.best_params_),
#         "Time_Seconds": round(elapsed_time, 2)
#     })
    
#     print(f"Best CV RMSE: {rmse_cv:.2f}")
#     print(f"Test RMSE: {rmse_test:.2f}")
#     print(f"Best Parameters: {search.best_params_}")

# # Create results dataframe
# optimized_results_df = pd.DataFrame(optimized_results)
# print("\n" + "="*80)
# print("OPTIMIZED RESULTS WITH CROSS-VALIDATION")
# print("="*80)
# print(optimized_results_df.to_string(index=False))
