In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score




In [23]:
# First, let's load the CSV file to understand its structure
import pandas as pd

# Load the data
data = pd.read_csv('housing.csv', header=None)

# Display the first few rows of the dataframe to understand its structure
data.head()


Unnamed: 0,0
0,0.00632 18.00 2.310 0 0.5380 6.5750 65...
1,0.02731 0.00 7.070 0 0.4690 6.4210 78...
2,0.02729 0.00 7.070 0 0.4690 7.1850 61...
3,0.03237 0.00 2.180 0 0.4580 6.9980 45...
4,0.06905 0.00 2.180 0 0.4580 7.1470 54...


In [24]:
# Since the values are separated by spaces, we'll specify the delimiter and column names
column_names = ["crime", "zone", "industry", "charles", "no", "rooms",
                "age", "distance", "radial", "tax", "pupil", "aam", "lower", "med_price"]

# Read the data again with the correct delimiter and column names
data = pd.read_csv('housing.csv', delim_whitespace=True, names=column_names)
data



Unnamed: 0,crime,zone,industry,charles,no,rooms,age,distance,radial,tax,pupil,aam,lower,med_price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [27]:
# Save the modified DataFrame into a new CSV file
new_csv_path = 'new_housing_dataset.csv'
data.to_csv(new_csv_path, index=False)

new_csv_path

'new_housing_dataset.csv'

In [28]:
data.head()

Unnamed: 0,crime,zone,industry,charles,no,rooms,age,distance,radial,tax,pupil,aam,lower,med_price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [44]:
# Assuming 'data' is your DataFrame
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Corrected Pipeline creation
rf_pipeline = Pipeline([
                        ("st_scaler", StandardScaler()), 
                        ("rf_model", RandomForestRegressor())
])

# Perform cross-validation
scores = cross_val_score(rf_pipeline, X, y, scoring="neg_mean_squared_error", cv=10)
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final Random Forest RMSE:", final_avg_rmse)

Final Random Forest RMSE: 4.200931886476444


In [43]:
import xgboost as xgb

xgb_pipeline = Pipeline([
                        ("st_scaler", StandardScaler()),
                        ("xgb_model",xgb.XGBRegressor())
])

scores = cross_val_score(xgb_pipeline, X, y, scoring="neg_mean_squared_error",cv=10)
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final XGB RMSE:", final_avg_rmse)

Final XGB RMSE: 4.60829421081934


In [48]:
from sklearn.model_selection import RandomizedSearchCV

xgb_pipeline = Pipeline([
                       ("st_scaler", StandardScaler()), ("xgb_model",xgb.XGBRegressor())
])
gbm_param_grid = {
     'xgb_model__subsample': np.arange(.05, 1, .05),
     'xgb_model__max_depth': np.arange(3,20,1),
     'xgb_model__colsample_bytree': np.arange(.1,1.05,.05) }
randomized_neg_mse = RandomizedSearchCV(estimator=xgb_pipeline, param_distributions=gbm_param_grid, n_iter=10,
        scoring='neg_mean_squared_error', cv=4)

randomized_neg_mse.fit(X, y)

In [49]:
print("Best rmse: ", np.sqrt(np.abs(randomized_neg_mse.best_score_)))

Best rmse:  4.7506231507720855


In [50]:
print("Best model: ", randomized_neg_mse.best_estimator_)

Best model:  Pipeline(steps=[('st_scaler', StandardScaler()),
                ('xgb_model',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=1.0000000000000004, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=8, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, mult