In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
# 1. Load the data
# Only prompt for an upload when the CSV is not already in the working
# directory. This keeps "Restart & Run All" from blocking on the interactive
# upload widget and lets the cell run outside Colab once the file is present.
if not Path("housing.csv").exists():
    # Imported lazily: google.colab is only available inside a Colab runtime.
    from google.colab import files
    # `uploaded` is only bound when an upload actually takes place.
    uploaded = files.upload()

Saving housing.csv to housing.csv


In [None]:
# Read the housing CSV from the working directory into the raw DataFrame.
housing_csv = "housing.csv"
df = pd.read_csv(housing_csv)

In [None]:
# Show the first five rows as a quick check of the loaded columns.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# 2. Create a stratified test set based on income category
# Bucket median income into five strata. The categories are kept in their own
# Series so the source frame is not mutated and no helper column has to be
# dropped from the resulting splits.
income_cat = pd.cut(
    df['median_income'],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, income_cat))

# The splitter returns positional row indices, so select with .iloc.
# Label-based .loc only gives the same rows when the index is the default
# 0..n-1 RangeIndex, and fails on a frame whose index has been shuffled.
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [None]:
# From here on `df` is an independent copy of the training split, so later
# changes to it cannot modify strat_train_set.
df = strat_train_set.copy(deep=True)

In [None]:
# 3. Split the training frame into predictors and the target column
target_col = 'median_house_value'
df_features = df.drop(columns=target_col)
df_label = df[target_col]

In [None]:
# 4. Column groups fed to the preprocessing pipelines
cat_attribs = ['ocean_proximity']
# Every remaining predictor column is treated as numerical.
num_attribs = df_features.columns.drop(cat_attribs).tolist()

In [None]:
# 5. Pipelines

# Numerical columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode. handle_unknown='ignore' keeps transform
# from failing on a category that was not present when the encoder was fitted.
categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
cat_pipeline = Pipeline(steps=categorical_steps)

# Full pipeline: route each column group through its own sub-pipeline.
column_routes = [
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_routes)

In [None]:
# Fit the preprocessing on the training predictors and transform them.
prepared_values = full_pipeline.fit_transform(df_features)

# Keep the generated feature names and the original row index. Without them
# the frame gets anonymous integer columns and a fresh 0..n-1 index that no
# longer lines up with df_label in any index-aligned pandas operation.
df_prepared = pd.DataFrame(
    prepared_values,
    columns=full_pipeline.get_feature_names_out(),
    index=df_features.index,
)
df_prepared.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.94135,1.347438,0.027564,0.584777,0.640371,0.732602,0.556286,-0.893647,0.0,1.0,0.0,0.0,0.0
1,1.171782,-1.19244,-1.722018,1.261467,0.781561,0.533612,0.721318,1.292168,0.0,0.0,0.0,0.0,1.0
2,0.267581,-0.125972,1.22046,-0.469773,-0.545138,-0.674675,-0.524407,-0.525434,0.0,1.0,0.0,0.0,0.0
3,1.221738,-1.351474,-0.370069,-0.348652,-0.036367,-0.467617,-0.037297,-0.865929,0.0,0.0,0.0,0.0,1.0
4,0.437431,-0.635818,-0.131489,0.427179,0.27279,0.37406,0.220898,0.325752,1.0,0.0,0.0,0.0,0.0


In [None]:
# Candidate regressors: Linear Regression, Decision Tree and Random Forest.
# The two tree-based models are seeded so repeated runs give the same fit.
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor(random_state=42)
rf_model = RandomForestRegressor(random_state=42)

# Fit each model on the prepared training data.
lr_model.fit(df_prepared, df_label)
dt_model.fit(df_prepared, df_label)
rf_model.fit(df_prepared, df_label)

In [None]:
# In-sample predictions: each model predicts the same rows it was fitted on.
lr_preds, dt_preds, rf_preds = (
    fitted.predict(df_prepared) for fitted in (lr_model, dt_model, rf_model)
)

In [None]:
# RMSE of each model's predictions against the training labels
lr_rmse, dt_rmse, rf_rmse = (
    root_mean_squared_error(df_label, preds)
    for preds in (lr_preds, dt_preds, rf_preds)
)

In [None]:
# Report the RMSEs, one model per line. These were computed on the data the
# models were fitted on, so they are optimistic estimates of the error.
print(
    f"Linear Regression RMSE: {lr_rmse}",
    f"Decision Tree RMSE: {dt_rmse}",
    f"Random Forest RMSE: {rf_rmse}",
    sep="\n",
)

Linear Regression RMSE: 69050.56219504567
Decision Tree RMSE: 0.0
Random Forest RMSE: 18342.366362322846


In [None]:
# Cross-validate preprocessing and model together on the raw predictors.
# Scoring on the already-transformed df_prepared would reuse imputer medians
# and scaler statistics computed from every row, so each validation fold
# would leak into that fold's training data.
lr_cv_pipeline = Pipeline([
    ('prep', full_pipeline),
    ('model', lr_model),
])
# The scorer reports negated RMSE (higher is better), so flip the sign back.
lr_cv = -cross_val_score(lr_cv_pipeline, df_features, df_label, scoring='neg_root_mean_squared_error', cv=10)
print("Linear Regression CV RMSEs:", lr_cv)
print("\nCross-Validation Performance (Linear Regression):")
print(pd.Series(lr_cv).describe())

Linear Regression CV RMSEs: [72229.03469752 65318.2240289  67706.39604745 69368.53738998
 66767.61061621 73003.75273869 70522.24414582 69440.77896541
 66930.32945876 70756.31946074]

Cross-Validation Performance (Linear Regression):
count       10.000000
mean     69204.322755
std       2500.382157
min      65318.224029
25%      67124.346106
50%      69404.658178
75%      70697.800632
max      73003.752739
dtype: float64


In [None]:
# Cross-validate preprocessing and model together on the raw predictors.
# Scoring on the already-transformed df_prepared would reuse imputer medians
# and scaler statistics computed from every row, so each validation fold
# would leak into that fold's training data.
dt_cv_pipeline = Pipeline([
    ('prep', full_pipeline),
    ('model', dt_model),
])
# The scorer reports negated RMSE (higher is better), so flip the sign back.
dt_cv = -cross_val_score(dt_cv_pipeline, df_features, df_label, scoring='neg_root_mean_squared_error', cv=10)
print("Decision Tree CV RMSEs:", dt_cv)
print("\nCross-Validation Performance (Decision Tree):")
print(pd.Series(dt_cv).describe())

Decision Tree CV RMSEs: [71177.6601991  69770.07865373 64770.5639395  68536.60203993
 67057.08155801 68847.12456973 70977.38255647 69208.86346929
 67187.87131535 73280.38732407]

Cross-Validation Performance (Decision Tree):
count       10.000000
mean     69081.361563
std       2420.500173
min      64770.563939
25%      67525.053996
50%      69027.994020
75%      70675.556581
max      73280.387324
dtype: float64


In [None]:
# Cross-validate preprocessing and model together on the raw predictors.
# Scoring on the already-transformed df_prepared would reuse imputer medians
# and scaler statistics computed from every row, so each validation fold
# would leak into that fold's training data.
rf_cv_pipeline = Pipeline([
    ('prep', full_pipeline),
    ('model', rf_model),
])
# The scorer reports negated RMSE (higher is better), so flip the sign back.
rf_cv = -cross_val_score(rf_cv_pipeline, df_features, df_label, scoring='neg_root_mean_squared_error', cv=10)
print("Random Forest CV RMSEs:", rf_cv)
print("\nCross-Validation Performance (Random Forest):")
print(pd.Series(rf_cv).describe())

Random Forest CV RMSEs: [51039.08053738 48741.94041426 45940.42771745 50501.41453432
 47387.7896427  49595.25845731 51625.68567717 48865.70709952
 47322.87631489 53301.08748462]

Cross-Validation Performance (Random Forest):
count       10.000000
mean     49432.126788
std       2239.797830
min      45940.427717
25%      47726.327336
50%      49230.482778
75%      50904.664037
max      53301.087485
dtype: float64
