In [160]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, r2_score

In [161]:
# Load dataset
df = pd.read_csv("californiahousing.csv")

# Display first few rows
print(df.head())

# Check dataset shape
print("Dataset shape:", df.shape)

# View dataframe info
print(df.info())

# Get summary statistics for all columns
print("\nSummary for all columns:")
print(df.describe(include='all'))

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
Dataset shape: (20640, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex

In [162]:
df["total_bedrooms"] = df["total_bedrooms"].fillna(df["total_bedrooms"].median())

In [163]:
# Dropping any non-numerical data
numeric_df = df.select_dtypes(include=[np.number])
print(numeric_df.columns)

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')


In [171]:
numeric_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,536.838857,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,419.391878,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,297.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,643.25,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [164]:
# Target variable
y = numeric_df["median_house_value"]

# Feature variables
X = numeric_df.drop("median_house_value", axis=1)

In [165]:
X = X.fillna(X.median())

In [166]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=50
)

In [167]:
tree_model = DecisionTreeRegressor(
    max_depth=10,
    random_state=90
)

tree_model.fit(X_train, y_train)

In [168]:
y_pred = tree_model.predict(X_test)

In [169]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
accuracy = r2 *100
print("Model accuracy percentage explained by variance:", accuracy, "%")

Mean Squared Error: 3849685480.9411197
R-squared: 0.7036660704398434
Model accuracy percentage explained by variance: 70.36660704398435 %


In [170]:
print("Tree depth:", tree_model.get_depth())
print("Number of leaves:", tree_model.get_n_leaves())

Tree depth: 10
Number of leaves: 774
