In [None]:
# 1.0 Import the libraries used throughout the notebook
import pandas as pd

# 1.0.1
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,accuracy_score

# 1.0.2
from pathlib import Path


In [None]:
# 1.0.3 Mount Google Drive at /gdrive so the dataset stored there can be read.
#       NOTE(review): `google.colab` is presumably only importable inside a
#       Colab runtime — confirm before running this notebook elsewhere.

from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# 1.1 Path to the data folder (located under the /gdrive mount point):

pathToFolder = "/gdrive/MyDrive/bdda1/Dataset"

In [None]:
# 1.1.1 Full path to the zipped CSV (data folder + file name):

path = Path(pathToFolder).joinpath("housing.csv.zip")

In [None]:
# 1.2 Show the result of every top-level expression in a cell,
#     not only the last one:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 1.3 Load data: read the zipped CSV at `path` into the DataFrame X.
#     The file is decoded as ISO-8859-1 (Latin-1).
X = pd.read_csv(path, encoding='ISO-8859-1')

In [None]:
# 1.3.1 About data: dimensions and first five rows

X.shape   # (20640, 10)
X.head()

(20640, 10)

Unnamed: 0.1,Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# 1.3.2 Data types of each column (check whether any column is non-numeric):
X.dtypes

Unnamed: 0       int64
MedInc         float64
HouseAge       float64
AveRooms       float64
AveBedrms      float64
Population     float64
AveOccup       float64
Latitude       float64
Longitude      float64
MedHouseVal    float64
dtype: object

In [None]:
# 1.4 Number of distinct values in each column:

for column_name, distinct_count in X.nunique().items():
    print(column_name, distinct_count)

Unnamed: 0 20640
MedInc 12928
HouseAge 52
AveRooms 19392
AveBedrms 14233
Population 3888
AveOccup 18841
Latitude 862
Longitude 844
MedHouseVal 3842


In [None]:
# 1.5 Count missing values across the whole frame (0 means there is nothing to filter out)
X.isnull().sum().sum()

0

In [None]:
# 2.0 Choose target and features.
#     Target: the 'AveBedrms' column. Features: every column left in X.
#     X.pop() removes the column from X in place, so without the guard a
#     second run of this cell would fail because the column is already gone.
#     With the guard, a re-run simply keeps the y extracted the first time.
#     NOTE(review): 'Unnamed: 0' stays in X as a feature; it has one distinct
#     value per row and looks like a saved row index — consider dropping it.
if 'AveBedrms' in X.columns:
    y = X.pop('AveBedrms')
y

0        1.023810
1        0.971880
2        1.073446
3        1.073059
4        1.081081
           ...   
20635    1.133333
20636    1.315789
20637    1.120092
20638    1.171920
20639    1.162264
Name: AveBedrms, Length: 20640, dtype: float64

In [None]:
# 2.0.1 Inspect X after the target column has been removed
X

Unnamed: 0.1,Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,0,8.3252,41.0,6.984127,322.0,2.555556,37.88,-122.23,4.526
1,1,8.3014,21.0,6.238137,2401.0,2.109842,37.86,-122.22,3.585
2,2,7.2574,52.0,8.288136,496.0,2.802260,37.85,-122.24,3.521
3,3,5.6431,52.0,5.817352,558.0,2.547945,37.85,-122.25,3.413
4,4,3.8462,52.0,6.281853,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,20635,1.5603,25.0,5.045455,845.0,2.560606,39.48,-121.09,0.781
20636,20636,2.5568,18.0,6.114035,356.0,3.122807,39.49,-121.21,0.771
20637,20637,1.7000,17.0,5.205543,1007.0,2.325635,39.43,-121.22,0.923
20638,20638,1.8672,18.0,5.329513,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# 2.1 Split features and target into a training half and a validation half
#     (fixed seed so the same rows land in each half on every run):

train_x, test_x, train_y, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

In [None]:
# 2.1.1 Compare the shape of the full frame with the shapes of the
#       training/validation features and targets produced by the split
X.shape
train_x.shape
test_x.shape
train_y.shape
y_test.shape

(20640, 9)

(10320, 9)

(10320, 9)

(10320,)

(10320,)

In [None]:
# 3.0 A function to model and also to calculate MAE
def get_mae(max_depth, train_x, test_x, train_y, y_test, random_state=1):
    """Fit a decision-tree regressor and return its MAE on held-out data.

    Parameters
    ----------
    max_depth : int or None
        Forwarded to ``DecisionTreeRegressor(max_depth=...)``.
    train_x, train_y
        Features and target used to fit the tree.
    test_x, y_test
        Held-out features and target used to score the fitted tree.
    random_state : int or None, default 1
        Seed forwarded to ``DecisionTreeRegressor``. A fixed seed makes the
        reported MAE repeatable between runs; pass ``None`` to get the
        estimator's unseeded behaviour.

    Returns
    -------
    float
        Mean absolute error between ``y_test`` and the predictions made
        for ``test_x``.
    """
    # 3.1 Instantiate the decision tree regressor (seeded for repeatability)
    tree = DecisionTreeRegressor(max_depth=max_depth, random_state=random_state)
    # 3.2 Train the model on the training split
    tree.fit(train_x, train_y)
    # 3.3 Predict on the held-out split
    predictions = tree.predict(test_x)
    # 3.4 Compute and return the error
    return mean_absolute_error(y_test, predictions)

In [None]:
# 3.1 Compare MAE across several values of max_depth:

for depth in (5, 10, 15, 20):
    depth_mae = get_mae(depth, train_x, test_x, train_y, y_test)
    print(f"Max depth: {depth} \t\t Mean Absolute Error: {depth_mae}")

Max depth: 5 		 Mean Absolute Error: 0.078041214907258
Max depth: 10 		 Mean Absolute Error: 0.06962469761890053
Max depth: 15 		 Mean Absolute Error: 0.06955870084490465
Max depth: 20 		 Mean Absolute Error: 0.0816210009166139


## **Conclusion:**
As the maximum depth of the model increases from 5 to 15, the Mean Absolute Error decreases: it falls from about 0.078 at depth 5 to about 0.070 at depths 10 and 15. A shallow tree with max depth 5 is therefore less accurate than the moderately deep trees, which suggests it underfits. The improvement levels off between depths 10 and 15, where the MAE values are nearly identical. Beyond that point, increasing the max depth does not improve accuracy: at depth 20 the MAE rises to about 0.082, the highest of the four values, which suggests the deeper tree is overfitting the training data. Among the depths tried, a max depth of around 10 to 15 gives the lowest mean absolute error.
