# Regression tree - Housing Price database

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

## Dara Preprocessing

- Adding the feature name to the data
- making sure there are no null values in the data

In [2]:
word_labels = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "label"]
train_df = pd.read_csv("../HousingData/housing_train.txt", delim_whitespace=True, names = word_labels, header=None) 
test_df = pd.read_csv("../HousingData/housing_test.txt", delim_whitespace=True, names = word_labels, header=None) 


In [3]:
# df.head()

In [4]:
# df['label'].replace(0, 'non-spam',inplace=True)
# df['label'].replace(1, 'spam',inplace=True)
# df.head()

# Train-Test-Split

***

In [5]:
data = train_df.values
data[:5]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00, 2.4000e+01],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00, 2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00, 3.4700e+01],
       [3.2370e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        6.9980e+00, 4.5800e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9463e+02, 2.9400e+00, 3.3400e+01],
       [6.9050e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        7.1470e+00, 5.4200e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9690e+02, 5.3300e+00, 3.6200e+01]])

### Evaluate data in each bucket

### Classify

In [6]:
def data_classifier(data):
    
    label_column = data[:, -1]
    mean_data = np.mean(label_column)
    classification = mean_data
    
    return classification

### Potential splits

In [7]:
def get_data_partitions(data):
    
    pot_data_partitions = {}
    _, num_columns = data.shape
    for column_index in range(num_columns - 1):        # excluding the last column which is the label
        pot_data_partitions[column_index] = []
        values = data[:, column_index]
        unique_values = np.unique(values)

        for index in range(len(unique_values)):
            if index != 0:
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                pot_data_partition = (current_value + previous_value) / 2
                
                pot_data_partitions[column_index].append(pot_data_partition)
    
    return pot_data_partitions

### Partition the data

In [8]:
def partition_data(data, split_feature, threshold):
    
    split_feature_values = data[:, split_feature]

    data_left = data[split_feature_values <= threshold]
    data_right = data[split_feature_values >  threshold]
    
    return data_left, data_right

In [9]:
def loss(y, y_pred):
    return 0.5 * np.power((y - y_pred), 2)

def gradient(y, y_pred):
    return -(y - y_pred)

### Lowest total variance

In [10]:
def calculate_variance(data):
    
    label_column = data[:, -1]
    variance = np.var(label_column)
    variance = variance**2/len(data) 
    return variance

In [11]:
def calculate_total_variance(data_left, data_right):
    
    n = len(data_left) + len(data_right)
    p_data_left = len(data_left) / n
    p_data_right = len(data_right) / n

    total_variance =  (p_data_left * calculate_variance(data_left) 
                      + p_data_right * calculate_variance(data_right))
    
    return total_variance

In [12]:
def determine_best_split(data, pot_data_partitions):
    
    total_variance = np.inf
    best_split_column = 0
    best_split_value = 0.000
    for column_index in pot_data_partitions:
        for value in pot_data_partitions[column_index]:
            data_left, data_right = partition_data(data, split_feature=column_index, threshold=value)
            current_total_variance = calculate_total_variance(data_left, data_right)

            if current_total_variance <= total_variance:
                total_variance = current_total_variance
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

## Decision Tree Algorithm

In [13]:
def decision_tree_algorithm(df, counter=0, min_samples=20, max_depth=2):
    
    global bool_var
    if counter == 0:
        global feature_names
        feature_names = df.columns
        data = df.values
    else:
        data = df
        
    label_column = data[:, -1]
    variance = np.var(label_column)

    if variance == 0:
        bool_var = True
    else:
        bool_var = False
    

    if (bool_var) or (len(data) < min_samples) or (counter == max_depth):
        classification = data_classifier(data)
        
        return classification

    

    else:    
        counter += 1


        pot_data_partitions = get_data_partitions(data)
        split_feature, threshold = determine_best_split(data, pot_data_partitions)
        data_left, data_right = partition_data(data, split_feature, threshold)
        

        feature_name = feature_names[split_feature]
        deciding_factor = "{} <= {}".format(feature_name, threshold)
        sub_tree = {deciding_factor: []}
        

        ans_true = decision_tree_algorithm(data_left, counter, min_samples, max_depth)
        ans_false = decision_tree_algorithm(data_right, counter, min_samples, max_depth)
        

        if ans_true == ans_false:
            sub_tree = ans_true
        else:
            sub_tree[deciding_factor].append(ans_true)
            sub_tree[deciding_factor].append(ans_false)
        
        return sub_tree

In [14]:
tree = decision_tree_algorithm(train_df)
pprint(tree)

{'RM <= 7.436999999999999': [{'LSTAT <= 15.0': [24.615808823529413,
                                                14.362878787878788]},
                             {'B <= 361.925': [21.9, 46.00714285714286]}]}


In [15]:
# d_tree = decision_tree_algorithm(d_train_df)
# pprint(tree)

## Classification

In [16]:

example = test_df.iloc[0]
example

CRIM         0.84054
ZN           0.00000
INDUS        8.14000
CHAS         0.00000
NOX          0.53800
RM           5.59900
AGE         85.70000
DIS          4.45460
RAD          4.00000
TAX        307.00000
PTRATIO     21.00000
B          303.42000
LSTAT       16.51000
label       13.90000
Name: 0, dtype: float64

In [17]:
def classify_example(example, tree):
    deciding_factor = list(tree.keys())[0]
    feature_name, comparison_operator, value = deciding_factor.split(" ")

    # deciding_factor condition
    if example[feature_name] <= float(value):
        result = tree[deciding_factor][0]
    else:
        result = tree[deciding_factor][1]

    # if result is not dictionary we have reached final step and return the result
    if not isinstance(result, dict):
        return result
    
    # the subtree needs to be divided further
    else:
        return classify_example(example, result)

In [18]:
classify_example(example, tree)

14.362878787878788

In [19]:
n_estimators=100
learning_rate=.5

# trees = []
# for _ in range(n_estimators):
#     tree = decision_tree_algorithm(train_df)
#     trees.append(tree)

In [20]:
xi = test_df

In [21]:
xi.iloc[0]["label"]

13.9

In [22]:
test_df.iloc[:, -1]

0     13.9
1     16.6
2     14.8
3     18.4
4     21.0
5     16.6
6     14.4
7     19.4
8     20.0
9     20.8
10    21.2
11    20.3
12    28.0
13    27.5
14    26.5
15    18.6
16    19.3
17    23.0
18    18.4
19    15.6
20    18.1
21    17.4
22    15.6
23    14.6
24    23.8
25    23.8
26    22.3
27    17.4
28    29.8
29    34.9
      ... 
44    20.1
45    23.2
46    18.2
47    20.6
48    17.8
49    21.7
50    22.7
51    22.6
52    13.8
53    15.0
54    27.5
55    15.0
56    17.2
57    17.9
58    16.3
59    10.2
60    10.9
61    11.0
62    11.8
63    14.9
64    12.6
65    19.9
66    19.0
67    19.1
68    19.1
69    13.6
70    20.1
71    21.8
72    24.5
73    23.1
Name: label, Length: 74, dtype: float64

In [23]:
predf = []
for i in range(len(xi)):
    predf.append(0)
for i in range(n_estimators): # like n_estimators
    new_y = []
    for i in range(len(xi)):
        example = xi.iloc[i]
        pred_i = classify_example(example, tree) 
        new_y.append(pred_i)
    xi = xi.drop(['label'], axis = 1)
    xi['label'] = new_y
    print(xi)
    
#      predf = predf + predi  # final prediction will be previous prediction value + new prediction of residual
    
#     ei = y - predf  # needed originl y here as residual always from original y    
#     yi = ei # update yi as resid
        
#     prediction = xi.apply(classify_example, axis=1, args=(tree,))
#     predf = predf + prediction  # final prediction will be previous prediction value + new prediction of residual
#     ei = xi["label"] - predf  # needed originl y here as residual always from original y 
#     yi = ei.to_frame()
#     yi.columns = ['label']
#     xi = xi.iloc[:, :-1]
#     xi = xi.join(yi)
#     xi["label1"] = (xi["label"] - xi["prediction"])
#     yi = ei # update yi as residual to reloop
#     xi.drop(xi["label"])
#     xi = xi.append(xi["label1"], axis=1)

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

[74 rows x 14 columns]
        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6 

[74 rows x 14 columns]
        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6 

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

[74 rows x 14 columns]
        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6 

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

        CRIM    ZN  INDUS  CHAS    NOX     RM    AGE      DIS  RAD    TAX  \
0    0.84054   0.0   8.14     0  0.538  5.599   85.7   4.4546    4  307.0   
1    0.67191   0.0   8.14     0  0.538  5.813   90.3   4.6820    4  307.0   
2    0.95577   0.0   8.14     0  0.538  6.047   88.8   4.4534    4  307.0   
3    0.77299   0.0   8.14     0  0.538  6.495   94.4   4.4547    4  307.0   
4    1.00245   0.0   8.14     0  0.538  6.674   87.3   4.2390    4  307.0   
5    0.22927   0.0   6.91     0  0.448  6.030   85.5   5.6894    3  233.0   
6    0.25387   0.0   6.91     0  0.448  5.399   95.3   5.8700    3  233.0   
7    0.21977   0.0   6.91     0  0.448  5.602   62.0   6.0877    3  233.0   
8    0.10153   0.0  12.83     0  0.437  6.279   74.5   4.0522    5  398.0   
9    0.08707   0.0  12.83     0  0.437  6.140   45.8   4.0905    5  398.0   
10   0.05646   0.0  12.83     0  0.437  6.232   53.7   5.0141    5  398.0   
11   0.08387   0.0  12.83     0  0.437  5.874   36.6   4.5026    5  398.0   

In [24]:
y_predict = xi.iloc[:, -1]

In [25]:
test_df.iloc[:, -1]

0     13.9
1     16.6
2     14.8
3     18.4
4     21.0
5     16.6
6     14.4
7     19.4
8     20.0
9     20.8
10    21.2
11    20.3
12    28.0
13    27.5
14    26.5
15    18.6
16    19.3
17    23.0
18    18.4
19    15.6
20    18.1
21    17.4
22    15.6
23    14.6
24    23.8
25    23.8
26    22.3
27    17.4
28    29.8
29    34.9
      ... 
44    20.1
45    23.2
46    18.2
47    20.6
48    17.8
49    21.7
50    22.7
51    22.6
52    13.8
53    15.0
54    27.5
55    15.0
56    17.2
57    17.9
58    16.3
59    10.2
60    10.9
61    11.0
62    11.8
63    14.9
64    12.6
65    19.9
66    19.0
67    19.1
68    19.1
69    13.6
70    20.1
71    21.8
72    24.5
73    23.1
Name: label, Length: 74, dtype: float64

In [26]:
y_predict

0     14.362879
1     24.615809
2     14.362879
3     24.615809
4     24.615809
5     14.362879
6     14.362879
7     14.362879
8     24.615809
9     24.615809
10    24.615809
11    24.615809
12    24.615809
13    24.615809
14    24.615809
15    24.615809
16    24.615809
17    24.615809
18    14.362879
19    14.362879
20    14.362879
21    14.362879
22    14.362879
23    14.362879
24    24.615809
25    24.615809
26    24.615809
27    24.615809
28    24.615809
29    24.615809
        ...    
44    24.615809
45    24.615809
46    24.615809
47    24.615809
48    14.362879
49    24.615809
50    24.615809
51    24.615809
52    14.362879
53    24.615809
54    14.362879
55    24.615809
56    14.362879
57    14.362879
58    14.362879
59    14.362879
60    24.615809
61    14.362879
62    14.362879
63    14.362879
64    14.362879
65    24.615809
66    14.362879
67    14.362879
68    14.362879
69    14.362879
70    24.615809
71    24.615809
72    24.615809
73    14.362879
Name: label, Length: 74,

In [27]:
mse = np.mean((y_predict - test_df["label"])**2)
mse

27.02860716067321

## Calculate mse for test data

In [28]:
# def calculate_mse(df, tree):

#     df["prediction"] = df.apply(classify_example, axis=1, args=(tree,))
    
#     mse = np.mean((df["label"] - df["prediction"])**2)
    
#     return mse

In [29]:
# mse = calculate_mse(test_df, tree)
# mse

In [30]:
# mse = calculate_mse(train_df, tree)
# mse

In [31]:
# mse = calculate_mse(d_test_df, d_tree)
# mse

In [32]:
# mse = calculate_mse(d_train_df, d_tree)
# mse

In [33]:
# test_df