In [1]:
%matplotlib inline

from collections import namedtuple

import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize
from sklearn.model_selection import train_test_split

In [2]:
Leaf = namedtuple('Leaf', ('value'))
Node = namedtuple('Node', ('feature', 'value', 'left', 'right',))

class BaseDecisionTree:
    def __init__(self, x, y, max_depth=np.inf):
        self.x = np.atleast_2d(x)
        self.y = np.atleast_1d(y)
        self.max_depth = max_depth
        
        self.root = self.build_tree(self.x, self.y)
    
    # Will fail in case of depth ~ 1000 because of limit of recursion calls
    def build_tree(self, x, y, depth=1):
        ### recursive fnction
        if (depth >= self.max_depth) | (self.criteria(y) < 1e-8):
            return Leaf(self.leaf_value(y))
        feature, value = self.find_best_split(x, y)
        left, right = self.partition(x, y, feature, value)
        left = self.build_tree(*left, depth = depth + 1)
        right = self.build_tree(*right, depth = depth + 1)
        return Node(feature, value, left, right)
    
    def partition(self, x, y, feature, value):
        ### your code here
        i = x[:, feature] >= value
        return [x[~i], y[~i]], [x[i], y[i]]
    
    def _impurity_partition(self, value, feature, x, y):
        ### call partition
        (_, left), (_, right) = self.partition(x, y, feature, value)
        return self.impurity(left, right)
    
    def find_best_split(self, x, y):
        best_feature, best_value, best_impurity = 0, x[0,0], np.inf
        ### use optimize.minimize_scalar to find the best split
        for feature in range(x.shape[1]):
            if x.shape[0] > 2:
                res = optimize.minimize_scalar(
                    self._impurity_partition,
                    args=(feature, x, y),
                    bounds=(x[:, feature].min(), x[:, feature].max()),
                    method='bounded'
                )
                assert res.success
                value = res.x
                impurity = res.fun
            else:
                value = x[:, feature].max()
                impurity = self._impurity_partition(value, feature, x, y)
            if (impurity < best_impurity):
                best_value, best_feature, best_feature = value, feature, impurity
        return best_feature, best_value
        
    def _predict_one(self, x):
        ### your code here
        node = self.root
        while not isinstance(node, Leaf):
            if x[node.feature] > node.value:
                node = node.right
            else:
                node = node.left
        return node.value

    def predict(self, x):
        ### call _predict_one for each x
        x = np.atleast_2d(x)
        y_pred = np.empty(x.shape[0], dtype=self.y.dtype) 
        for i, row in enumerate(x):
            y_pred[i] = self._predict_one(row)
        return y_pred

    def impurity(self, left, right):
        ### will be implemented in a subclass        
        left_impurity = self.criteria(left)
        right_impurity = self.criteria(right)
        return  np.sqrt(left_impurity**2 + right_impurity**2)
    
    def criteria(self, y):
        ### will be implemented in a subclass
        return np.mean((np.mean(y) - y)**2)
    
    def leaf_value(self, y):
        ### will be implemented in a subclass
        return np.mean(y)
        
class DecisionTreeClassifier(BaseDecisionTree):
    def __init__(self, x, y, *args, random_state=None, **kwargs):
        y = np.asarray(y, dtype=float)
        self.random_state = np.random.RandomState(random_state)
        self.classes = np.unique(y)
        super().__init__(x, y, *args, **kwargs)
        
    def criteria(self, y):
        """Gini"""
        ### your code here
        return np.mean((np.mean(y) - y)**2)
        
    def impurity(self, left, right):
        ### weighted l and r impurity
        left_impurity = self.criteria(left)
        right_impurity = self.criteria(right)
        
        return  np.sqrt(left_impurity**2 + right_impurity**2)
        
    def leaf_value(self, y):
        return np.mean(y)
"""
    def leaf_value(self, y):
        ### Calculate class counts
        class_counts = np.sum(y == self.classes.reshape(-1, 1), axis=1)
        m = np.max(class_counts)
        most_common  = self.classes(class_counts == 0)
        if most_common 
        return
"""

'\n    def leaf_value(self, y):\n        ### Calculate class counts\n        class_counts = np.sum(y == self.classes.reshape(-1, 1), axis=1)\n        m = np.max(class_counts)\n        most_common  = self.classes(class_counts == 0)\n        if most_common \n        return\n'

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("sdss_redshift.csv")
x = data[['u', 'g', 'r', 'i', 'z']]
y = data['redshift']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=np.random.randint(0, np.random.randint(50, 1000)))
y = y_train
x = X_train

In [5]:
y0 = y + 0
k = 0 
forest = []
while k < 3 :
    try:
        k1 = 0
        print(y0)
        garden = []
        while k1 < 3:
            try:
                X_train, _, y_train, _ = train_test_split(x, y0, test_size=0.2, random_state=np.random.randint(0, 50000))
                tree = DecisionTreeClassifier(X_train, y_train, random_state=42,max_depth = 50)
                print(np.mean(np.abs(y0 - tree.predict(x)))/np.mean(np.abs(y0)))
                if np.mean(np.abs(y0 - tree.predict(x)))/np.mean(np.abs(y0)) < 0.45:
                    k1 += 1
                    print(tree.predict(x))
                    garden.append(tree)
            except:
                pass
        y0 = y0 - np.mean([tree.predict(x) for tree in garden],axis=0)
        forest.append(garden)
        k += 1
    except:
        pass

1349    0.094337
116     0.034450
2857    0.170563
5083    0.044402
1282    0.184142
          ...   
3457    0.144626
5614    0.068819
1630    0.031128
4929    0.143737
5376    0.080931
Name: redshift, Length: 5290, dtype: float64


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


0.1850719361041709
[0.0943368 0.130609  0.170563  ... 0.0311284 0.143737  0.107514 ]
0.19648800285644283
[0.0550024 0.158743  0.170563  ... 0.0546352 0.0860627 0.080931 ]
0.19473881776886898
[0.0943368 0.130609  0.170563  ... 0.0311284 0.103549  0.080931 ]
1349    0.013111
116    -0.105538
2857    0.000000
5083   -0.020196
1282    0.032743
          ...   
3457    0.113686
5614   -0.025170
1630   -0.007836
4929    0.032621
5376   -0.008861
Name: redshift, Length: 5290, dtype: float64
0.7446985363337093
0.7384828538715219
0.74309004527927
0.756611322128381
0.756679714388736
0.7437162524056148
0.7083174150649981
0.7311589319068015
0.7259911356350224
0.7354028226435146
0.862532688166367
0.7303999669990877
0.8094278289440591
0.7452398863003703
0.8168121698304527
0.7348265452090732
0.8011279681364387
0.7325413419692383
0.7886629788794276
0.7549228721696891
0.879665173069271
0.7534531663823457
0.8902775103428932
0.7296925767120205
0.7488250000077871
0.7703791512646765
0.7246736173831362
0.76

In [None]:
y_pred = np.sum([np.mean([tree.predict(X_test) for tree in garden], axis=0) for garden in forest], axis=0)
y_pred

array([8.38244500e-02, 1.46610143e-01, 1.20361705e-01, 1.01081500e-01,
       1.22542137e-01, 1.43230299e-01, 8.16528991e-02, 4.61719813e-02,
       1.41105751e-01, 6.96342000e-02, 1.72604169e-01, 1.24008548e-01,
       1.27198925e-01, 1.46140627e-01, 1.02707935e-01, 1.61217463e-01,
       1.15229924e-01, 9.61531941e-02, 9.80978769e-02, 9.63809041e-02,
       1.39099230e-01, 1.05133747e-01, 1.08261483e-01, 1.55809067e-01,
       1.58758843e-01, 1.37471418e-01, 8.57123346e-02, 1.19025962e-01,
       1.28055180e-01, 9.65751508e-02, 8.59587267e-02, 2.95168250e-02,
       8.80412697e-02, 1.31763405e-01, 8.01260685e-02, 1.21432246e-01,
       1.41110255e-01, 1.44880512e-01, 1.32584919e-01, 1.30775518e-01,
       1.54127140e-01, 1.32662654e-01, 5.62273391e-02, 2.94564750e-02,
       1.54803044e-01, 1.28738804e-01, 1.05376764e-01, 1.18016305e-01,
       1.28697860e-01, 1.24362866e-01, 1.63564627e-01, 1.43745064e-01,
       1.40055143e-01, 1.26973670e-01, 9.80806047e-02, 2.28740683e-01,
      

In [None]:
accuracy = np.std(y_test - y_pred)/np.mean(y_test)
print("Accuracy:", accuracy)

Accuracy: 0.49182176999839466


In [None]:
plt.title("Prediction")
plt.plot(y_test, y_pred , '.', label = 'Data', alpha = 0.3)
plt.grid(True)
plt.legend()
plt.savefig("redhift.png")

NameError: name 'plt' is not defined

In [None]:
%history

%history
