In [1]:
class RandomForest():
    def __init__(self, x, y, n_trees, n_features, sample_sz, depth=10, min_leaf=5):
        np.random.seed(12)
        if n_features == 'sqrt':
            self.n_features = int(np.sqrt(x.shape[1]))
        elif n_features == 'log2':
            self.n_features = int(np.log2(x.shape[1]))
        else:
            self.n_features = n_features
#         print(self.n_features, "sha: ",x.shape[1])    
        self.x, self.y, self.sample_sz, self.depth, self.min_leaf  = x, y, sample_sz, depth, min_leaf
        self.trees = [self.create_tree() for i in range(n_trees)]

    def create_tree(self):
        idxs = np.random.permutation(len(self.y))[:self.sample_sz]
        f_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        return DecisionTree(self.x.iloc[idxs], self.y[idxs], self.n_features, f_idxs,
                    idxs=np.array(range(self.sample_sz)),depth = self.depth, min_leaf=self.min_leaf)
        
    def predict(self, x):
        return np.mean([t.predict(x) for t in self.trees], axis=0)

def std_agg(cnt, s1, s2): return math.sqrt((s2/cnt) - (s1/cnt)**2)

class DecisionTree():
    def __init__(self, x, y, n_features, f_idxs,idxs,depth=10, min_leaf=5):
        self.x, self.y, self.idxs, self.min_leaf, self.f_idxs = x, y, idxs, min_leaf, f_idxs
        self.depth = depth
#         print(f_idxs)
#         print(self.depth)
        self.n_features = n_features
        self.n, self.c = len(idxs), x.shape[1]
        self.val = np.mean(y[idxs])
        self.score = float('inf')
        self.find_varsplit()
        
    def find_varsplit(self):
        for i in self.f_idxs: self.find_better_split(i)
        if self.is_leaf: return
        x = self.split_col
        lhs = np.nonzero(x<=self.split)[0]
        rhs = np.nonzero(x>self.split)[0]
        lf_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        rf_idxs = np.random.permutation(self.x.shape[1])[:self.n_features]
        self.lhs = DecisionTree(self.x, self.y, self.n_features, lf_idxs, self.idxs[lhs], depth=self.depth-1, min_leaf=self.min_leaf)
        self.rhs = DecisionTree(self.x, self.y, self.n_features, rf_idxs, self.idxs[rhs], depth=self.depth-1, min_leaf=self.min_leaf)

    def find_better_split(self, var_idx):
        x, y = self.x.values[self.idxs,var_idx], self.y[self.idxs]
        sort_idx = np.argsort(x)
        sort_y,sort_x = y[sort_idx], x[sort_idx]
        rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()
        lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0.

        for i in range(0,self.n-self.min_leaf-1):
            xi,yi = sort_x[i],sort_y[i]
            lhs_cnt += 1; rhs_cnt -= 1
            lhs_sum += yi; rhs_sum -= yi
            lhs_sum2 += yi**2; rhs_sum2 -= yi**2
            if i<self.min_leaf or xi==sort_x[i+1]:
                continue

            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score<self.score: 
                self.var_idx,self.score,self.split = var_idx,curr_score,xi

    @property
    def split_name(self): return self.x.columns[self.var_idx]
    
    @property
    def split_col(self): return self.x.values[self.idxs,self.var_idx]

    @property
    def is_leaf(self): return self.score == float('inf') or self.depth <= 0 
    

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf: return self.val
        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs
        return t.predict_row(xi)

In [10]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from pandas_datareader import DataReader
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, kpss, acf, grangercausalitytests
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf,month_plot,quarter_plot
from pandas.plotting import lag_plot
from statsmodels.tsa.holtwinters import SimpleExpSmoothing, ExponentialSmoothing
import scipy.optimize as opt
import math

from math import sqrt
from sklearn.metrics import mean_squared_error

import yfinance as yf

In [11]:
start_date = datetime(2021,1,1)
end_date = datetime(2022,1,1)
NVR = yf.download('NVR',start_date ,end_date)
Y = NVR['Adj Close'].values
NVR["Date"] = NVR.index
NVR = NVR[["Date", "Open", "High", 
             "Low", "Close", "Adj Close", "Volume"]]
NVR.reset_index(drop=True, inplace=True)
print(NVR.head())


[*********************100%***********************]  1 of 1 completed
        Date         Open         High          Low        Close    Adj Close  \
0 2021-01-04  4089.979980  4106.790039  3988.000000  4040.649902  4040.649902   
1 2021-01-05  4025.199951  4049.989990  3965.000000  4008.909912  4008.909912   
2 2021-01-06  3970.000000  4021.489990  3892.000000  3966.489990  3966.489990   
3 2021-01-07  3997.479980  4080.729980  3977.030029  4025.929932  4025.929932   
4 2021-01-08  4040.689941  4077.000000  3892.669922  3930.000000  3930.000000   

   Volume  
0   23700  
1   26800  
2   29400  
3   29600  
4   44600  


In [12]:
X = NVR
print(X)


          Date         Open         High          Low        Close  \
0   2021-01-04  4089.979980  4106.790039  3988.000000  4040.649902   
1   2021-01-05  4025.199951  4049.989990  3965.000000  4008.909912   
2   2021-01-06  3970.000000  4021.489990  3892.000000  3966.489990   
3   2021-01-07  3997.479980  4080.729980  3977.030029  4025.929932   
4   2021-01-08  4040.689941  4077.000000  3892.669922  3930.000000   
..         ...          ...          ...          ...          ...   
247 2021-12-27  5817.000000  5830.970215  5763.479980  5820.069824   
248 2021-12-28  5820.500000  5893.500000  5820.500000  5869.470215   
249 2021-12-29  5880.479980  5962.430176  5861.790039  5959.330078   
250 2021-12-30  5977.609863  5978.160156  5884.060059  5906.029785   
251 2021-12-31  5884.100098  5917.700195  5850.500000  5908.870117   

       Adj Close  Volume  
0    4040.649902   23700  
1    4008.909912   26800  
2    3966.489990   29400  
3    4025.929932   29600  
4    3930.000000   44600

In [13]:
X['Stock_Price'] = (2*NVR.High + NVR.Low + NVR.Close)/4
X['Stock_Price']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Stock_Price'] = (2*NVR.High + NVR.Low + NVR.Close)/4


0      4060.557495
1      4018.472473
2      3975.367493
3      4041.104980
4      3994.167480
          ...     
247    5811.372559
248    5869.242554
249    5936.495117
250    5936.602539
251    5898.692627
Name: Stock_Price, Length: 252, dtype: float64

In [14]:
from sklearn.preprocessing import StandardScaler
col_names = ['SP500']
features = X[col_names]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)


KeyError: "None of [Index(['SP500'], dtype='object')] are in the [columns]"