In [5]:
import os
import sys
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score, classification_report

Preprocessing class: constructor parameters and feature-engineering methods

In [6]:
class random_forest_preprocessing:
    """Turn a per-stock price CSV into technical-indicator features and a label.

    The constructor reads
    ``'../data/processed/stocks/nse_scraped/<stock_symbol>.csv'`` into
    ``self.price_data``. The indicator methods then add columns to that frame.
    ``RSI`` needs the ``change_in_price`` column, so
    ``values_sort_price_change_calculation`` must run first.

    Parameters
    ----------
    stock_symbol : str
        File stem of the CSV to load.
    days_out : int
        EWM span used by ``grouping_signal_flag`` and diff lag used by
        ``signal_flag``.
    n : int
        EWM span for ``RSI`` and rolling window for ``Stochastic_Oscillator``
        and ``Williams``.
    w : int
        Number of periods used by ``rate_of_change``.
    """

    def __init__(self,stock_symbol,days_out,n,w):
        self.stock_symbol = stock_symbol
        self.days_out = days_out
        self.n = n
        self.w = w
        self.price_data = pd.read_csv('../data/processed/stocks/nse_scraped/'+self.stock_symbol+".csv")

    def values_sort_price_change_calculation(self):
        """Keep the price/volume columns, sort them and add ``change_in_price``."""
        wanted = ['Date','Symbol','Prev Close','Open',
                  'High','Low','Last','Close','VWAP','Volume']

        # sort_values returns a fresh frame, so later column assignments do not
        # write into a slice of the frame that was read from disk.
        self.price_data = self.price_data[wanted].sort_values(by = ['Symbol','Date'])

        # Row-to-row change of the closing price (still spans symbol boundaries;
        # row_symbol_change blanks those rows).
        self.price_data['change_in_price'] = self.price_data['Close'].diff()

    def row_symbol_change(self):
        """Set ``change_in_price`` to NaN on the first row of every symbol."""
        symbols = self.price_data['Symbol']
        starts_new_symbol = symbols != symbols.shift(1)
        self.price_data.loc[starts_new_symbol, 'change_in_price'] = np.nan

    def grouping_signal_flag(self):
        """Return a frame of per-symbol EWM-smoothed prices and volume.

        Returns
        -------
        pandas.DataFrame
            ``Symbol`` and ``Date`` from ``self.price_data`` next to the
            smoothed ``Close``, ``Low``, ``High``, ``Open`` and ``Volume``
            (exponentially weighted mean with ``span = self.days_out``).
        """
        price_data_smoothed = self.price_data.groupby(['Symbol'])[['Close','Low','High','Open','Volume']].transform(lambda x: x.ewm(span = self.days_out).mean())

        # Put the identifying columns back next to the smoothed values.
        smoothed_df = pd.concat([self.price_data[['Symbol','Date']], price_data_smoothed], axis=1, sort=False)
        return smoothed_df

    def signal_flag(self, smoothed_df):
        """Add ``Signal_Flag`` to ``smoothed_df`` and return it.

        The flag is the sign of the per-symbol change in ``Close`` over
        ``self.days_out`` rows. ``smoothed_df`` is modified in place.
        """
        smoothed_df['Signal_Flag'] = smoothed_df.groupby('Symbol')['Close'].transform(lambda x : np.sign(x.diff(self.days_out)))
        return smoothed_df

    def RSI(self):
        """Add ``down_days``, ``up_days`` and ``RSI`` to ``self.price_data``."""
        changes = self.price_data[['Symbol','change_in_price']]
        up_df, down_df = changes.copy(), changes.copy()

        # Up days keep only gains. Use a single .loc[mask, column] assignment:
        # a bare .loc['change_in_price'] = 0 would append a row with that label.
        up_df.loc[up_df['change_in_price'] < 0, 'change_in_price'] = 0

        # Down days keep only losses, stored as positive magnitudes.
        down_df.loc[down_df['change_in_price'] > 0, 'change_in_price'] = 0
        down_df['change_in_price'] = down_df['change_in_price'].abs()

        # Exponentially weighted averages per symbol.
        ewma_up = up_df.groupby('Symbol')['change_in_price'].transform(lambda x: x.ewm(span = self.n).mean())
        ewma_down = down_df.groupby('Symbol')['change_in_price'].transform(lambda x: x.ewm(span = self.n).mean())

        relative_strength = ewma_up / ewma_down
        relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))

        self.price_data['down_days'] = down_df['change_in_price']
        self.price_data['up_days'] = up_df['change_in_price']
        self.price_data['RSI'] = relative_strength_index

    def _rolling_low_high(self):
        """Return the per-symbol rolling min of ``Low`` and max of ``High`` over ``self.n`` rows."""
        by_symbol = self.price_data.groupby('Symbol')
        rolling_low = by_symbol['Low'].transform(lambda x: x.rolling(window = self.n).min())
        rolling_high = by_symbol['High'].transform(lambda x: x.rolling(window = self.n).max())
        return rolling_low, rolling_high

    def Stochastic_Oscillator(self):
        """Add ``low_14``, ``high_14`` and ``k_percent`` to ``self.price_data``.

        The column names are fixed; the actual window length is ``self.n``.
        """
        low_14, high_14 = self._rolling_low_high()

        k_percent = 100 * ((self.price_data['Close'] - low_14) / (high_14 - low_14))

        self.price_data['low_14'] = low_14
        self.price_data['high_14'] = high_14
        self.price_data['k_percent'] = k_percent

    def Williams (self):
        """Add the Williams %R column ``r_percent`` to ``self.price_data``."""
        low_14, high_14 = self._rolling_low_high()

        r_percent = ((high_14 - self.price_data['Close']) / (high_14 - low_14)) * - 100

        self.price_data['r_percent'] = r_percent

    def MACD(self):
        """Add ``MACD`` and its span-9 EMA ``MACD_EMA`` to ``self.price_data``."""
        closes = self.price_data.groupby('Symbol')['Close']
        ema_26 = closes.transform(lambda x: x.ewm(span = 26).mean())
        ema_12 = closes.transform(lambda x: x.ewm(span = 12).mean())
        macd = ema_12 - ema_26

        # Smooth per symbol so one symbol's MACD history does not leak into the
        # first rows of the next symbol.
        ema_9_macd = macd.groupby(self.price_data['Symbol']).transform(lambda x: x.ewm(span = 9).mean())

        self.price_data['MACD'] = macd
        self.price_data['MACD_EMA'] = ema_9_macd

    def rate_of_change(self):
        """Add ``Price_Rate_Of_Change``: per-symbol pct change of ``Close`` over ``self.w`` rows."""
        self.price_data['Price_Rate_Of_Change'] = self.price_data.groupby('Symbol')['Close'].transform(lambda x: x.pct_change(periods = self.w))

    def obv(self, group):
        """Return the On Balance Volume series for one symbol's rows.

        Parameters
        ----------
        group : pandas.DataFrame
            Rows with ``Volume`` and ``Close`` columns.

        Returns
        -------
        pandas.Series
            Running OBV, indexed like ``group``. The running total starts at 0
            and is left unchanged on rows where the close did not move or the
            change is NaN (the first row).
        """
        volume = group['Volume']
        change = group['Close'].diff()

        running_total = 0
        obv_values = []

        for delta, traded in zip(change, volume):
            if delta > 0:
                running_total = running_total + traded
            elif delta < 0:
                running_total = running_total - traded
            # NaN or zero change: both comparisons are False, total carries over.
            obv_values.append(running_total)

        return pd.Series(obv_values, index = group.index)

    def apply_obv_to_groups(self):
        """Add the ``On Balance Volume`` column, computed separately per symbol."""
        # Concatenating the per-symbol series keeps the original row index, so
        # the assignment aligns on it regardless of how many symbols there are.
        per_symbol = [self.obv(group) for _, group in self.price_data.groupby('Symbol')]

        if per_symbol:
            self.price_data['On Balance Volume'] = pd.concat(per_symbol)
        else:
            self.price_data['On Balance Volume'] = pd.Series(index = self.price_data.index, dtype = 'float64')

    def close_group(self):
        """Add ``Prediction``: -1.0 for a down close, 1.0 for an up or flat close."""
        close_groups = self.price_data.groupby('Symbol')['Close']
        self.price_data['Prediction'] = close_groups.transform(lambda x : np.sign(x.diff()))

        # Treat "no change" as an up day. Restrict the write to the Prediction
        # column; without the column selector every column of the row is set to 1.0.
        self.price_data.loc[self.price_data['Prediction'] == 0.0, 'Prediction'] = 1.0

    def close_group_nan_remove(self):
        """Drop every row containing a NaN and print the shape before and after."""
        print('Before NaN Drop we have {} rows and {} columns'.format(self.price_data.shape[0], self.price_data.shape[1]))

        self.price_data = self.price_data.dropna()

        print('After NaN Drop we have {} rows and {} columns'.format(self.price_data.shape[0], self.price_data.shape[1]))

In [7]:
# Load the ADANIPORTS CSV with days_out=30 (smoothing span / signal lag),
# n=14 (RSI span and rolling window) and w=9 (rate-of-change period).
obj_rand = random_forest_preprocessing("ADANIPORTS", 30, 14, 9)

In [8]:

# Keep the price/volume columns, sort by Symbol then Date, and add
# 'change_in_price' (row-to-row diff of Close). Must run before RSI().
obj_rand.values_sort_price_change_calculation()

Calling the class methods to build the feature columns

In [10]:
# Set 'change_in_price' to NaN on rows where the symbol differs from the previous row.
obj_rand.row_symbol_change()

In [11]:
# Add the 'down_days', 'up_days' and 'RSI' columns (EWM span n).
obj_rand.RSI()

In [12]:
# Add 'low_14', 'high_14' (rolling min/max over n rows) and 'k_percent'.
obj_rand.Stochastic_Oscillator()

In [13]:
# Add the Williams %R column 'r_percent' (same rolling window n).
obj_rand.Williams ()

In [14]:
# Add 'MACD' (span-12 EMA minus span-26 EMA of Close) and 'MACD_EMA' (span-9 EMA of MACD).
obj_rand.MACD()

In [15]:
# Add 'Price_Rate_Of_Change': per-symbol percentage change of Close over w rows.
obj_rand.rate_of_change()

In [16]:
# Add the per-symbol 'On Balance Volume' column.
obj_rand.apply_obv_to_groups()

In [17]:
# Add the 'Prediction' label from the sign of the per-symbol day-over-day Close change.
obj_rand.close_group()


In [18]:
# Drop every row that still contains a NaN and print the shape before and after.
obj_rand.close_group_nan_remove()

Before NaN Drop we have 3552 rows and 23 columns
After NaN Drop we have 3526 rows and 23 columns


In [19]:
# Display the engineered feature frame; the recorded output shows only its first and last rows.
obj_rand.price_data

Unnamed: 0,Date,Symbol,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,...,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume,Prediction
1036,2012-02-06,ADANIPORTS,150.25,152.85,152.85,146.30,148.15,148.10,150.34,1518103,...,53.951317,135.0,157.6,57.964602,-42.035398,0.324604,0.115609,0.009199,2722959,-1.0
1037,2012-02-07,ADANIPORTS,148.10,149.15,149.45,137.85,137.85,139.15,142.49,1735290,...,39.886073,137.0,157.6,10.436893,-89.563107,-0.184165,0.053468,-0.072642,987669,-1.0
1038,2012-02-08,ADANIPORTS,139.15,139.00,143.30,136.60,140.00,140.30,140.87,1532614,...,42.123109,136.6,157.6,17.619048,-82.380952,-0.498153,-0.060051,-0.020251,2520283,1.0
1039,2012-02-09,ADANIPORTS,140.30,139.65,141.80,135.20,138.00,137.95,138.09,1907732,...,38.725216,135.2,157.6,12.276786,-87.723214,-0.876860,-0.227176,-0.062203,612551,-1.0
1040,2012-02-10,ADANIPORTS,137.95,138.15,144.15,137.20,142.95,142.90,140.28,1780060,...,48.769195,135.2,155.4,38.118812,-61.881188,-0.844026,-0.352810,0.032514,2392611,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,2012-01-10,MUNDRAPORT,132.20,133.85,137.00,132.85,136.30,136.45,135.67,869679,...,73.223117,111.0,137.0,97.884615,-2.115385,-0.298741,-2.412779,0.098631,75003139,1.0
1019,2012-01-11,MUNDRAPORT,136.45,136.00,140.70,134.75,135.75,137.05,137.28,1810746,...,73.976523,111.0,140.7,87.710438,-12.289562,0.514220,-1.827379,0.141608,76813885,1.0
1020,2012-01-12,MUNDRAPORT,137.05,137.50,137.50,131.20,131.60,132.65,133.23,1748799,...,59.751127,111.0,140.7,72.895623,-27.104377,0.794298,-1.303044,0.156495,75065086,-1.0
1021,2012-01-13,MUNDRAPORT,132.65,132.50,133.00,129.55,131.80,131.55,131.93,1099713,...,56.610918,111.0,140.7,69.191919,-30.808081,0.916931,-0.859049,0.093516,73965373,-1.0


In [None]:
# Smooth prices per symbol, then flag the direction of the smoothed Close over
# `days_out` rows. signal_flag() requires the smoothed frame as its argument.
smoothed_df = obj_rand.grouping_signal_flag()
smoothed_df = obj_rand.signal_flag(smoothed_df)
smoothed_df.head()

In [21]:
from joblib import dump, load

In [22]:
class RandomForest:
    """Random-forest classifier tuned with a randomized hyper-parameter search.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the feature columns ``RSI``, ``k_percent``, ``r_percent``,
        ``Price_Rate_Of_Change``, ``MACD``, ``On Balance Volume`` and the label
        column ``Prediction``.
    stock_name : str
        File name used when the fitted search is saved under
        ``'../models/rf_models/'``.
    """

    def __init__(self, input_df, stock_name: str):
        self.input_df = input_df
        self.X = input_df[['RSI','k_percent','r_percent',
                     'Price_Rate_Of_Change','MACD',
                     'On Balance Volume']]
        self.Y = input_df['Prediction']
        self.name = stock_name

        # Candidate values sampled by RandomizedSearchCV.
        self.n_estimators = list(range(200, 2000, 200))
        # 'auto' was dropped: for classifiers it is an alias of 'sqrt', it is
        # deprecated in scikit-learn 1.1 and rejected from 1.3 onwards.
        self.max_features = ['sqrt', None, 'log2']
        self.max_depth = list(range(10, 110, 10))
        self.max_depth.append(None)
        self.min_samples_split = [2, 5, 10, 20, 30, 40]
        self.min_samples_leaf = [1, 2, 7, 12, 14, 16 ,20]
        self.bootstrap = [True, False]
        self.random_grid = {'n_estimators': self.n_estimators,
                       'max_features': self.max_features,
                       'max_depth': self.max_depth,
                       'min_samples_split': self.min_samples_split,
                       'min_samples_leaf': self.min_samples_leaf,
                       'bootstrap': self.bootstrap}

    def train_model(self):
        """Fit the randomized search on a train split and save it to disk.

        The held-out split is kept on ``self.X_test`` / ``self.y_test`` so the
        model can be evaluated afterwards.

        Returns
        -------
        The fitted ``RandomizedSearchCV`` object, which is also written to
        ``'../models/rf_models/' + self.name`` with ``joblib.dump``.
        """
        # The features and labels live on self.X / self.Y (set in __init__).
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.Y, random_state = 0)
        self.X_test, self.y_test = X_test, y_test

        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator = rf, param_distributions = self.random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
        rf_random.fit(X_train, y_train)
        dump(rf_random, '../models/rf_models/' + self.name)
        return rf_random

