# Random Forest Classifier

In [2]:
import os
import numpy
import pandas as pd

Scikit-learn imports

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Specify stock names

In [4]:
# Each entry is substituted verbatim into the CSV path built by
# get_dataset_file, so it has to match the file's base name exactly.
# To experiment with a single stock, shorten this list to one entry.
stocks = [
    'AMZN',
    'APPL',
    'FB',
    'MSFT',
    'TSLA',
]

Function to get the stock dataset file

In [5]:
def get_dataset_file(stock):
    """Return the relative path of the CSV dataset for ``stock``.

    The path is assembled by string formatting only; nothing is read from
    disk and the file's existence is not checked here.
    """
    return f'./prices_sentiment/{stock}.csv'

In [6]:
def feature_split(stock_df,remove_stock_volume,remove_volume,remove_close,random):
    """Separate a stock DataFrame into a feature frame and a target array.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Must contain the columns "stock_trend", "price_diff" and "Date",
        plus any of "Volume", "twitter_volume", "Adj Close" whose removal
        is requested. It is not modified.
    remove_stock_volume : bool
        Drop the "Volume" column from the features when truthy.
    remove_volume : bool
        Drop the "twitter_volume" column from the features when truthy.
    remove_close : bool
        Drop the "Adj Close" column from the features when truthy.
    random : bool
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    (X, y)
        ``X`` is a new DataFrame holding the remaining columns and ``y`` is
        the "stock_trend" column as an array of shape (n_rows, 1).
    """
    # Always excluded: the target itself, "price_diff" and "Date".
    # NOTE(review): "price_diff" is presumably removed because the target is
    # derived from it - confirm against the dataset builder.
    excluded = ["stock_trend", "price_diff", "Date"]

    optional_columns = (
        (remove_stock_volume, "Volume"),
        (remove_volume, "twitter_volume"),
        (remove_close, "Adj Close"),
    )
    for should_remove, column in optional_columns:
        if should_remove:
            excluded.append(column)

    # drop() returns a new frame, so the caller's DataFrame stays intact.
    X = stock_df.drop(columns=excluded)
    y = stock_df["stock_trend"].values.reshape(-1, 1)

    return X, y

In [7]:
def train_test_split(X, y):
    """Split ``X`` and ``y`` by position: the first 80% of rows for training, the rest for testing.

    No shuffling happens here; row order is taken as given. The cut index
    is ``int(0.8 * len(X))`` and the same index is applied to both inputs.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    train_fraction = 0.8
    cut = int(train_fraction * len(X))
    return X[:cut], X[cut:], y[:cut], y[cut:]

In [8]:
def scale_data(X_train,X_test):
    """Standardise the train and test features with one ``StandardScaler``.

    The scaler is fitted on ``X_train`` only, and that same fitted scaler
    transforms both sets, so no statistics are computed from ``X_test``.

    Returns
    -------
    (X_train_scaled, X_test_scaled)
        The transformed training and test features.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

In [9]:
def random_forest_predict(X_train_scaled,X_test_scaled,y_train,y_test,
                          n_estimators=500,random_state=80):
    """Fit a ``RandomForestClassifier`` on the training data and predict the test set.

    Parameters
    ----------
    X_train_scaled, X_test_scaled
        Training and test feature matrices (scaled or not; they are passed
        to the classifier unchanged).
    y_train
        Training targets; flattened with ``ravel()`` before fitting, so an
        (n, 1) column array is accepted.
    y_test
        Not used. Kept only so existing positional calls keep working.
    n_estimators : int, default 500
        Number of trees in the forest.
    random_state : int, default 80
        Seed passed to the classifier so repeated runs give the same model.

    Returns
    -------
    The predicted labels for ``X_test_scaled``.
    """
    classifier = RandomForestClassifier(random_state=random_state,
                                        n_estimators=n_estimators)
    classifier.fit(X_train_scaled, y_train.ravel())

    return classifier.predict(X_test_scaled)

In [10]:
def get_accuracy(y_test,predictions):
    """Return the result of ``accuracy_score(y_test, predictions)``."""
    return accuracy_score(y_test, predictions)

In [19]:
def predict(stock,remove_stock_volume,remove_volume,remove_close,random=True,scale=True):
    """Train and evaluate a random forest for one stock and report the results.

    Loads the stock's CSV, optionally shuffles the rows, builds features and
    target, splits 80/20 by position, optionally standardises the features,
    fits the classifier, then prints the accuracy and classification report
    and displays the confusion matrix.

    Parameters
    ----------
    stock : str
        Name used to locate the CSV via ``get_dataset_file``.
    remove_stock_volume, remove_volume, remove_close : bool
        Forwarded to ``feature_split`` to drop the "Volume",
        "twitter_volume" and "Adj Close" feature columns respectively.
    random : bool, default True
        Shuffle the rows (fixed seed 60) before splitting.
    scale : bool, default True
        Standardise the features with ``scale_data`` before fitting.

    Returns
    -------
    None. Results are printed / displayed.
    """
    stock_df = pd.read_csv(get_dataset_file(stock), skipinitialspace=True)

    if random:
        # Fixed seed keeps the shuffle, and therefore the split, reproducible.
        # NOTE(review): the split that follows is positional, so shuffling
        # mixes rows from all dates into both train and test - confirm this
        # is intended if the rows are time-ordered.
        stock_df = stock_df.sample(frac=1, random_state=60).reset_index(drop=True)

    X, y = feature_split(stock_df, remove_stock_volume, remove_volume, remove_close, random)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    if scale:
        X_train, X_test = scale_data(X_train, X_test)
    predictions = random_forest_predict(X_train, X_test, y_train, y_test)

    accuracy = get_accuracy(y_test, predictions)
    print("Accuracy:")
    print(f'{stock}: {accuracy}\n')

    print(f'Classification Report:\n{classification_report(y_test,predictions)}')

    # Pin the label set so the matrix is always 2x2 and matches the fixed
    # row/column names below, even when the test split or the predictions
    # contain only one of the two classes.
    confusion_matrix_results = confusion_matrix(y_test, predictions, labels=[0, 1])

    confusion_matrix_dataframe = pd.DataFrame(confusion_matrix_results,
                                              index=["Actual 0", "Actual 1"],
                                              columns=["Predicted 0", "Predicted 1"])

    display(confusion_matrix_dataframe)

In [20]:
# Run the same experiment configuration for every stock in `stocks`.
for stock in stocks:
    # Shuffle the rows and standardise the features.
    random, scale = True, True
    # Keep "Volume" and "Adj Close" as features; drop "twitter_volume".
    remove_stock_volume, remove_volume, remove_close = False, True, False
    predict(stock, remove_stock_volume, remove_volume, remove_close, random, scale)

Accuracy:
AMZN: 0.5833333333333334

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.50      0.51        26
           1       0.63      0.65      0.64        34

    accuracy                           0.58        60
   macro avg       0.57      0.57      0.57        60
weighted avg       0.58      0.58      0.58        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13,13
Actual 1,12,22


Accuracy:
APPL: 0.5254237288135594

Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.44      0.44        25
           1       0.59      0.59      0.59        34

    accuracy                           0.53        59
   macro avg       0.51      0.51      0.51        59
weighted avg       0.53      0.53      0.53        59



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11,14
Actual 1,14,20


Accuracy:
FB: 0.7

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.74      0.69        27
           1       0.76      0.67      0.71        33

    accuracy                           0.70        60
   macro avg       0.70      0.70      0.70        60
weighted avg       0.71      0.70      0.70        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,20,7
Actual 1,11,22


Accuracy:
MSFT: 0.6

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.64      0.54        22
           1       0.73      0.58      0.65        38

    accuracy                           0.60        60
   macro avg       0.60      0.61      0.59        60
weighted avg       0.64      0.60      0.61        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,8
Actual 1,16,22


Accuracy:
TSLA: 0.6166666666666667

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.36      0.47        28
           1       0.60      0.84      0.70        32

    accuracy                           0.62        60
   macro avg       0.63      0.60      0.58        60
weighted avg       0.63      0.62      0.59        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10,18
Actual 1,5,27
