# Gradient Boosting Classifier

In [5]:
import os
import numpy
import pandas as pd

scikit-learn imports

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Specify stock names

In [7]:
# Ticker symbols to evaluate. Each entry is inserted verbatim into the dataset
# file name, so the spelling must match the CSV on disk ('APPL' is kept as-is).
# For a quick single-ticker run use: stocks = ['AMZN']
stocks = [
    'AMZN',
    'APPL',
    'FB',
    'MSFT',
    'TSLA',
]

Function to get the stock dataset file

In [8]:
def get_dataset_file(stock):
    """Build the relative path of the CSV dataset for one stock.

    Args:
        stock: Ticker symbol used as the base name of the CSV file.

    Returns:
        str: A path of the form ``./prices_sentiment/<stock>.csv``.
    """
    dataset_path = './prices_sentiment/%s.csv' % stock
    return dataset_path

In [9]:
def feature_split(stock_df,remove_stock_volume,remove_volume,remove_close,random):
    """Separate a stock DataFrame into a feature frame and a target column vector.

    The columns "stock_trend", "price_diff" and "Date" are always excluded from
    the features; three further columns can be excluded through the flags.

    Args:
        stock_df: DataFrame containing the columns named above (plus any
            optional column whose removal flag is set). It is not modified.
        remove_stock_volume: When truthy, exclude the "Volume" column.
        remove_volume: When truthy, exclude the "twitter_volume" column.
        remove_close: When truthy, exclude the "Adj Close" column.
        random: Accepted for signature compatibility; not used by this function.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a new DataFrame holding the remaining
        columns and ``y`` is the "stock_trend" values reshaped to ``(-1, 1)``.
    """
    optional_exclusions = (
        (remove_stock_volume, "Volume"),
        (remove_volume, "twitter_volume"),
        (remove_close, "Adj Close"),
    )

    # Target, its source column and the date are never used as features.
    excluded = ["stock_trend", "price_diff", "Date"]
    excluded.extend(column for flag, column in optional_exclusions if flag)

    features = stock_df.drop(columns=excluded)
    target = stock_df["stock_trend"].values.reshape(-1, 1)

    return features, target

In [10]:
def train_test_split(X, y):
    """Split features and targets into a leading 80% and a trailing 20% part.

    The split is purely positional (no shuffling): the first
    ``int(0.8 * len(X))`` entries form the training part, the rest the test part.

    Args:
        X: Sliceable collection of feature rows.
        y: Sliceable collection of targets, aligned with ``X``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    cutoff = int(len(X) * 0.8)
    train_rows = slice(None, cutoff)
    test_rows = slice(cutoff, None)

    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]

In [11]:
def scale_data(X_train,X_test):
    """Standardize both feature sets with a scaler fitted on the training set.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to both
    ``X_train`` and ``X_test``, so the test rows do not influence the scaling.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.

    Returns:
        tuple: ``(X_train_scaled, X_test_scaled)`` as returned by the scaler's
        ``transform``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

In [30]:
def gradient_boosting_predict(learning_rates,X_train_scaled,X_test_scaled,y_train,y_test):
    """Fit one gradient boosting classifier per learning rate and predict with the best.

    Each candidate learning rate is used to train a ``GradientBoostingClassifier``
    (30 estimators, depth 3, fixed random state). One candidate is selected and
    its predictions for ``X_test_scaled`` are returned. The selected learning
    rate is printed.

    Selection rules:
      * the first learning rate is the starting choice, whatever its scores;
      * a later candidate is skipped when its training or test accuracy is
        below 0.5;
      * otherwise it replaces the current choice when its test accuracy is
        strictly higher, or equal with a strictly higher training accuracy.

    NOTE(review): the learning rate is chosen by comparing accuracy on
    ``(X_test_scaled, y_test)``, the same data the returned predictions are
    made for, so accuracy computed from those predictions is optimistically
    biased. Consider selecting on a separate validation split.

    Args:
        learning_rates: Non-empty sequence of candidate learning rates.
        X_train_scaled: Training features (2-D array or DataFrame).
        X_test_scaled: Test features with the same columns.
        y_train: Training labels; must provide ``ravel()``.
        y_test: Test labels; must provide ``ravel()``.

    Returns:
        The selected classifier's predicted labels for ``X_test_scaled``.

    Raises:
        ValueError: If ``learning_rates`` is empty.
    """
    # constants
    NUMBER_OF_ESTIMATORS  = 30
    MAX_NUMBER_OF_FEATURES = 5
    MAX_DEPTH = 3
    RANDOM_STATE_CONSTANT = 0
    MINIMUM_ACCURACY = 0.5

    if len(learning_rates) == 0:
        raise ValueError("learning_rates must contain at least one value")

    # Never request more features per split than there are columns; this matters
    # when the caller has dropped optional columns and fewer than 5 remain.
    # NOTE(review): how scikit-learn treats an oversized max_features depends on
    # the installed release - capping it here avoids relying on that.
    max_features = min(MAX_NUMBER_OF_FEATURES, X_train_scaled.shape[1])

    y_train_flat = y_train.ravel()
    y_test_flat = y_test.ravel()

    best_learning_rate = None
    best_classifier = None
    best_scores = None  # (test accuracy, training accuracy)

    for position, learning_rate in enumerate(learning_rates):
        classifier = GradientBoostingClassifier(n_estimators  = NUMBER_OF_ESTIMATORS,
                                                learning_rate = learning_rate,
                                                max_features  = max_features,
                                                max_depth     = MAX_DEPTH,
                                                random_state  = RANDOM_STATE_CONSTANT)
        classifier.fit(X_train_scaled, y_train_flat)

        training_score = classifier.score(X_train_scaled, y_train_flat)
        testing_score  = classifier.score(X_test_scaled, y_test_flat)
        scores = (testing_score, training_score)

        is_first = position == 0
        is_eligible = (training_score >= MINIMUM_ACCURACY
                       and testing_score >= MINIMUM_ACCURACY)

        # Tuple comparison ranks by test accuracy first and uses training
        # accuracy only to break exact ties.
        if is_first or (is_eligible and scores > best_scores):
            best_learning_rate = learning_rate
            best_classifier = classifier
            best_scores = scores

    print("Learning rate: ", best_learning_rate)

    # The random state is fixed, so refitting the winning configuration would
    # reproduce the same model; reuse the one that is already fitted.
    predictions = best_classifier.predict(X_test_scaled)

    return predictions

In [31]:
def get_accuracy(y_test,predictions):
    """Score predictions against the true labels.

    Thin wrapper that forwards both arguments to ``accuracy_score``.

    Args:
        y_test: True labels.
        predictions: Predicted labels, aligned with ``y_test``.

    Returns:
        The value returned by ``accuracy_score(y_test, predictions)``.
    """
    return accuracy_score(y_test, predictions)

In [34]:
def predict(stock,remove_stock_volume,remove_volume,remove_close,random=True,scale=True):
    """Train and evaluate a gradient boosting classifier for one stock.

    Steps: load the stock's CSV, optionally shuffle its rows, build features
    and target, split 80/20 by position, optionally standardize the features,
    pick a learning rate and predict, then print the accuracy and the
    classification report and display the confusion matrix.

    NOTE(review): with ``random=True`` the rows are shuffled before the
    positional split. The data has a "Date" column, so it is presumably a time
    series; if so, later rows end up in the training part - confirm that this
    is intended.

    Args:
        stock: Ticker symbol; selects the CSV file to load.
        remove_stock_volume: Exclude the "Volume" feature when truthy.
        remove_volume: Exclude the "twitter_volume" feature when truthy.
        remove_close: Exclude the "Adj Close" feature when truthy.
        random: Shuffle the rows (with a fixed seed) before splitting.
        scale: Standardize the features before training.

    Returns:
        None. Results are printed and displayed.
    """
    # Fixed seed so the optional shuffle gives the same order on every run.
    SHUFFLE_RANDOM_STATE = 60
    # The confusion-matrix row/column names below are hard-coded for the labels
    # 0 and 1, so the matrix is always computed over exactly these two classes.
    CLASS_LABELS = [0, 1]
    learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1.25, 1.5, 2.0]

    file = get_dataset_file(stock)
    stock_df = pd.read_csv(file, skipinitialspace=True)

    if random:
        stock_df = stock_df.sample(frac=1, random_state=SHUFFLE_RANDOM_STATE).reset_index(drop=True)

    X, y = feature_split(stock_df, remove_stock_volume, remove_volume, remove_close, random)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    if scale:
        X_train, X_test = scale_data(X_train, X_test)

    predictions = gradient_boosting_predict(learning_rates, X_train, X_test, y_train, y_test)

    accuracy = get_accuracy(y_test, predictions)
    print(f'{stock}: {accuracy}\n')

    print(f'\nClassification Report:\n{classification_report(y_test,predictions)}')

    # Passing the labels explicitly keeps the matrix 2x2 even when one class is
    # absent from both the test labels and the predictions; without it the
    # DataFrame construction below would fail on a shape mismatch.
    confusion_matrix_results = confusion_matrix(y_test,
                                                predictions,
                                                labels = CLASS_LABELS)

    confusion_matrix_dataframe = pd.DataFrame(confusion_matrix_results,
                                              index   = ["Actual 0","Actual 1"],
                                              columns = ["Predicted 0","Predicted 1"])

    display(confusion_matrix_dataframe)

In [35]:
# Evaluation settings shared by every ticker: shuffle and scale the data,
# and keep all optional feature columns.
random = True
scale = True
remove_stock_volume = False
remove_volume = False
remove_close = False

for stock in stocks:
    # Header so each ticker's results are easy to find in the output.
    print(stock)
    print("----")
    predict(stock, remove_stock_volume, remove_volume, remove_close, random, scale)

AMZN
----
Learning rate:  0.75
AMZN: 0.6333333333333333


Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.54      0.56        26
           1       0.67      0.71      0.69        34

    accuracy                           0.63        60
   macro avg       0.62      0.62      0.62        60
weighted avg       0.63      0.63      0.63        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14,12
Actual 1,10,24


APPL
----
Learning rate:  1.25
APPL: 0.5423728813559322


Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.60      0.53        25
           1       0.63      0.50      0.56        34

    accuracy                           0.54        59
   macro avg       0.55      0.55      0.54        59
weighted avg       0.56      0.54      0.54        59



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,15,10
Actual 1,17,17


FB
----
Learning rate:  0.1
FB: 0.65


Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.48      0.55        27
           1       0.65      0.79      0.71        33

    accuracy                           0.65        60
   macro avg       0.65      0.63      0.63        60
weighted avg       0.65      0.65      0.64        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13,14
Actual 1,7,26


MSFT
----
Learning rate:  0.05
MSFT: 0.6


Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.50      0.48        22
           1       0.69      0.66      0.68        38

    accuracy                           0.60        60
   macro avg       0.58      0.58      0.58        60
weighted avg       0.61      0.60      0.60        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11,11
Actual 1,13,25


TSLA
----
Learning rate:  0.1
TSLA: 0.65


Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.43      0.53        28
           1       0.63      0.84      0.72        32

    accuracy                           0.65        60
   macro avg       0.67      0.64      0.63        60
weighted avg       0.66      0.65      0.63        60



Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12,16
Actual 1,5,27
