<a href="https://colab.research.google.com/github/sakusakupanda03/BachelorThesis/blob/main/notebooks/03_predict_price_change.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab runtime only); the data path configured
# in the next cell points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import pacf, adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import f1_score

# Root directory of the project data; the helpers below append sub-folders such as
# "/03_primary" and "/04_reporting" to it.
# NOTE(review): the relative path is overridden by the Google Drive path on the very next
# line, so only the Drive location takes effect — presumably "../data" is kept for local runs.
path = "../data"
path = "/content/drive/MyDrive/sotsuron/data"

# Define helper functions for the experiments

In [None]:
def prepare_datasets():
    """Load the BTC and ETH primary tables and split them into features and target.

    For each asset a ``price_change_binary_shifted`` column is added by shifting
    ``price_change_binary`` up by one row, so every row is paired with the label
    of the row that follows it. Rows containing any missing value are then
    dropped (this includes the final row, whose shifted label does not exist).

    Reads ``btc.pickle`` and ``eth.pickle`` from ``<path>/03_primary`` and prints
    the number of feature columns selected for each asset.

    Returns
    -------
    tuple
        ``(btc_X, btc_y, eth_X, eth_y)`` — feature frame and shifted-label
        series for BTC, followed by the same pair for ETH.
    """
    label_col = "price_change_binary"
    target_col = "price_change_binary_shifted"

    btc_feature_cols = [
        'market-cap', 'price_change', 'avg-block-size', 'avg-confirmation-time',
        'hash-rate', 'difficulty', 'miners-revenue', 'transaction-fees',
        'fees-usd-per-transaction', 'n-unique-addresses', 'n-transactions',
        'price_change_binary',
    ]
    eth_feature_cols = [
        'export-MarketCap', 'price_change', 'export-BlockSize', 'export-BlockTime',
        'export-NetworkHash', 'export-BlockDifficulty', 'export-GasUsed',
        'export-TransactionFee', 'export-AverageDailyTransactionFee',
        'export-DailyActiveEthAddress', 'export-verified-contracts',
        'price_change_binary',
    ]

    btc_raw = pd.read_pickle(path + "/03_primary/btc.pickle")
    eth_raw = pd.read_pickle(path + "/03_primary/eth.pickle")

    # Pair each row with the following row's label, then discard incomplete rows.
    btc_clean = btc_raw.assign(**{target_col: btc_raw[label_col].shift(-1)}).dropna()
    eth_clean = eth_raw.assign(**{target_col: eth_raw[label_col].shift(-1)}).dropna()

    btc_X = btc_clean[btc_feature_cols]
    btc_y = btc_clean[target_col]
    print("Num of columns on BTC independent variables:", len(btc_X.columns))

    eth_X = eth_clean[eth_feature_cols]
    eth_y = eth_clean[target_col]
    print("Num of columns on ETH independent variables:", len(eth_X.columns))

    return btc_X, btc_y, eth_X, eth_y

In [None]:
def split_train_test_ts(X, y, train_start_month, train_end_month, test_month):
    """Cut a training window and a one-month test window out of ``X`` and ``y``.

    The bounds are handed unchanged to ``[]`` slicing on both objects, so their
    interpretation is whatever the index of ``X`` / ``y`` gives them.
    # assumes a sorted DatetimeIndex and 'YYYY-MM' strings, for which pandas
    # slices whole months with both ends included — TODO confirm for new data.

    Parameters
    ----------
    X, y : sliceable feature table and target sharing the same index.
    train_start_month, train_end_month : first and last label of the training window.
    test_month : label used as both start and end of the test window.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    train_window = slice(train_start_month, train_end_month)
    test_window = slice(test_month, test_month)

    return X[train_window], y[train_window], X[test_window], y[test_window]

In [None]:
def build_lr(X_train, y_train, X_test, y_test, best_params, model):
    """Fit ``model`` on standardized features and score it on the test window.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to both
    the training and the test features. The classifier is built as
    ``model(**best_params, random_state=202209)``.

    Prints the mean of the actual test labels, the mean of the predicted labels
    and the F1 score on the test window.

    Returns
    -------
    float
        ``f1_score(y_test, y_pred)`` for the test window.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)

    # Re-wrap the scaled arrays so the column names survive the transformation.
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

    classifier = model(**best_params, random_state=202209)
    classifier.fit(X_train_scaled, y_train)

    predictions = classifier.predict(X_test_scaled)
    test_score = f1_score(y_test, predictions)

    print("Mean of actual:       ", np.mean(y_test))
    print("Mean of predictions:  ", np.mean(predictions))
    print("Test score:           ", test_score)

    return test_score


def build_rf(X_train, y_train, X_test, y_test, best_params, model):
    """Fit ``model`` on the training window, score it, and collect feature importances.

    The scoring model is ``model(**best_params, random_state=202209)`` fitted on
    the training data; the mean of the actual labels, the mean of the predicted
    labels and the F1 score on the test window are printed.

    Returns
    -------
    tuple
        ``(test_score, series_fi)`` where ``series_fi`` is a ``pd.Series`` of
        ``feature_importances_`` indexed by the training column names.
    """
    seed = 202209

    forest = model(**best_params, random_state=seed)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    test_score = f1_score(y_test, predictions)

    print("Mean of actual:       ", np.mean(y_test))
    print("Mean of predictions:  ", np.mean(predictions))
    print("Test score:           ", test_score)

    # NOTE(review): the importances are taken from a second model fitted on the
    # TEST window, not from the model that produced `test_score`. This looks
    # deliberate (importance within the evaluated month) — confirm before reuse.
    importance_model = model(**best_params, random_state=seed)
    importance_model.fit(X_test, y_test)
    series_fi = pd.Series(importance_model.feature_importances_, index=X_train.columns)

    return test_score, series_fi

In [None]:
def tune_hyperparams(X_train, y_train, model):
    """Run a randomized hyper-parameter search for ``model`` on the training window.

    Ten candidates are sampled (``n_iter=10``) and evaluated with 5-fold
    cross-validation using the ``"f1_weighted"`` scorer. The best parameters and
    the best cross-validated score are printed and returned.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``RandomizedSearchCV.fit``.
    model : estimator class. Only ``RandomForestClassifier`` and
        ``LogisticRegression`` have a search space defined here.

    Returns
    -------
    tuple
        ``(best_params, train_score)`` — ``cv.best_params_`` and ``cv.best_score_``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the two supported estimator classes.
    """
    seed = 202209

    if model is RandomForestClassifier:
        cv_params = {
            'n_estimators': list(range(300, 800+1)),
            'max_depth': list(range(3, 8+1)),
        }
    elif model is LogisticRegression:
        cv_params = {
            "solver": ['newton-cg', 'lbfgs', 'liblinear'],
            "penalty": ['l2'],
            "C": [100, 10, 1.0, 0.1, 0.01],
            "max_iter": [1500],
        }
    else:
        # Previously this branch was a no-op and the function failed later with
        # an unbound `cv_params`; fail early with an actionable message instead.
        raise ValueError(
            f"No hyper-parameter search space defined for {model!r}; "
            "expected RandomForestClassifier or LogisticRegression."
        )

    # Seed the estimator as well as the search: an unseeded random forest makes
    # the CV scores (and therefore the selected parameters) vary between runs.
    # The same seed is used for the final models in build_lr / build_rf.
    m = model(random_state=seed)

    # NOTE(review): the search optimizes weighted F1 while build_lr / build_rf
    # report f1_score with its default averaging — confirm the two scores are
    # meant to be compared side by side.
    # NOTE(review): the search sees X_train as passed in, whereas build_lr
    # standardizes the features before the final fit — confirm this is intended.
    cv = RandomizedSearchCV(
        estimator=m,
        param_distributions=cv_params,
        cv=5,
        n_iter=10,
        scoring="f1_weighted",
        random_state=seed,
    )
    cv.fit(X_train, y_train)
    best_params = cv.best_params_
    train_score = cv.best_score_
    print("Best params: ", best_params)
    print("Train score: ", train_score)

    return best_params, train_score

## Reference for the hyperparameter search spaces: https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
def iterate_predictions(num_train_months, X, y, model = RandomForestClassifier):
    """Walk-forward evaluation over a fixed list of months (2019-08 … 2022-08).

    For every window, the model is tuned and trained on ``num_train_months``
    consecutive months and evaluated on the month that immediately follows.
    The test month is printed as progress output.

    Parameters
    ----------
    num_train_months : int
        Number of consecutive months in each training window (must be >= 1).
        If it is not smaller than the number of listed months, no window exists
        and all returned lists are empty.
    X, y : features and target, sliced by month label in ``split_train_test_ts``.
    model : estimator class, ``RandomForestClassifier`` (default) or
        ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(test_months, pacf_scores, train_scores, test_scores, list_fi)``;
        one entry per window. ``pacf_scores`` holds the lag-1 partial
        autocorrelation of ``y`` within the test month. ``list_fi`` holds the
        feature-importance Series from ``build_rf`` for the random forest and
        placeholder zeros for logistic regression.

    Raises
    ------
    ValueError
        If ``model`` is not supported or ``num_train_months`` is below 1.
    """
    if model is not LogisticRegression and model is not RandomForestClassifier:
        # Without this check `test_score` would be unbound further down.
        raise ValueError(
            f"Unsupported model {model!r}; "
            "expected RandomForestClassifier or LogisticRegression."
        )
    if num_train_months < 1:
        # A non-positive value would wrap around via negative list indices and
        # silently produce a training window that contains the test month.
        raise ValueError(f"num_train_months must be >= 1, got {num_train_months!r}")

    month_list = ["2019-08","2019-09","2019-10","2019-11",
                "2019-12","2020-01","2020-02","2020-03",
                "2020-04","2020-05","2020-06","2020-07",
                "2020-08","2020-09","2020-10","2020-11",
                "2020-12","2021-01","2021-02","2021-03",
                "2021-04","2021-05","2021-06","2021-07",
                "2021-08","2021-09","2021-10","2021-11",
                "2021-12","2022-01","2022-02","2022-03",
                "2022-04","2022-05","2022-06","2022-07",
                "2022-08"]

    # One (train_start, train_end, test) triple per window, sliding by one month.
    windows = [
        (month_list[start], month_list[start + num_train_months - 1], month_list[start + num_train_months])
        for start in range(len(month_list) - num_train_months)
    ]

    test_months = []
    pacf_scores = []
    train_scores = []
    test_scores = []
    list_fi = []
    for train_start_month, train_end_month, test_month in windows:
        print(test_month)
        test_months.append(test_month)

        # Lag-1 partial autocorrelation of the target inside the test month.
        pacf_scores.append(pacf(y[test_month: test_month], nlags=1)[1])

        X_train, y_train, X_test, y_test = split_train_test_ts(X, y, train_start_month, train_end_month, test_month)
        best_params, train_score = tune_hyperparams(X_train, y_train, model = model)

        if model is LogisticRegression:
            test_score = build_lr(X_train, y_train, X_test, y_test, best_params = best_params, model = model)
        else:
            test_score, series_fi = build_rf(X_train, y_train, X_test, y_test, best_params = best_params, model = model)
            list_fi.append(series_fi)

        train_scores.append(train_score)
        test_scores.append(test_score)

    if model is LogisticRegression:
        # build_lr yields no feature importances; keep one zero per window so
        # the result has the same length as the other lists.
        list_fi = [0] * len(windows)

    return test_months, pacf_scores, train_scores, test_scores, list_fi

In [None]:
def iterate_predictions_btc_eth(num_train_months, model = RandomForestClassifier):
    """Run ``iterate_predictions`` for BTC and then ETH and persist the combined results.

    Uses the module-level ``btc_X``, ``btc_y``, ``eth_X``, ``eth_y`` and ``path``.
    The per-window scores of both assets are collected into one table (metrics
    as rows, BTC test months as columns) which is pickled under
    ``<path>/04_reporting`` with an ``_lr`` or ``_rf`` file-name suffix. For any
    other model class nothing is written.

    Returns
    -------
    tuple
        ``(btc_test_months, btc_pacf_scores, btc_train_scores, btc_test_scores,
        eth_test_months, eth_pacf_scores, eth_train_scores, eth_test_scores)``.
    """
    print("Num train months: ", num_train_months)
    print("Model: ", model)

    print("BTC")
    btc_results = iterate_predictions(num_train_months = num_train_months, X = btc_X, y = btc_y, model = model)
    btc_test_months, btc_pacf_scores, btc_train_scores, btc_test_scores, btc_list_fi = btc_results

    print("ETH")
    eth_results = iterate_predictions(num_train_months = num_train_months, X = eth_X, y = eth_y, model = model)
    eth_test_months, eth_pacf_scores, eth_train_scores, eth_test_scores, eth_list_fi = eth_results

    # Insertion order fixes the row order of the transposed report.
    report_columns = {
        "BTC PACF Scores": btc_pacf_scores,
        "ETH PACF Scores": eth_pacf_scores,
        "BTC Train Scores": btc_train_scores,
        "ETH Train Scores": eth_train_scores,
        "BTC Test Scores": btc_test_scores,
        "ETH Test Scores": eth_test_scores,
        "BTC Features Importance": btc_list_fi,
        "ETH Features Importance": eth_list_fi,
    }
    res_df = pd.DataFrame(report_columns, index=btc_test_months).T

    report_files = {
        LogisticRegression: f"/num_train_months_{num_train_months}_lr.pickle",
        RandomForestClassifier: f"/num_train_months_{num_train_months}_rf.pickle",
    }
    report_file = report_files.get(model)
    if report_file is not None:
        res_df.to_pickle(path + "/04_reporting" + report_file)

    return (
        btc_test_months, btc_pacf_scores, btc_train_scores, btc_test_scores,
        eth_test_months, eth_pacf_scores, eth_train_scores, eth_test_scores,
    )

# Load the datasets

In [None]:
# Load features and shifted targets once; iterate_predictions_btc_eth reads these
# four module-level names directly rather than taking them as arguments.
btc_X, btc_y, eth_X, eth_y = prepare_datasets()

Num of columns on BTC independent variables: 12
Num of columns on ETH independent variables: 12


# Predict the binary price change

In [None]:
# Run the walk-forward experiment for each training-window length (4 and 9 months)
# and each classifier; every call also pickles its result table under 04_reporting.
for i in [4, 9]:
    for j in [LogisticRegression, RandomForestClassifier]:
        iterate_predictions_btc_eth(num_train_months = i, model = j)

Num train months:  4
Model:  <class 'sklearn.linear_model._logistic.LogisticRegression'>
BTC
2019-12
Best params:  {'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 1500, 'C': 10}
Train score:  0.5483904051130942
Mean of actual:        0.3870967741935484
Mean of predictions:   0.3870967741935484
Test score:            0.3333333333333333
2020-01
Best params:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 1500, 'C': 100}
Train score:  0.5701136363636363
Mean of actual:        0.5483870967741935
Mean of predictions:   0.3870967741935484
Test score:            0.6206896551724139
2020-02
Best params:  {'solver': 'newton-cg', 'penalty': 'l2', 'max_iter': 1500, 'C': 1.0}
Train score:  0.6127042387814846
Mean of actual:        0.5172413793103449
Mean of predictions:   0.27586206896551724
Test score:            0.3478260869565218
2020-03
Best params:  {'solver': 'newton-cg', 'penalty': 'l2', 'max_iter': 1500, 'C': 1.0}
Train score:  0.5661225634269113
Mean of actual:        0.548387096774