In [1]:
# A basic analysis to try and fix our mistakes from last time
# We do:
# * Pre-processing of data
# * Feature Selection
# * Model Evaluation
# * Scoring Metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

### Data Collection and Labeling

In [2]:
# Focus on Bitcoin Here

coin_df = pd.read_csv('Data/coin_Bitcoin.csv')

coin_df.index = pd.to_datetime(coin_df['Date']).dt.date
coin_df.index = pd.DatetimeIndex(coin_df.index)

# Not useful
coin_df.drop('Date', axis=1, inplace=True)
coin_df.drop('SNo', axis=1, inplace=True)
coin_df.drop('Symbol', axis=1, inplace=True)
coin_df.drop('Name', axis=1, inplace=True)

coin_df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Marketcap
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-04-29,147.488007,134.0,134.444,144.539993,0.0,1603769000.0
2013-04-30,146.929993,134.050003,144.0,139.0,0.0,1542813000.0
2013-05-01,139.889999,107.720001,139.0,116.989998,0.0,1298955000.0
2013-05-02,125.599998,92.281898,116.379997,105.209999,0.0,1168517000.0
2013-05-03,108.127998,79.099998,106.25,97.75,0.0,1085995000.0


In [3]:
# Add a classification column. The class of each data point is determined by the percent change in close price
# day by day
coin_df['Return'] = coin_df['Close'].pct_change()
# First row in df has NaN 'return' since there is no previous day
coin_df.drop(coin_df.index[0], inplace=True)
coin_df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Marketcap,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-04-30,146.929993,134.050003,144.0,139.0,0.0,1542813000.0,-0.038328
2013-05-01,139.889999,107.720001,139.0,116.989998,0.0,1298955000.0,-0.158345
2013-05-02,125.599998,92.281898,116.379997,105.209999,0.0,1168517000.0,-0.100692
2013-05-03,108.127998,79.099998,106.25,97.75,0.0,1085995000.0,-0.070906
2013-05-04,115.0,92.5,98.099998,112.5,0.0,1250317000.0,0.150895


In [4]:
# Sentiment on return: 
# > .2 then very good 0 < < 0.2 then good converse for bad and very bad
def get_sentiment(return_val):
    if return_val >= 0.15:
        return 'very good'
    elif return_val > 0 and return_val < 0.15:
        return 'good'
    elif return_val <= 0 and return_val > -0.15:
        return 'bad'
    elif return_val <= -0.15:
        return 'very bad'
    else:
        print(return_val)
        return None
# Replace Return column with label
coin_df['Return'] = [get_sentiment(return_val) for return_val in coin_df['Return']]
coin_df.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Marketcap,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-04-30,146.929993,134.050003,144.0,139.0,0.0,1542813000.0,bad
2013-05-01,139.889999,107.720001,139.0,116.989998,0.0,1298955000.0,very bad
2013-05-02,125.599998,92.281898,116.379997,105.209999,0.0,1168517000.0,bad
2013-05-03,108.127998,79.099998,106.25,97.75,0.0,1085995000.0,bad
2013-05-04,115.0,92.5,98.099998,112.5,0.0,1250317000.0,very good


In [5]:
# We are faced with a new problem: Class Imbalance. This will be addressed in the next section
class_counts = {}
for class_ in coin_df['Return']:
    class_counts[class_] = class_counts.get(class_, 0) + 1
print(class_counts)

{'bad': 1291, 'very bad': 19, 'very good': 18, 'good': 1533}


### Model Evaluation/Parameter Tuning

In [6]:
# To decide on the best model, we should fine tune the parameters of the models in consideration to see
# the best performances of our models.

# Because Bitcoin prices can be very volatile with plenty of outliers,
# we use RobustScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV

X = coin_df[['Open', 'High', 'Low', 'Volume', 'Marketcap', 'Close']]
y = coin_df['Return']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

transformer = RobustScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

In [7]:
# We should now deal with the issue of class_weights
from sklearn.utils import class_weight
labels = np.unique(y_train)
class_weights = dict(zip(labels, class_weight.compute_class_weight('balanced', classes=labels, y=y_train)))
print(class_weights)

{'bad': 0.5561111111111111, 'good': 0.4642857142857143, 'very bad': 41.708333333333336, 'very good': 41.708333333333336}


In [8]:
# Logistic Regression Parameter Tuning
from sklearn.linear_model import LogisticRegression
param_grid = {  'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
                }

lr = LogisticRegression(max_iter=2000, class_weight=class_weights)
grid = GridSearchCV(lr, param_grid)
grid.fit(X_train, y_train)
# lr_pred = lr.predict(X_test)
# print('Logistic Reg Score: {:.2f}'.format(lr.score(X_test, y_test)))
print("Best grid score: {:.2f}".format(grid.best_score_))
print("Grid test score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best grid parameters: {}".format(grid.best_params_))
lr_best_params = grid.best_params_

Best grid score: 0.88
Grid test score: 0.86
Best grid parameters: {'C': 1000}


In [9]:
# Decision Tree Parameter Tuning
from sklearn.tree import DecisionTreeClassifier
n_components = list(range(1,X.shape[1]+1,1))
criterion = ['gini', 'entropy']
max_depth = [2,4,6,8,10,12]
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2,4,6,8,10,12,14,16,18,20,25,30,40,50,70]
}
dt = DecisionTreeClassifier(class_weight=class_weights)
grid = GridSearchCV(dt, param_grid)
grid.fit(X_train, y_train)
print("Best grid score: {:.2f}".format(grid.best_score_))
print("Grid test score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best grid parameters: {}".format(grid.best_params_))
dt_best_params = grid.best_params_

Best grid score: 0.71
Grid test score: 0.73
Best grid parameters: {'criterion': 'gini', 'max_depth': 50}


In [10]:
# K-Nearest Neighbors Parameter Tuning
from sklearn.neighbors import KNeighborsClassifier
param_grid = {
    'n_neighbors': [3, 5, 11, 19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn = KNeighborsClassifier() # No class_weight param
grid = GridSearchCV(knn, param_grid)
grid.fit(X_train, y_train)
print("Best grid score: {:.2f}".format(grid.best_score_))
print("Grid test score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best grid parameters: {}".format(grid.best_params_))
knn_best_params = grid.best_params_

Best grid score: 0.64
Grid test score: 0.68
Best grid parameters: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}


In [11]:
# SVC Parameter Tuning
from sklearn.svm import SVC
param_grid = {
    'C':[1,10,100,1000],
    'gamma':[1,0.1,0.001,0.0001],
    'kernel':['linear','rbf']
}
svc = SVC(class_weight=class_weights)
grid = GridSearchCV(svc, param_grid)
grid.fit(X_train, y_train)
print("Best grid score: {:.2f}".format(grid.best_score_))
print("Grid test score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best grid parameters: {}".format(grid.best_params_))
svc_best_params = grid.best_params_

Best grid score: 0.87
Grid test score: 0.89
Best grid parameters: {'C': 1000, 'gamma': 1, 'kernel': 'linear'}


In [12]:
# Let's retrain the models using the fine-tuned parameters found
# so we can examine the confusion matrix and scoring report and make a conclusion.
lr = LogisticRegression(max_iter=2000, C=lr_best_params['C'], class_weight=class_weights)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print('Logistic Regression Score: {:.2f}'.format(lr.score(X_test, y_test)))

dt = DecisionTreeClassifier(criterion=dt_best_params['criterion'], max_depth=dt_best_params['max_depth'], class_weight=class_weights)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print('Decision Tree Score: {:.2f}'.format(dt.score(X_test, y_test)))

knn = KNeighborsClassifier(metric=knn_best_params['metric'], n_neighbors=knn_best_params['n_neighbors'], weights=knn_best_params['weights'])
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print('KNN Score: {:.2f}'.format(knn.score(X_test, y_test)))

svc = SVC(C=svc_best_params['C'], gamma=svc_best_params['gamma'], kernel=svc_best_params['kernel'], class_weight=class_weights)
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)
print('SVC Score: {:.2f}'.format(svc.score(X_test, y_test)))

Logistic Regression Score: 0.86
Decision Tree Score: 0.73
KNN Score: 0.68
SVC Score: 0.89


In [13]:


from sklearn.metrics import confusion_matrix
labels = ["Bad", "Good", "Very Good", "Very Bad"]
print("Logistic Regression Confusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, lr_pred), index=labels, columns=labels), '\n')
print("Decision Tree Classifier Confusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, dt_pred), index=labels, columns=labels), '\n')
print("KNN Classifier Confusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, knn_pred), index=labels, columns=labels), '\n')
print("SVC Classifier Confusion Matrix:")
print(pd.DataFrame(confusion_matrix(y_test, svc_pred), index=labels, columns=labels), '\n')


Logistic Regression Confusion Matrix:
           Bad  Good  Very Good  Very Bad
Bad        357     3         31         0
Good        55   365         30         5
Very Good    0     0          7         0
Very Bad     0     0          0         6 

Decision Tree Classifier Confusion Matrix:
           Bad  Good  Very Good  Very Bad
Bad        276   114          1         0
Good       106   346          1         2
Very Good    4     2          1         0
Very Bad     0     5          0         1 

KNN Classifier Confusion Matrix:
           Bad  Good  Very Good  Very Bad
Bad        243   148          0         0
Good       115   340          0         0
Very Good    5     1          1         0
Very Bad     0     5          0         1 

SVC Classifier Confusion Matrix:
           Bad  Good  Very Good  Very Bad
Bad        362     5         24         0
Good        44   389         20         2
Very Good    0     0          7         0
Very Bad     0     2          1         3 



In [14]:
from sklearn.metrics import classification_report
print("Logistic Regression Classification Report:")
print(classification_report(y_test, lr_pred), '\n')
print("Decision Tree Classifier Classification Report:")
print(classification_report(y_test, dt_pred), '\n')
print("KNN Classifier Classification Report:")
print(classification_report(y_test, knn_pred), '\n')
print("SVC Classifier Classification Report:")
print(classification_report(y_test, svc_pred), '\n')

Logistic Regression Classification Report:
              precision    recall  f1-score   support

         bad       0.87      0.91      0.89       391
        good       0.99      0.80      0.89       455
    very bad       0.10      1.00      0.19         7
   very good       0.55      1.00      0.71         6

    accuracy                           0.86       859
   macro avg       0.63      0.93      0.67       859
weighted avg       0.92      0.86      0.88       859
 

Decision Tree Classifier Classification Report:
              precision    recall  f1-score   support

         bad       0.72      0.71      0.71       391
        good       0.74      0.76      0.75       455
    very bad       0.33      0.14      0.20         7
   very good       0.33      0.17      0.22         6

    accuracy                           0.73       859
   macro avg       0.53      0.44      0.47       859
weighted avg       0.72      0.73      0.72       859
 

KNN Classifier Classification Repor

The confusion matrices and the classification reports above give us more insight into the performance of the models. From lecture, "When the skew in the class distributions are severe, accuracy can become an unreliable measure of model performance". This is apparent here due to the disproportion between "bad" and "good" data points vs. "very bad" and "very good" data points.

Looking at the metrics for our **LogisticRegression** model, we see from the confusion matrix that its accuracy is not all due to the correct "good" and "bad" classifications. We see that it was able to classify "very good" and "very bad" data points accurately. However, when we examing the classification report, we see that the f1-score for "very bad" is not ideal. This is due to the fact that while we have the maximum recall, we have a low precision. This is a con of this model that needs to be considered.

We can safely eliminate the **DecisionTreeClassifier** and the **KNeighborsClassifier** from our candidate models as they are unable to have any success in predicting the two classes with the fewest data points. This is supported by their respective classification reports.

Finally, we have the **SVC Classifier**. This classifier's performance is up-to-par with our **LogisticReegression** classifier (which is the best model seen thus far). We see that while its success with "very bad" is not as good as its Logistic counterpart, we observe higher f1-scores. While the f1-scores for "very bad" and "very good" leave something to be desired, we can say that, overall, the SVC provides us with better metrics.

Thus, we will proceed with the SVC Classifier for feature selection. 

### Model-Based Feature Selection

In [15]:
from sklearn.feature_selection import SelectFromModel

labels = ["Bad", "Good", "Very Good", "Very Bad"]

svc = SVC(C=svc_best_params['C'],
          gamma=svc_best_params['gamma'],
          kernel=svc_best_params['kernel'],
          class_weight=class_weights)

select = SelectFromModel(svc, threshold='median')
select.fit(X_train, y_train)
X_train_fs = select.transform(X_train)

# With all features
svc.fit(X_train, y_train)
print('Score with all features: {:.3f}'.format(svc.score(X_test, y_test)))
svc_pred = svc.predict(X_test)
print(pd.DataFrame(confusion_matrix(y_test, svc_pred), index=labels, columns=labels), '\n')
print(classification_report(y_test, svc_pred))

Score with all features: 0.886
           Bad  Good  Very Good  Very Bad
Bad        362     5         24         0
Good        44   389         20         2
Very Good    0     0          7         0
Very Bad     0     2          1         3 

              precision    recall  f1-score   support

         bad       0.89      0.93      0.91       391
        good       0.98      0.85      0.91       455
    very bad       0.13      1.00      0.24         7
   very good       0.60      0.50      0.55         6

    accuracy                           0.89       859
   macro avg       0.65      0.82      0.65       859
weighted avg       0.93      0.89      0.90       859



In [16]:
# With selected features
svc.fit(X_train_fs, y_train)
X_test_fs = select.transform(X_test)
svc_pred = svc.predict(X_test_fs)

mask = select.get_support()
selected_features = []
for i in range(len(X.columns)):
    if mask[i]:
        selected_features += [X.columns[i]]

print(f'Selected Features: {selected_features}')
print('Score with selected features: {:.3f}'.format(svc.score(X_test_fs, y_test)))
print(pd.DataFrame(confusion_matrix(y_test, svc_pred), index=labels, columns=labels), '\n')
print(classification_report(y_test, svc_pred))

Selected Features: ['Open', 'High', 'Close']
Score with selected features: 0.827
           Bad  Good  Very Good  Very Bad
Bad        365     2         24         0
Good        92   333         30         0
Very Good    0     0          7         0
Very Bad     0     0          1         5 

              precision    recall  f1-score   support

         bad       0.80      0.93      0.86       391
        good       0.99      0.73      0.84       455
    very bad       0.11      1.00      0.20         7
   very good       1.00      0.83      0.91         6

    accuracy                           0.83       859
   macro avg       0.73      0.87      0.70       859
weighted avg       0.90      0.83      0.85       859



While we observed a drop in accuracy with the selected features, this is not always a bad sign. We want to avoid overfitting, and too many features may contribute to this. Starch improvements in the f1-score for "very good" have been noted as well as the noticeable drop in the f1-score of "very bad". However, the f1-score improvement for "very good" outweighs f1-score decrease for "very bad".

### Cross Fold Validation

In [17]:
# Now that we have fine-tuned our chose model, we can observe its performance with a cross-fold validation.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# "Reset" our data
X = coin_df[['Open', 'High', 'Low', 'Volume', 'Marketcap', 'Close']]
y = coin_df['Return']

# Our best model
svc = SVC(C=svc_best_params['C'],
          gamma=svc_best_params['gamma'],
          kernel=svc_best_params['kernel'],
          class_weight=class_weights)

# Use a pipeline to include scaling and feature selection
clf = make_pipeline(RobustScaler(), SelectFromModel(svc), svc)
cross_val_score(clf, X, y, cv=10)

array([0.34146341, 0.84965035, 0.64335664, 0.88461538, 0.86013986,
       0.97552448, 0.96503497, 0.96853147, 0.98601399, 0.98951049])