In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.special import expit
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Load the Bitcoin price history. Only this one coin for now; the remaining
# coins can be handled in a loop later.
bitcoin_df = pd.read_csv(filepath_or_buffer='Data/coin_bitcoin.csv')
bitcoin_df.head(n=5)

Unnamed: 0,SNo,Name,Symbol,Date,High,Low,Open,Close,Volume,Marketcap
0,1,Bitcoin,BTC,2013-04-29 23:59:59,147.488007,134.0,134.444,144.539993,0.0,1603769000.0
1,2,Bitcoin,BTC,2013-04-30 23:59:59,146.929993,134.050003,144.0,139.0,0.0,1542813000.0
2,3,Bitcoin,BTC,2013-05-01 23:59:59,139.889999,107.720001,139.0,116.989998,0.0,1298955000.0
3,4,Bitcoin,BTC,2013-05-02 23:59:59,125.599998,92.281898,116.379997,105.209999,0.0,1168517000.0
4,5,Bitcoin,BTC,2013-05-03 23:59:59,108.127998,79.099998,106.25,97.75,0.0,1085995000.0


In [3]:
# A bit of pre-processing.

# Map the original CSV headers to snake_case names.
new_names = {
    'Symbol': 'symbol',
    'Date': 'date',
    'High': 'high',
    'Low': 'low',
    'Open': 'open',
    'Close': 'close',
    'Marketcap': 'market_cap',
    'Volume': 'volume'
}

# We don't really need the Name of the coin if we have the symbol. Same with SNo.
# Dropping and renaming in one chained expression yields a fresh frame, so
# re-running this cell never mutates `bitcoin_df` or a previous `adjusted_df`.
adjusted_df = bitcoin_df.drop(columns=['Name', 'SNo']).rename(columns=new_names)

# Truncate the timestamp to a calendar date (drops the time-of-day component).
adjusted_df['date'] = pd.to_datetime(adjusted_df['date']).dt.date

# Relative change between open and close. Despite the "percent" in the name this
# is a fraction: 0.05 means +5%.
adjusted_df['percent_change_open_close'] = (adjusted_df['close'] - adjusted_df['open']) / adjusted_df['open']

# "Average" price here is simply the midpoint of the day's high and low.
# The two columns are selected by name: a "high":"low" label slice would silently
# include any column that ends up positioned between them.
col = adjusted_df[['high', 'low']]
adjusted_df['average_price'] = col.mean(axis=1)

In [4]:
# Preview the cleaned frame (first five rows).
adjusted_df.head(n=5)

Unnamed: 0,symbol,date,high,low,open,close,volume,market_cap,percent_change_open_close,average_price
0,BTC,2013-04-29,147.488007,134.0,134.444,144.539993,0.0,1603769000.0,0.075094,140.744003
1,BTC,2013-04-30,146.929993,134.050003,144.0,139.0,0.0,1542813000.0,-0.034722,140.489998
2,BTC,2013-05-01,139.889999,107.720001,139.0,116.989998,0.0,1298955000.0,-0.158345,123.805
3,BTC,2013-05-02,125.599998,92.281898,116.379997,105.209999,0.0,1168517000.0,-0.095979,108.940948
4,BTC,2013-05-03,108.127998,79.099998,106.25,97.75,0.0,1085995000.0,-0.08,93.613998


In [5]:
# Standardize the price, market-cap, volume and derived columns so that general
# trends can be compared with the market index without differences in scale
# (e.g. trading volume) getting in the way.
# (MinMaxScaler from sklearn.preprocessing was the alternative considered.)
numeric_columns = [
    'high',
    'low',
    'open',
    'close',
    'volume',
    'market_cap',
    'percent_change_open_close',
    'average_price',
]

standard_scaler = StandardScaler()

# Work on a copy so `adjusted_df` keeps the unscaled values.
normalized_df = adjusted_df.copy()
scaled_values = standard_scaler.fit_transform(normalized_df[numeric_columns])
normalized_df[numeric_columns] = scaled_values


In [6]:
# Preview the standardized frame (first five rows).
normalized_df.head(n=5)

Unnamed: 0,symbol,date,high,low,open,close,volume,market_cap,percent_change_open_close,average_price
0,BTC,2013-04-29,-0.671513,-0.684231,-0.678248,-0.675026,-0.541575,-0.655012,1.703843,-0.677892
1,BTC,2013-04-30,-0.67159,-0.684223,-0.67687,-0.67582,-0.541575,-0.655486,-0.885943,-0.677929
2,BTC,2013-05-01,-0.67257,-0.688173,-0.677591,-0.678976,-0.541575,-0.65738,-3.801325,-0.680338
3,BTC,2013-05-02,-0.674558,-0.690489,-0.680854,-0.680665,-0.541575,-0.658393,-2.330543,-0.682485
4,BTC,2013-05-03,-0.676989,-0.692466,-0.682315,-0.681735,-0.541575,-0.659034,-1.953721,-0.684699


In [7]:
# Try to classify each day as "good" or "bad".
# One idea was the sign of the standardized close: the mean is 0 after scaling,
# so a negative value means "below average" and a positive one "above average".
# It is not obvious that this is useful, though.

# Sanity check: after standardization every numeric column should have a mean of
# ~0 (the tiny non-zero values are floating-point noise).
# numeric_only=True skips the non-numeric `symbol` and `date` columns; without it
# pandas >= 2.0 raises a TypeError instead of silently dropping them.
print(normalized_df.mean(numeric_only=True))

# Let's try to evaluate performance by the open-to-close change instead:
# 'good' when the value is > 0, otherwise 'bad'.
# NOTE(review): this column has been standardized, so "> 0" means "above the mean
# daily change", not "closed higher than it opened" - confirm this is the
# intended threshold (the raw values live in `adjusted_df`).
normalized_df['performance_classifier'] = np.where(
    normalized_df['percent_change_open_close'] > 0, 'good', 'bad'
)



high                         0.000000e+00
low                         -3.972286e-17
open                         0.000000e+00
close                        0.000000e+00
volume                       0.000000e+00
market_cap                   1.588915e-16
percent_change_open_close   -7.448037e-18
average_price                3.972286e-17
dtype: float64


In [8]:
# Next question: how well does a single price feature predict the "good" vs "bad"
# label? Start with the (standardized) open price as the only feature.

# scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
X = normalized_df['open'].to_numpy().reshape(-1, 1)

# Encode the string labels as integers; `classes_` shows the label order.
le = LabelEncoder()
Y = le.fit_transform(normalized_df['performance_classifier'].to_numpy())
print(le.classes_)

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)


['bad' 'good']
(2289, 1)
(2289,)
(573, 1)
(573,)


In [9]:
# plt.scatter(X_train, Y_train, marker='o', s=50)
# plt.grid()
# plt.tight_layout()
# plt.show()

In [10]:
# Fit a logistic regression with default settings on the training split.
clf = LogisticRegression()
clf.fit(X=X_train, y=Y_train)
# plt.scatter(X_train, Y_train, marker='o', s=50)

# plt.figure(1, figsize=(4, 3))
# plt.clf()
# plt.grid()
# plt.tight_layout()
# plt.show()
# plt.show()

LogisticRegression()

In [11]:
# Predict the encoded labels for the held-out test split.
y_pred = clf.predict(X=X_test)

In [12]:
# Accuracy on the test split comes out at roughly 50%, i.e. basically a random
# guess... We'll want something better.
accuracy_score(y_true=Y_test, y_pred=y_pred)

0.5078534031413613