In [3]:
import pandas as pd
import datetime as datetime
import numpy as np
import matplotlib.pyplot
%matplotlib inline 

In [4]:
df = pd.read_csv('monero_price.csv', index_col=False)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371 entries, 0 to 1370
Data columns (total 7 columns):
Date          1371 non-null object
Open          1371 non-null float64
High          1371 non-null float64
Low           1371 non-null float64
Close         1371 non-null float64
Volume        1371 non-null object
Market Cap    1371 non-null object
dtypes: float64(4), object(3)
memory usage: 58.9+ KB


In [6]:
df.sample(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Market Cap
834,"Nov 09, 2015",0.49906,0.503041,0.463946,0.472776,33642,4974750
879,"Sep 25, 2015",0.470271,0.476337,0.442823,0.451515,72674,4436980
1120,"Jan 27, 2015",0.31654,0.330593,0.288746,0.320371,17844,1877370
859,"Oct 15, 2015",0.383465,0.403249,0.382849,0.390841,26749,3710570
59,"Dec 23, 2017",349.76,417.65,344.38,382.04,190117000,5423160000
515,"Sep 23, 2016",10.12,11.01,10.03,10.77,3564780,131137000
463,"Nov 14, 2016",7.11,8.13,7.09,7.71,5270540,94853000
657,"May 04, 2016",0.91748,0.950575,0.899844,0.916455,102053,10803000
1369,"May 22, 2014",1.59,2.19,1.36,2.1,132918,1371470
218,"Jul 17, 2017",29.47,34.98,29.46,34.83,13708900,435925000


In [7]:
# drop multiple columns in one go
df.drop(['Date', 'Volume', 'Market Cap'], axis='columns', inplace=True)

In [111]:
df.apply(pd.to_numeric)

In [98]:
df.head(5)

Unnamed: 0,Open,High,Low,Close
0,317.0,321.73,303.79,304.4
1,299.7,319.73,297.65,316.49
2,325.96,327.77,294.42,300.12
3,296.0,334.71,293.13,325.67
4,301.01,302.1,280.22,296.48


In [104]:
array = df.values

X = df[['Open', 'High', 'Low']]
y = np.asarray(df['Close'], dtype="|S6")

In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371 entries, 0 to 1370
Data columns (total 4 columns):
Open     1371 non-null float64
High     1371 non-null float64
Low      1371 non-null float64
Close    1371 non-null float64
dtypes: float64(4)
memory usage: 42.9 KB


In [106]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [108]:
# Feature selection using filter method chi squ
test = SelectKBest(score_func=chi2, k=3)  # 4 most important target variables
fit = test.fit(X,y)

In [109]:
np.set_printoptions(precision=3)
print(fit.scores_)

[262772.783 279883.561 239565.521]


In [19]:
y

array(['4,989,710,000', '4,716,460,000', '5,128,420,000', ...,
       '1,816,200', '1,371,470', '2,079,640'], dtype='<U13')

In [112]:
# Scale features
features = fit.transform(X)

In [113]:
# Feature selection - Filter Method - correlation coefficient 
features.dtype
features_df = pd.DataFrame(features)
features_df.corr()   # Get rid of variables with more than .7 correlation 

Unnamed: 0,0,1,2
0,1.0,0.997871,0.994916
1,0.997871,1.0,0.995701
2,0.994916,0.995701,1.0


In [114]:
# Feature Selection - Wrapper Method - RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [115]:
#  Create the logistic regression model 
model = LogisticRegression()
rfe = RFE(model, 3)    # chose top 3 feature

fit = rfe.fit(X,y)





In [116]:
print("selected features %s" %(fit.support_))
print("Feature ranking: %s" %(fit.ranking_))

selected features [ True  True  True]
Feature ranking: [1 1 1]


In [117]:
# Feature Selection - Embedded methods - regularization - ridge regression
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(X,y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [117]:
# function for the linear regression 
def pretty_print_coefs(coefs, names=None, sort=False):
    if names== None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key= lambda  x:-np.abs(x[0]))
    return " + ".join("%s * %s " %(round(coef, 3), name)
                      for coef, name in lst)

In [119]:
print("Ridge model: ", pretty_print_coefs(ridge.coef_))

Ridge model:  -0.432 * X0  + 0.91 * X1  + 0.506 * X2 


In [120]:
# Predictor variables
X = df[['Open', 'High', 'Low']]

In [121]:
X.info()

In [141]:
# Target variable
y = df['Close']
# y = np.asarray(df['Close'], dtype="|S6")

In [142]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [143]:
# create kNN model and fit model to training data

from sklearn import neighbors, linear_model

knn = neighbors.KNeighborsRegressor(n_neighbors=3)

knn_model = knn.fit(X_train, y_train)

In [144]:
# model evaluation
print( 'kNN accuracy for training set: %f' %knn_model.score(
    X_train, y_train))
print('knn accuracy for test set: %f' %knn_model.score(
    X_test, y_test))

kNN accuracy for training set: 0.998626
knn accuracy for test set: 0.997862


In [145]:
#Preprocessing predictor variables
from sklearn.preprocessing import scale
X_scaled = scale(X)

In [146]:
# train/test split
from sklearn.model_selection import train_test_split
X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [147]:
# kNN#2 using scaled data
knn_model2 = knn.fit(X_scaled_train, y_train)

In [148]:
print('kNN score for trainging set: ', knn_model2.score(X_scaled_train, y_train))
print('kNN score test set: ', knn_model2.score(X_scaled_test, y_test))

kNN score for trainging set:  0.9984999723039995
kNN score test set:  0.9978601780149005


In [160]:
y = np.asarray(df['Close'], dtype="|S6")

In [162]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

logreg = linear_model.LogisticRegression()

#fit model
lr = logreg.fit(X_scaled_train, y_train)



ValueError: Found input variables with inconsistent numbers of samples: [1096, 1371]

In [163]:
# create random forest classifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [165]:
clf_model = clf.fit(X_scaled_train, y_train)

ValueError: Number of labels=1371 does not match number of samples=1096