In [1]:
import pandas as pd
import pickle

In [2]:
df = pd.read_csv('all_stocks_5yr.csv')
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [3]:
df['target'] = 0
for idx, row in df.iterrows():
    today_close = df.iloc[idx]['close']
    try:
        tomorrow_high = df.iloc[idx+1]['high'] 
    except IndexError:
        break
    if (0.99*tomorrow_high) <= today_close:
        df.iat[idx, 7] = 0
    else:
        df.iat[idx, 7] = 1

In [4]:
split_date ='2017-05-01'
df_training = df.loc[df['date'] <= split_date]
df_test = df.loc[df['date'] > split_date]
len(df_training), len(df_test)

(520897, 98143)

In [5]:
y_train = pd.DataFrame(df_training['target'].reset_index().fillna(0))
x_train = df_training.drop(['target', 'date', 'Name'], 1).reset_index().fillna(0)
y_test = df_test['target'].reset_index().fillna(0)
x_test = pd.DataFrame(df_test.drop(['target', 'date', 'Name'], 1).reset_index().fillna(0))

In [6]:
y_train = y_train.values[:,1]
x_train = x_train.values[:,1:]
y_test = y_test.values[:,1]
x_test = x_test.values[:,1:]
pickle.dump((x_train, y_train, x_test, y_test), open("stock_data.pickle",'wb'))   

In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report

In [8]:
classifiers = [
    GaussianNB(),
    #  RidgeClassifier(tol=1e-2, solver="lsqr"),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=5),
    KNeighborsClassifier(3, n_jobs=-1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=-1),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    # SVC(kernel="linear", C=0.025, probability=True),
    # SVC(gamma=2, C=1, probability=True),
    # SVC(),
    MLPClassifier(alpha=1),
    # GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1),
]

In [9]:
for clf in classifiers:
    print('_' * 80)
    print(clf.__class__.__name__)
    clf.fit(x_train, y_train)
    print('Train/test accuracy: ', clf.score(x_train, y_train), clf.score(x_test, y_test))
    print('Classification report of Test data')
    print(classification_report(y_test, clf.predict(x_test)))

________________________________________________________________________________
GaussianNB
Train/test accuracy:  0.609108902527755 0.6490427233730373
Classification report of Test data
              precision    recall  f1-score   support

           0       0.66      0.97      0.78     63974
           1       0.46      0.04      0.08     34169

   micro avg       0.65      0.65      0.65     98143
   macro avg       0.56      0.51      0.43     98143
weighted avg       0.59      0.65      0.54     98143

________________________________________________________________________________
QuadraticDiscriminantAnalysis
Train/test accuracy:  0.6153634979660086 0.6520281629866623
Classification report of Test data
              precision    recall  f1-score   support

           0       0.67      0.94      0.78     63974
           1       0.50      0.12      0.20     34169

   micro avg       0.65      0.65      0.65     98143
   macro avg       0.58      0.53      0.49     98143
weighted 

  'precision', 'predicted', average, warn_for)
