# Stock Movement Prediction
Dataset: [S&P 500](https://www.sharecast.com/index/SP_500/prices/download)<br>
Goal: Predict whether the **close price** rises or falls compared with the previous trading day

Implemented with three methods:
1. Logistic Regression
2. SVM
3. Neural Network(LSTM)

In [1]:
from sklearn import linear_model, metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
from datetime import datetime
from sklearn.metrics import accuracy_score

### Build training and testing data

In [2]:
# Load the training and testing splits from CSV files; the relative paths are
# resolved against the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
def build_data(df):
    """Derive calendar features and a binary 'Rise' label from raw price rows.

    Args:
        df: DataFrame with a 'Date' column that ``pd.to_datetime`` can parse
            and a numeric 'Close Price' column. Rows are expected to be in
            chronological order. The frame itself is left unmodified.

    Returns:
        A new DataFrame without the 'Date' column and with these columns
        appended: 'year', 'month', 'date' (day of the month), 'day' (day of
        the week, Monday = 0) and 'Rise' (1 when the row's close price is
        greater than or equal to the previous row's, otherwise 0; the first
        row has no predecessor and is always 0).
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    df['Date'] = pd.to_datetime(df['Date'])
    df['year'] = df['Date'].dt.year
    df['month'] = df['Date'].dt.month
    df['date'] = df['Date'].dt.day
    df['day'] = df['Date'].dt.dayofweek
    df = df.drop(['Date'], axis=1)

    # Compare every close with the previous row's close in one vectorised
    # step. shift(1) puts NaN in the first row and any comparison against NaN
    # is False, so the first row is labelled 0.
    close = df['Close Price']
    rise = (close >= close.shift(1)).astype(int)

    # Assign by position (plain array) so the result does not depend on the
    # frame having a default 0..n-1 index.
    df['Rise'] = np.asarray(rise)
    return df
# Replace the raw frames with their engineered versions (calendar columns
# plus the 'Rise' label, 'Date' removed).
train = build_data(train)
test = build_data(test)
# Preview the first rows of the engineered training frame.
train.head()

Unnamed: 0,Open Price,Close Price,High Price,Low Price,Volume,year,month,date,day,Rise
0,902.99,931.8,934.73,899.35,4048270080,2009,1,2,4,0
1,929.17,927.45,936.63,919.53,5413910016,2009,1,5,0,0
2,931.17,934.7,943.85,927.28,5392620032,2009,1,6,1,1
3,927.45,906.65,927.45,902.37,4704940032,2009,1,7,2,0
4,905.73,909.73,910.0,896.81,4991549952,2009,1,8,3,1


產生training和testing資料，Rise是要預測的label，所以單獨切出來。<br>

In [4]:
# Create labels: 'Rise' is the binary target, extracted as NumPy arrays.
train_y = train['Rise'].values
test_y = test['Rise'].values

# NOTE(review): these are aliases of the full frames, not copies, so they
# still contain the 'Rise' column. The feature windows built further down
# read only 'Close Price', so the label does not leak into the features there.
train_x = train
test_x = test

In [5]:
# Sanity check: one shape per line, in the order
# train features, train labels, test features, test labels.
print(
    *(part.shape for part in (train_x, train_y, test_x, test_y)),
    sep="\n",
)

(2264, 10)
(2264,)
(271, 10)
(271,)


## Logistic Regression
x_train: 利用前2天的Close Price作為Features，shape為(2262, 2)

In [6]:
# Number of consecutive past days whose close prices form one feature vector.
PASTDAY = 2
x_train = []
x_test = []


def past_month(df, out, pastday=None):
    """Append sliding windows of consecutive 'Close Price' values to ``out``.

    Despite the name, the window length is counted in rows (days), not
    months. For a frame with ``n`` rows, ``n - pastday`` windows are appended;
    window ``k`` holds the close prices of rows ``k .. k + pastday - 1``.
    Nothing is appended when the frame has ``pastday`` rows or fewer.

    Args:
        df: DataFrame with a 'Close Price' column, rows in chronological
            order.
        out: List that receives one 1-D NumPy array per window (mutated in
            place).
        pastday: Window length. Defaults to the module-level ``PASTDAY``.

    Returns:
        None; results are delivered through ``out``.
    """
    if pastday is None:
        pastday = PASTDAY

    # Extract the column once and slice by position: this avoids building a
    # DataFrame slice per iteration and works for any index, not just 0..n-1.
    closes = np.asarray(df['Close Price'])
    for start in range(len(closes) - pastday):
        # Copy so each window is independent of the source column's buffer.
        out.append(closes[start:start + pastday].copy())

# Fill the feature lists with windows of PASTDAY consecutive close prices.
past_month(train_x, x_train)
past_month(test_x, x_test)
# Stack the per-sample windows into 2-D arrays of shape (n_samples, PASTDAY).
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
# Drop the first PASTDAY labels so that label k belongs to the day right
# after window k. NOTE: re-running these two lines trims the labels again.
train_y = train_y[PASTDAY:]
test_y = test_y[PASTDAY:]

In [7]:
# Confirm that every feature matrix has as many rows as its label vector.
print("\n".join(
    "{} {}".format(tag, arr.shape)
    for tag, arr in (
        ("x_train:", x_train),
        ("train_y:", train_y),
        ("x_test:", x_test),
        ("test_y:", test_y),
    )
))

x_train: (2262, 2)
train_y: (2262,)
x_test: (269, 2)
test_y: (269,)


In [8]:
# Fit a logistic-regression classifier on the (unscaled) close-price windows
# and predict the rise/fall label for every test window.
clf = linear_model.LogisticRegression(solver='lbfgs')
clf.fit(x_train, train_y)
y_pred = clf.predict(x_test)
# Optional visual comparison of predicted vs. real labels (disabled):
#plt.plot(y_pred, label='Predict')
#plt.plot(test_y, label='Real')
#plt.grid()
#plt.legend(loc='lower left')
#plt.show()
# accuracy_score gives the fraction of predictions equal to the true label
# (this replaces an earlier hand-written counting loop); scale to percent.
acc_rate = accuracy_score(test_y, y_pred) * 100
print("Accuracy = %.2f %%" % acc_rate)

Accuracy = 54.65 %


## SVM

In [9]:
from sklearn.svm import SVC

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training windows only, then apply
# that same transformation to both splits so the test data never influences
# the fitted scaler.
scaler = StandardScaler().fit(x_train)

x_train_std, x_test_std = (
    scaler.transform(split) for split in (x_train, x_test)
)

In [11]:
# Custom kernel function
def my_kernel(X, Y):
    """Return the matrix of pairwise dot products between rows of X and Y.

    For 2-D inputs of shapes (n, d) and (m, d) the result has shape (n, m),
    i.e. a linear kernel's Gram matrix.
    """
    gram = np.dot(X, np.transpose(Y))
    return gram


# Value passed as the C parameter of the SVM.
penalty = 0.05

In [12]:
# Train a support-vector classifier with the built-in linear kernel on the
# standardised windows. C is set from `penalty`; probability=True asks SVC to
# also fit probability estimates.
svm = SVC(C=penalty, kernel="linear", probability=True)
svm.fit(x_train_std, train_y)

SVC(C=0.05, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
# Score the SVM on the standardised test windows; accuracy_score returns a
# fraction, so multiply by 100 to report a percentage.
acc_rate = accuracy_score(test_y, svm.predict(x_test_std)) * 100
print("Penalty = %.2f, Accuracy = %.2f %%" % (penalty, acc_rate))

Penalty = 0.05, Accuracy = 52.79 %


## Neural Network(LSTM)