### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import grid_search
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge 
from sklearn.svm import LinearSVR



### Data Preprocessing

In [2]:
# Load the MKS price table from CSV. Later cells read its 'Open', 'High',
# 'Low' and 'Close' columns and rely on the rows being in chronological order
# (they take the most recent rows with .tail()) -- TODO confirm the file order.
mks = pd.read_csv("data/MKSfull.csv")

In [3]:
# Data preprocessing: discard every row that has no closing price.
# `nans` keeps the discarded rows so they can still be inspected afterwards.
nans = mks.loc[mks['Close'].isnull()]
mks.drop(index=nans.index, inplace=True)

### Create Features Dataframes

In [4]:
def create_dataframes(X_len, days, data=None):
    """Build sliding-window features and next-day targets from closing prices.

    The last ``X_len + days`` rows of the price table are used. Sample ``i``
    holds the ``days`` consecutive closes starting at offset ``i`` (columns
    ``'i+0'`` .. ``'i+<days-1>'``) and its target is the close of the row that
    immediately follows that window.

    Parameters
    ----------
    X_len : int
        Number of samples (rows) to produce.
    days : int
        Window length, i.e. number of lagged closes per sample (>= 1).
    data : pandas.DataFrame, optional
        Price table with a ``'Close'`` column. Defaults to the module-level
        ``mks`` frame, which is what the original implementation always used.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` has shape ``(X_len, days)``; ``y`` has shape ``(X_len, 1)`` with
        a single ``'Close'`` column. Both are float-valued and indexed
        ``0 .. X_len - 1``.

    Raises
    ------
    ValueError
        If ``days < 1``, ``X_len < 0`` or the table has fewer than
        ``X_len + days`` rows.
    """
    if data is None:
        data = mks
    if days < 1:
        raise ValueError('days must be at least 1, got %r' % (days,))
    if X_len < 0:
        raise ValueError('X_len must be non-negative, got %r' % (X_len,))

    needed = X_len + days
    closes = np.asarray(data['Close'].tail(needed), dtype=float)
    if len(closes) < needed:
        raise ValueError(
            'need %d rows to build %d samples of %d days, but only %d are available'
            % (needed, X_len, days, len(closes))
        )

    columns = ['i+%s' % str(j) for j in range(days)]
    # window_idx[i, j] == i + j: the position of the j-th day of sample i.
    window_idx = np.arange(X_len)[:, None] + np.arange(days)[None, :]

    # Build both frames in one shot instead of writing cell by cell through
    # chained indexing, which assigns into a temporary under Copy-on-Write.
    X = pd.DataFrame(closes[window_idx], index=range(X_len), columns=columns)
    # The target of sample i sits right after its window, at offset i + days.
    y = pd.DataFrame({'Close': closes[days:days + X_len]}, index=range(X_len))

    return X, y

In [5]:
def create_dataframes_ta(X_len, days, data=None):
    """Build sliding-window features plus averaged technical indicators.

    Works like a plain lagged-close window over the last ``X_len + days`` rows
    of the price table, and adds three per-window averages:

    * ``'Range'``            -- mean of ``High - Low``
    * ``'Change'``           -- mean of ``Close - Open``
    * ``'ChangePercentage'`` -- mean of ``(Close - Open) / Open * 100``

    The target of sample ``i`` is the close of the row that immediately
    follows its window.

    Parameters
    ----------
    X_len : int
        Number of samples (rows) to produce.
    days : int
        Window length, i.e. number of lagged closes per sample (>= 1).
    data : pandas.DataFrame, optional
        Price table with ``'Open'``, ``'High'``, ``'Low'`` and ``'Close'``
        columns. Defaults to the module-level ``mks`` frame, which is what the
        original implementation always used.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` has shape ``(X_len, days + 3)``; ``y`` has shape ``(X_len, 1)``
        with a single ``'Close'`` column. Both are float-valued and indexed
        ``0 .. X_len - 1``.

    Raises
    ------
    ValueError
        If ``days < 1``, ``X_len < 0`` or the table has fewer than
        ``X_len + days`` rows.
    """
    if data is None:
        data = mks
    if days < 1:
        raise ValueError('days must be at least 1, got %r' % (days,))
    if X_len < 0:
        raise ValueError('X_len must be non-negative, got %r' % (X_len,))

    needed = X_len + days
    use = data.tail(needed)
    if len(use) < needed:
        raise ValueError(
            'need %d rows to build %d samples of %d days, but only %d are available'
            % (needed, X_len, days, len(use))
        )

    close = np.asarray(use['Close'], dtype=float)
    open_ = np.asarray(use['Open'], dtype=float)
    high = np.asarray(use['High'], dtype=float)
    low = np.asarray(use['Low'], dtype=float)

    # Per-day indicators, computed once and then averaged per window.
    day_range = high - low
    day_change = close - open_
    day_change_perc = (day_change / open_) * 100

    close_columns = ['i+%s' % str(j) for j in range(days)]
    # window_idx[i, j] == i + j: the position of the j-th day of sample i.
    window_idx = np.arange(X_len)[:, None] + np.arange(days)[None, :]

    # Build the frame in one shot instead of writing cell by cell through
    # chained indexing, which assigns into a temporary under Copy-on-Write.
    X = pd.DataFrame(close[window_idx], index=range(X_len), columns=close_columns)
    X['Range'] = day_range[window_idx].mean(axis=1)
    X['Change'] = day_change[window_idx].mean(axis=1)
    X['ChangePercentage'] = day_change_perc[window_idx].mean(axis=1)

    # The target of sample i sits right after its window, at offset i + days.
    y = pd.DataFrame({'Close': close[days:days + X_len]}, index=range(X_len))

    return X, y

### Split train/test set

In [6]:
def split_train_test_set(X, y, test_size=0.3):
    """Split features and targets into leading (train) and trailing (test) parts.

    The split is positional and does not shuffle: the first
    ``int(len(X) * (1 - test_size))`` rows form the training set and the
    remaining rows form the test set.

    Parameters
    ----------
    X, y : sliceable sequences of equal length (e.g. DataFrames or lists)
        Features and targets.
    test_size : float, optional
        Fraction of the rows placed in the test set (default 0.3).

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length. Previously the string "Error" was
        returned, which callers unpacking four values could not handle.
    """
    if len(X) != len(y):
        raise ValueError(
            'X and y must have the same length, got %d and %d' % (len(X), len(y))
        )

    split_index = int(len(X) * (1 - test_size))
    return X[:split_index], X[split_index:], y[:split_index], y[split_index:]

### Define metrics

In [7]:
def rmse(test, pred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are flattened to 1-D float arrays before comparison, so the
    score is computed position by position regardless of whether a value
    arrives as a list, Series, single-column DataFrame, column array or flat
    array. Without this, an (n, 1) target against an (n,) prediction would
    broadcast to an (n, n) matrix, and two Series would be aligned on their
    index labels instead of their positions.

    Parameters
    ----------
    test : array-like
        Observed values.
    pred : array-like
        Predicted values; must contain as many elements as ``test``.

    Returns
    -------
    numpy.float64
        The scalar RMSE.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    actual = np.asarray(test, dtype=float).ravel()
    predicted = np.asarray(pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            'test and pred must have the same number of elements, got %d and %d'
            % (actual.size, predicted.size)
        )
    return np.sqrt(np.mean((actual - predicted) ** 2))

### Benchmark model, persistence

In [8]:
def identity(x):
    """Return the argument unchanged."""
    return x


def model_persistence(test_X, days):
    """Persistence baseline: predict each sample's last observed value.

    Reads the column named ``'i+<days-1>'`` (the final day of every window)
    from ``test_X`` and returns its values, in order, as a list.
    """
    last_day_column = 'i+%s' % str(days - 1)
    return [identity(observed) for observed in test_X[last_day_column]]

### Define, Train Regressors: Linear, Ridge, Linear SVM

In [9]:
def get_linear_model(X, y):
    """Grid-search a LinearRegression and return the best fitted estimator.

    The search covers ``fit_intercept``, ``normalize`` and ``copy_X``. Any of
    those names that the installed ``LinearRegression`` does not expose as a
    constructor parameter is left out of the grid, so the search still runs on
    scikit-learn releases that no longer accept ``normalize`` while behaving
    exactly as before on releases that do.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    regressor = LinearRegression()
    candidate_grid = {
        'fit_intercept': [True, False],
        'normalize': [True, False],
        'copy_X': [True, False],
    }
    # Passing an unknown parameter name makes the grid search fail, so keep
    # only the names this estimator version actually supports.
    supported = regressor.get_params()
    parameters = {
        name: values for name, values in candidate_grid.items() if name in supported
    }
    # NOTE(review): the search uses GridSearchCV's default cross-validation
    # splitter; confirm that is appropriate for time-ordered samples.
    grid = GridSearchCV(regressor, parameters)
    grid.fit(X, y)
    return grid.best_estimator_

def get_ridge_model(X, y):
    """Fit a Ridge regressor with its default settings on (X, y) and return it."""
    fitted = Ridge().fit(X, y)
    return fitted

def get_lsvr_model(X, y):
    """Fit a LinearSVR with its default settings and return it.

    ``y`` must expose ``.values`` (e.g. a DataFrame); it is flattened to a
    1-D array before being handed to the estimator.
    """
    flat_targets = y.values.ravel()
    return LinearSVR().fit(X, flat_targets)

### Grid Search for days, data set size, usage of technical analysis features

In [10]:
def _flat_rmse(actual, predicted):
    """Score with ``rmse`` on flattened 1-D float arrays and return a float.

    Flattening both sides first makes the score independent of whether the
    targets/predictions arrive as a DataFrame, a column array or a flat array.
    """
    return float(rmse(np.asarray(actual, dtype=float).ravel(),
                      np.asarray(predicted, dtype=float).ravel()))


result_columns = ['UseTA', 'days', 'X_size', 'Benchmark', 'Linear', 'Linear_Train',
                  'Ridge', 'Ridge_Train', 'LSVR', 'LSVR_Train']

# One record per (use_ta, days, X_size) combination. The results frame is built
# once after the loop instead of being filled through chained indexing
# (result.iloc[i][col] = ...), which writes to a temporary under Copy-on-Write.
records = []
i = 0
for use_ta in [True, False]:
    for days in [10, 15, 20]:
        for X_size in [100, 300, 500, 800, 1000, 1200, 1500, 2000, 3000, 7000]:
            print('iteration {0}'.format(i))
            if use_ta:
                X, y = create_dataframes_ta(X_size, days)
            else:
                X, y = create_dataframes(X_size, days)

            X_train, X_test, y_train, y_test = split_train_test_set(X, y)

            # Baseline: tomorrow's close equals the last close in the window.
            benchmark_pred = model_persistence(X_test, days)
            benchmark_err = _flat_rmse(y_test['Close'], benchmark_pred)

            linear_model = get_linear_model(X_train, y_train)
            lr_pred = linear_model.predict(X_test)
            lr_err = _flat_rmse(y_test, lr_pred)
            lr_pred_train = linear_model.predict(X_train)
            lr_err_train = _flat_rmse(y_train, lr_pred_train)

            ridge_model = get_ridge_model(X_train, y_train)
            rr_pred = ridge_model.predict(X_test)
            rr_err = _flat_rmse(y_test, rr_pred)
            rr_pred_train = ridge_model.predict(X_train)
            rr_err_train = _flat_rmse(y_train, rr_pred_train)

            lsvr_model = get_lsvr_model(X_train, y_train)
            lsvr_pred = lsvr_model.predict(X_test)
            lsvr_err = _flat_rmse(y_test, lsvr_pred)
            lsvr_pred_train = lsvr_model.predict(X_train)
            lsvr_err_train = _flat_rmse(y_train, lsvr_pred_train)

            record = {
                'UseTA': str(use_ta),
                'days': days,
                'X_size': X_size,
                'Benchmark': benchmark_err,
                'Linear': lr_err,
                'Linear_Train': lr_err_train,
                'Ridge': rr_err,
                'Ridge_Train': rr_err_train,
                'LSVR': lsvr_err,
                'LSVR_Train': lsvr_err_train,
            }
            records.append(record)

            # Same per-iteration progress report as before: one labelled row.
            print(pd.Series(record, index=result_columns, name=i, dtype=object))
            i = i + 1

result = pd.DataFrame(records, columns=result_columns)
result

iteration 0
UseTA              True
days                 10
X_size              100
Benchmark       4.09834
Linear          5.04258
Linear_Train    3.60864
Ridge           5.06587
Ridge_Train     3.44433
LSVR            5.03487
LSVR_Train      4.76473
Name: 0, dtype: object
iteration 1
UseTA              True
days                 10
X_size              300
Benchmark       4.05541
Linear          4.13512
Linear_Train    4.98057
Ridge           4.10876
Ridge_Train      4.9684
LSVR            4.62711
LSVR_Train       5.4628
Name: 1, dtype: object
iteration 2
UseTA              True
days                 10
X_size              500
Benchmark       4.44984
Linear          4.77824
Linear_Train    7.01418
Ridge           4.78602
Ridge_Train     7.01431
LSVR            11.1618
LSVR_Train      12.9177
Name: 2, dtype: object
iteration 3
UseTA              True
days                 10
X_size              800
Benchmark       4.56521
Linear          4.83362
Linear_Train    7.37212
Ridge           4.8

UseTA             False
days                 10
X_size              100
Benchmark       4.09834
Linear          4.93059
Linear_Train    3.53602
Ridge           4.92935
Ridge_Train     3.53602
LSVR             4.8932
LSVR_Train      3.89018
Name: 30, dtype: object
iteration 31
UseTA             False
days                 10
X_size              300
Benchmark       4.05541
Linear          4.09929
Linear_Train    4.97364
Ridge           4.09933
Ridge_Train     4.97364
LSVR            4.35012
LSVR_Train      5.20939
Name: 31, dtype: object
iteration 32
UseTA             False
days                 10
X_size              500
Benchmark       4.44984
Linear          4.53445
Linear_Train    7.12103
Ridge           4.53443
Ridge_Train     7.12103
LSVR            7.91913
LSVR_Train      10.3383
Name: 32, dtype: object
iteration 33
UseTA             False
days                 10
X_size              800
Benchmark       4.56521
Linear             4.69
Linear_Train    7.42478
Ridge           4.68737
R

Unnamed: 0,UseTA,days,X_size,Benchmark,Linear,Linear_Train,Ridge,Ridge_Train,LSVR,LSVR_Train
0,True,10,100,4.09834,5.04258,3.60864,5.06587,3.44433,5.03487,4.76473
1,True,10,300,4.05541,4.13512,4.98057,4.10876,4.9684,4.62711,5.4628
2,True,10,500,4.44984,4.77824,7.01418,4.78602,7.01431,11.1618,12.9177
3,True,10,800,4.56521,4.83362,7.37212,4.80212,7.37316,4.79076,7.47794
4,True,10,1000,4.79599,5.08474,7.213,5.04733,7.21312,5.02884,7.32849
5,True,10,1200,5.12383,5.32725,7.26221,5.357,7.25792,9.56173,13.2062
6,True,10,1500,6.52561,6.54513,6.38926,6.54922,6.38931,10.0634,11.3874
7,True,10,2000,6.72916,6.72558,6.04491,6.72431,6.04089,7.95609,7.28381
8,True,10,3000,6.78857,6.82093,7.75065,6.82091,7.75065,7.3322,8.32901
9,True,10,7000,6.28203,6.28725,8.02582,6.28425,8.02188,11.6684,13.9245
