### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import grid_search
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge 
from sklearn.svm import LinearSVR



### Data Preprcessing

In [2]:
mks = pd.read_csv("data/MKSfull.csv")

In [3]:
#data preprocessing 
#remove NaN values
nans = mks[mks['Close'].isnull()]
mks.drop(nans.index, inplace=True) 

### Create Features Dataframes

In [4]:
def create_dataframes(X_len, days):
    use = mks.tail(X_len + days)
    Χ_columns = []
    for j in range(days):
        Χ_columns.append('i+%s' % str(j))

    X = pd.DataFrame(index=range(X_len),columns = Χ_columns)
    y = pd.DataFrame(index=range(X_len), columns = ['Close'])

    for i in range(X_len):
        for j in range(days):
            X.iloc[i]['i+%s' % str(j)] = use.iloc[i + j]['Close']
        y.iloc[i] = use.iloc[(i + j + 1)]['Close']
        
    return X, y

In [5]:
def create_dataframes_ta(X_len, days):
    use = mks.tail(X_len + days)
    Χ_columns = []
    for j in range(days):
        Χ_columns.append('i+%s' % str(j))
    
    Χ_columns.append('Range')
    Χ_columns.append('Change')
    Χ_columns.append('ChangePercentage')
    
    X = pd.DataFrame(index=range(X_len),columns = Χ_columns)
    y = pd.DataFrame(index=range(X_len), columns = ['Close'])

    for i in range(X_len):
        stock_range = []
        stock_change = []
        stock_change_perc = []        
        for j in range(days):
            X.iloc[i]['i+%s' % str(j)] = use.iloc[i + j]['Close']
            stock_range.append(use.iloc[i + j]['High'] - use.iloc[i + j]['Low'])
            stock_change.append(use.iloc[i + j]['Close'] - use.iloc[i + j]['Open'])
            stock_change_perc.append(((use.iloc[i + j]['Close'] - use.iloc[i + j]['Open']) / use.iloc[i + j]['Open'])* 100) 

        X.iloc[i]['Range'] = np.mean(stock_range)
        X.iloc[i]['Change'] = np.mean(stock_change)
        X.iloc[i]['ChangePercentage'] = np.mean(stock_change_perc)
        y.iloc[i] = use.iloc[(i + j + 1)]['Close']
        
    return X, y

### Split train/test set

In [6]:
def split_train_test_set(X, y, test_size=0.2):  
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    return X_train, X_test, y_train, y_test

### Define metrics

In [7]:
def rmse(test, pred):
    return np.sqrt(((test - pred) ** 2).mean())

### Benchmark model, persistence

In [8]:
def identity(x):
    return x

def model_persistence(test_X, days):
    predictions = list()
    for x in test_X['i+%s' % str(days-1)]:
        yhat = identity(x)
        predictions.append(yhat)
    return predictions

### Define, Train Regressors

In [9]:
def get_linear_model(X, y):
    regressor = LinearRegression()
    parameters = {'fit_intercept':[True, False], 'normalize':[True, False], 'copy_X':[True, False]}
    grid = GridSearchCV(regressor, parameters)
    grid.fit(X_train, y_train)
    return grid.best_estimator_

def get_ridge_model(X, y):
    r_reg = Ridge()
    r_reg.fit(X, y)
    return r_reg

def get_lsvr_model(X, y):
    svr = LinearSVR()
    svr.fit(X, y.values.ravel())
    return svr

### Grid Search for days, data set size, usage of technical analysis features

In [10]:
i = 0
for use_ta in [True, False]:
    result = pd.DataFrame(index=range(9),columns = ['UseTA', 'days', 'X_size','Benchmark', 'Linear','Ridge','LSVR'])
    for days in [10, 15, 20]:
        for X_size in [1000, 3000, 7000]:
            if use_ta: 
                X,y = create_dataframes_ta(X_size, days)
            else: 
                X,y = create_dataframes(X_size, days)

            X_train, X_test, y_train, y_test = split_train_test_set(X, y)

            benchmark_pred = model_persistence(X_test,days )
            benchmark_err = rmse(y_test['Close'], benchmark_pred)

            linear_model = get_linear_model(X_train, y_train)
            lr_pred = linear_model.predict(X_test)
            lr_err = rmse(y_test, lr_pred)

            rr_pred = get_ridge_model(X_train, y_train).predict(X_test)
            rr_err= rmse(y_test, rr_pred)

            lsvr_pred = get_lsvr_model(X_train, y_train).predict(X_test)
            lsvr_err = rmse(y_test.values.ravel(), lsvr_pred)

            result.iloc[i]['UseTA'] = use_ta
            result.iloc[i]['days'] = days
            result.iloc[i]['X_size'] = X_size
            result.iloc[i]['Benchmark'] = benchmark_err
            result.iloc[i]['Linear'] = float(lr_err)
            result.iloc[i]['Ridge'] = float(rr_err)
            result.iloc[i]['LSVR'] = float(lsvr_err)
            #result.iloc[i]['coefs'] = linear_model.coef_
            i = i + 1
            
result

Using Techical Analysis
  days X_size Benchmark   Linear    Ridge     LSVR
0   10   1000   4.73516  5.02061  4.97879  7.63729
1   15   1000   4.73516  4.94254   4.9137  4.98641
2   20   1000   4.73516  5.00514  4.98203  4.96606
3  NaN    NaN       NaN      NaN      NaN      NaN
4  NaN    NaN       NaN      NaN      NaN      NaN
5  NaN    NaN       NaN      NaN      NaN      NaN
6  NaN    NaN       NaN      NaN      NaN      NaN
7  NaN    NaN       NaN      NaN      NaN      NaN
8  NaN    NaN       NaN      NaN      NaN      NaN
NOT Using Techical Analysis
  days X_size Benchmark   Linear    Ridge     LSVR
0   10   1000   4.73516  4.84776  4.84438  4.86147
1   15   1000   4.73516   4.8504  4.84612  5.03705
2   20   1000   4.73516  4.93373  4.92866  4.97872
3  NaN    NaN       NaN      NaN      NaN      NaN
4  NaN    NaN       NaN      NaN      NaN      NaN
5  NaN    NaN       NaN      NaN      NaN      NaN
6  NaN    NaN       NaN      NaN      NaN      NaN
7  NaN    NaN       NaN      N