In [1]:
import numpy as np
import pandas as pd

In [2]:
mks = pd.read_csv("data/TSCO.csv")

In [3]:
#data preprocessing 
#remove NaN values
nans = mks[mks['Close'].isnull()]
mks.drop(nans.index, inplace=True) 

In [4]:
# Initialize a data frames to hold the Features and the value
X_len = 900
days = 10

Χ_columns = []
for j in range(days):
    Χ_columns.append('i+%s' % str(j))
   
X = pd.DataFrame(index=range(X_len),columns = Χ_columns)
y = pd.DataFrame(index=range(X_len), columns = ['Close'])
for i in range(X_len):
    for j in range(days):
        X.iloc[i]['i+%s' % str(j)] = mks.iloc[i + j]['Close']
    y.iloc[i] = mks.iloc[(i + j + 1)]['Close']
v = pd.concat([X, y], axis=1)
v.head()

Unnamed: 0,i+0,i+1,i+2,i+3,i+4,i+5,i+6,i+7,i+8,i+9,Close
0,293.4,292.65,291.7,287.4,283.3,287.85,284.2,284.05,281.05,289.35,286.3
1,292.65,291.7,287.4,283.3,287.85,284.2,284.05,281.05,289.35,286.3,293.8
2,291.7,287.4,283.3,287.85,284.2,284.05,281.05,289.35,286.3,293.8,289.7
3,287.4,283.3,287.85,284.2,284.05,281.05,289.35,286.3,293.8,289.7,292.4
4,283.3,287.85,284.2,284.05,281.05,289.35,286.3,293.8,289.7,292.4,297.35


In [5]:
def split_train_test_set(X, y, test_size=0.2):  
    # Train-test split
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    return X_train, X_test, y_train, y_test

In [6]:
# Import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error

def rmsp(test, pred):
    return np.sqrt(np.mean(((test - pred)/test)**2)) * 100

def rmse(test, pred):
    return np.sqrt(((test - pred) ** 2).mean())

def print_metrics(test, pred):
    print('RMSE %f'%  rmse(test, pred))
    print('Root Mean Squared Percentage Error %f'% rmsp(test, pred))
    print('Mean Absolute Error: %f'% mean_absolute_error(test, pred))
    print('Explained Variance Score: %f'% explained_variance_score(test, pred))
    print('Mean Squared Error: %f'% mean_squared_error(test, pred))
    print('R2 score: %f'% r2_score(test, pred))
    print('')

def create_cv_sets(n_splits = 3, size = 300):
    cv_sets = []
    for index in range(n_splits):
        X_train, X_test, y_train, y_test = split_train_test_set(X[index:index + size], y[index:index + size])    
        cv_sets.append([X_train, X_test, y_train, y_test])
    return cv_sets 

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn import grid_search
from sklearn.model_selection import GridSearchCV
def fit_model(X, y):
    regressor = LinearRegression()
    parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
    grid = GridSearchCV(regressor, parameters)
    grid.fit(X_train, y_train)
    return grid.best_estimator_



In [8]:
regressors = []
for X_train, X_test, y_train, y_test in create_cv_sets():
    regressor = fit_model(X_train, y_train)
    regressors.append(regressor)
    pred = regressor.predict(X_test)
    #print("X_train.head(): ", X_train.head())
    #print("X_train.tail(): ", X_train.tail())
    #print("Pred: ", pred[:5])
    #print("Test: ", y_test[:5]) 
    #print(y_test['Close'])
    print_metrics(y_test, pred)  

RMSE 3.438699
Root Mean Squared Percentage Error 1.520920
Mean Absolute Error: 2.626410
Explained Variance Score: 0.930810
Mean Squared Error: 11.824648
R2 score: 0.930776

RMSE 3.456071
Root Mean Squared Percentage Error 1.529108
Mean Absolute Error: 2.656870
Explained Variance Score: 0.929012
Mean Squared Error: 11.944424
R2 score: 0.929003

RMSE 3.525891
Root Mean Squared Percentage Error 1.561427
Mean Absolute Error: 2.704144
Explained Variance Score: 0.923932
Mean Squared Error: 12.431907
R2 score: 0.923918



In [9]:
regressors[0].coef_

array([[-0.08150452,  0.10709057, -0.1203331 ,  0.11904607, -0.06614372,
         0.08690841, -0.0706236 ,  0.0831637 , -0.19804524,  1.13946471]])

In [10]:
from sklearn.ensemble import RandomForestRegressor

def random_forest(X_train, X_test, y_train, y_test):
    # Classify and predict
    reg = RandomForestRegressor()
    reg.fit(X_train, y_train.values.ravel())
    pred = reg.predict(X_test)
    #print("Pred: ", pred[:5])
    #print("Test: ", y_test[:5])
    print_metrics(y_test.values.ravel(), pred)
    return rmse(y_test.values.ravel(), pred)

In [11]:
for X_train, X_test, y_train, y_test in create_cv_sets():
    random_forest(X_train,X_test,y_train,y_test)

RMSE 5.001888
Root Mean Squared Percentage Error 2.271888
Mean Absolute Error: 3.828917
Explained Variance Score: 0.856101
Mean Squared Error: 25.018887
R2 score: 0.853534

RMSE 5.116621
Root Mean Squared Percentage Error 2.344214
Mean Absolute Error: 3.749584
Explained Variance Score: 0.844477
Mean Squared Error: 26.179813
R2 score: 0.844389

RMSE 5.410552
Root Mean Squared Percentage Error 2.521919
Mean Absolute Error: 4.262917
Explained Variance Score: 0.858181
Mean Squared Error: 29.274075
R2 score: 0.820846



In [12]:
from sklearn.svm import LinearSVR
def svr(X_train, X_test, y_train, y_test):
    # Classify and predict
    reg = LinearSVR()
    reg.fit(X_train, y_train.values.ravel())
    pred = reg.predict(X_test)

    #print("Pred: ", pred[:5])
    #print("Test: ", y_test[:5])

    print_metrics(y_test.values.ravel(), pred)
    return rmse(y_test.values.ravel(), pred)

In [13]:
for X_train, X_test, y_train, y_test in create_cv_sets(): 
    svr(X_train,X_test,y_train,y_test)

RMSE 4.602475
Root Mean Squared Percentage Error 2.032721
Mean Absolute Error: 3.673274
Explained Variance Score: 0.931794
Mean Squared Error: 21.182780
R2 score: 0.875991

RMSE 26.527930
Root Mean Squared Percentage Error 11.817048
Mean Absolute Error: 26.293026
Explained Variance Score: 0.926248
Mean Squared Error: 703.731076
R2 score: -3.182926

RMSE 4.492130
Root Mean Squared Percentage Error 2.001760
Mean Absolute Error: 3.595170
Explained Variance Score: 0.922428
Mean Squared Error: 20.179231
R2 score: 0.876506



In [14]:
last = np.array([293.399994,292.649994,291.700012,287.399994,283.299988,287.850006,284.200012,284.049988,281.049988,289.350006]).reshape(1,-1)
pred = regressor.predict(last)
pred

array([[290.72658768]])

In [15]:
last = np.array([316.200012, 316.299988,314.799988,316.100006,320.5,308.799988,313.700012,310.80,318.30,324.00]).reshape(1,-1)
pred = regressor.predict(last)
pred


array([[322.92203994]])