In [1]:
import numpy as np
import pandas as pd

In [2]:
mks = pd.read_csv("data/MKS.csv")

In [3]:
#data preprocessing 
#remove NaN values
nans = mks[mks['Close'].isnull()]
mks.drop(nans.index, inplace=True) 

In [4]:
# Initialize a data frames to hold the Features and the value
X_len = 900
days = 10

Χ_columns = []
for j in range(days):
    Χ_columns.append('i+%s' % str(j))
   
X = pd.DataFrame(index=range(X_len),columns = Χ_columns)
y = pd.DataFrame(index=range(X_len), columns = ['Close'])
for i in range(X_len):
    for j in range(days):
        X.iloc[i]['i+%s' % str(j)] = mks.iloc[i + j]['Close']
    y.iloc[i] = mks.iloc[(i + j + 1)]['Close']
v = pd.concat([X, y], axis=1)
v.head()

Unnamed: 0,i+0,i+1,i+2,i+3,i+4,i+5,i+6,i+7,i+8,i+9,Close
0,459.8,469.9,471.6,461.9,452.9,453.3,456.0,442.0,435.5,426.7,416.6
1,469.9,471.6,461.9,452.9,453.3,456.0,442.0,435.5,426.7,416.6,427.7
2,471.6,461.9,452.9,453.3,456.0,442.0,435.5,426.7,416.6,427.7,434.5
3,461.9,452.9,453.3,456.0,442.0,435.5,426.7,416.6,427.7,434.5,442.3
4,452.9,453.3,456.0,442.0,435.5,426.7,416.6,427.7,434.5,442.3,440.2


In [5]:
def split_train_test_set(X, y, test_size=0.2):  
    # Train-test split
    if len(X) != len(y):
        return "Error"
    split_index = int(len(X) * (1-test_size))
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    return X_train, X_test, y_train, y_test

In [6]:
# Import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error

def rmsp(test, pred):
    return np.sqrt(np.mean(((test - pred)/test)**2)) * 100

def rmse(test, pred):
    return np.sqrt(((test - pred) ** 2).mean())

def print_metrics(test, pred):
    print('RMSE %f'%  rmse(test, pred))
    print('Root Mean Squared Percentage Error %f'% rmsp(test, pred))
    print('Mean Absolute Error: %f'% mean_absolute_error(test, pred))
    print('Explained Variance Score: %f'% explained_variance_score(test, pred))
    print('Mean Squared Error: %f'% mean_squared_error(test, pred))
    print('R2 score: %f'% r2_score(test, pred))
    print('')

def create_cv_sets(n_splits = 3, size = 300):
    cv_sets = []
    for index in range(n_splits):
        X_train, X_test, y_train, y_test = split_train_test_set(X[index:index + size], y[index:index + size])    
        cv_sets.append([X_train, X_test, y_train, y_test])
    return cv_sets 

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn import grid_search
from sklearn.model_selection import GridSearchCV
def fit_model(X, y):
    regressor = LinearRegression()
    parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
    grid = GridSearchCV(regressor, parameters)
    grid.fit(X_train, y_train)
    return grid.best_estimator_



In [8]:
regressors = []
for X_train, X_test, y_train, y_test in create_cv_sets():
    regressor = fit_model(X_train, y_train)
    regressors.append(regressor)
    pred = regressor.predict(X_test)
    #print("X_train.head(): ", X_train.head())
    #print("X_train.tail(): ", X_train.tail())
    #print("Pred: ", pred[:5])
    #print("Test: ", y_test[:5]) 
    #print(y_test['Close'])
    print_metrics(y_test, pred)  

RMSE 6.500098
Root Mean Squared Percentage Error 1.156586
Mean Absolute Error: 4.574820
Explained Variance Score: 0.829707
Mean Squared Error: 42.251269
R2 score: 0.829559

RMSE 6.597446
Root Mean Squared Percentage Error 1.173894
Mean Absolute Error: 4.736517
Explained Variance Score: 0.811301
Mean Squared Error: 43.526294
R2 score: 0.811288

RMSE 6.518486
Root Mean Squared Percentage Error 1.157825
Mean Absolute Error: 4.657421
Explained Variance Score: 0.807487
Mean Squared Error: 42.490660
R2 score: 0.807339



In [9]:
regressors[0].coef_

array([[ 0.07465039, -0.01583566, -0.00959529, -0.05746968,  0.15631765,
        -0.12891805, -0.08020704,  0.07736399, -0.0873028 ,  1.07202876]])

In [10]:
from sklearn.ensemble import RandomForestRegressor

def random_forest(X_train, X_test, y_train, y_test):
    # Classify and predict
    reg = RandomForestRegressor()
    reg.fit(X_train, y_train.values.ravel())
    pred = reg.predict(X_test)
    #print("Pred: ", pred[:5])
    #print("Test: ", y_test[:5])
    print_metrics(y_test.values.ravel(), pred)
    return rmse(y_test.values.ravel(), pred)

In [11]:
for X_train, X_test, y_train, y_test in create_cv_sets():
    random_forest(X_train,X_test,y_train,y_test)

RMSE 31.230246
Root Mean Squared Percentage Error 5.418819
Mean Absolute Error: 27.315000
Explained Variance Score: 0.004139
Mean Squared Error: 975.328250
R2 score: -2.934455

RMSE 33.940910
Root Mean Squared Percentage Error 5.899361
Mean Absolute Error: 30.410833
Explained Variance Score: -0.023047
Mean Squared Error: 1151.985375
R2 score: -3.994541

RMSE 32.893139
Root Mean Squared Percentage Error 5.714962
Mean Absolute Error: 29.445000
Explained Variance Score: -0.009266
Mean Squared Error: 1081.958583
R2 score: -3.905823



In [12]:
from sklearn.svm import LinearSVR
def svr(X_train, X_test, y_train, y_test):
    # Classify and predict
    reg = LinearSVR()
    reg.fit(X_train, y_train.values.ravel())
    pred = reg.predict(X_test)

    #print("Pred: ", pred[:5])
    #print("Test: ", y_test[:5])

    print_metrics(y_test.values.ravel(), pred)
    return rmse(y_test.values.ravel(), pred)

In [13]:
for X_train, X_test, y_train, y_test in create_cv_sets(): 
    svr(X_train,X_test,y_train,y_test)

RMSE 6.639852
Root Mean Squared Percentage Error 1.180685
Mean Absolute Error: 4.640370
Explained Variance Score: 0.823254
Mean Squared Error: 44.087634
R2 score: 0.822151

RMSE 16.356537
Root Mean Squared Percentage Error 2.919859
Mean Absolute Error: 15.190324
Explained Variance Score: 0.796186
Mean Squared Error: 267.536303
R2 score: -0.159929

RMSE 6.700826
Root Mean Squared Percentage Error 1.187808
Mean Absolute Error: 4.588357
Explained Variance Score: 0.800505
Mean Squared Error: 44.901064
R2 score: 0.796409



In [14]:
last = np.array([310.5,312.5, 313.5,316.200012,316.299988,314.799988,316.100006,320.5,308.79998,308.700012]).reshape(1,-1)
pred = regressor.predict(last)
pred

array([[309.39884063]])

In [15]:
last = np.array([316.200012, 316.299988,314.799988,316.100006,320.5,308.799988,313.700012,310.80,318.30,324.00]).reshape(1,-1)
pred = regressor.predict(last)
pred


array([[326.34387598]])

In [16]:
from sklearn.externals import joblib
joblib.dump(regressors[0], 'prediction_model.pkl') 

['prediction_model.pkl']