In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import TimeSeriesSplit
import talib as ta
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

In [2]:
'''We have data of 12 diffenret stocks,but we will pick only one for exploration'''
infy_dataset = pd.read_csv("complete_data_set_v1/INFY.NS.csv")

In [3]:
def data_preprocessing(dataset):
    dataset['Date'] = pd.to_datetime(dataset['Date'],format='%Y-%m')
    dataset['Adj Close'] = pd.to_numeric(dataset['Adj Close'],errors='coerce')
    dataset['Volume'] = pd.to_numeric(dataset['Volume'],errors='coerce')
    dataset['Close'] = pd.to_numeric(dataset['Close'],errors='coerce')
    dataset['Low'] = pd.to_numeric(dataset['Low'],errors='coerce')
    dataset['High'] = pd.to_numeric(dataset['High'],errors='coerce')
    dataset['Open'] = pd.to_numeric(dataset['Open'],errors='coerce')
    
    '''remove rows with any null value'''
    dataset = dataset.dropna(axis=0,how='any')
    
    '''Create new column to show closing price after 30th day'''
    forecast_out = int(30) # predicting 30 days into future
    dataset['PriceNextMonth'] = dataset[['Adj Close']].shift(-forecast_out)
    dataset = dataset[:-forecast_out]# remove last 30 from X
    
    return dataset

In [4]:
def evaluate_model(dataset,features,response,model):
    tscv = TimeSeriesSplit(n_splits=6)
    X = dataset[features].values
    y = np.array(dataset[response])
    index = 1
    total_mae = 0
    for train_index, test_index in tscv.split(dataset.values):
        #print('Observations: %d' % (len(train_index) + len(train_index)))
        #print('Training Observations: %d' % (len(train_index)))
        #print('Testing Observations: %d' % (len(test_index)))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        #print('Fold No: %d'% index)
        #print("mean_absolute_error: ", mae)
        total_mae = total_mae+mae
        index += 1
    print("average MAE:",total_mae/index)

In [5]:
def create_models(dataset,features,response,model,name):
    tscv = TimeSeriesSplit(n_splits=6)
    X = dataset[features].values
    y = np.array(dataset[response])
    index = 1
    model_list = []
    for train_index, test_index in tscv.split(dataset.values):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        model_list.append((name+str(index), model))
        index += 1
    return model_list

In [6]:
processed_dataset = data_preprocessing(infy_dataset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [7]:
'''Let us create numpy array for TA '''
high = processed_dataset['High'].values
low = processed_dataset['Low'].values
close = processed_dataset['Close'].values
open = processed_dataset['Open'].values
volume = processed_dataset['Volume'].values

In [8]:
test_dataset = processed_dataset.copy()
test_dataset["HL_Perc"] = (high-low) / low * 100#average MAE: 64.4633232001
test_dataset["CO_Perc"] = (close - open) / open * 100
test_dataset["AROONOSC"] = ta.AROONOSC(high, low, timeperiod=30)
test_dataset["CMO"] = ta.CMO(close, timeperiod=5)
test_dataset["ADOSC"] = ta.ADOSC(high, low, close, volume, fastperiod=50, slowperiod=10)
test_dataset["ROCR100"] = ta.ROCR100(close, timeperiod=20)
test_dataset["WILLR"] = ta.WILLR(high, low, close, timeperiod=20)
test_dataset = test_dataset[["Adj Close","HL_Perc","CO_Perc","AROONOSC","CMO","ADOSC","ROCR100","WILLR","PriceNextMonth"]] 
test_dataset = test_dataset.dropna(axis=0,how='any')
features = ["Adj Close","HL_Perc","CO_Perc","AROONOSC","CMO","ADOSC","ROCR100","WILLR"]
response = "PriceNextMonth"

In [9]:
random_forest = RandomForestRegressor(random_state=0)
evaluate_model(test_dataset,features,response,random_forest)

average MAE: 57.2655986016


In [10]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression,Ridge,BayesianRidge
from sklearn.neighbors import KNeighborsRegressor

# prepare models
models = []
models.append(('LinearRegression', LinearRegression()))
#models.append(('RandomForestRegressor', RandomForestRegressor(random_state=0)))
#models.append(('DecisionTreeRegressor', DecisionTreeRegressor(random_state=0)))
#models.append(('SVR', SVR()))
#models.append(('Ridge', Ridge(random_state=0)))
#models.append(('KNeighborsRegressor', KNeighborsRegressor()))
models.append(('BayesianRidge', BayesianRidge()))

# evaluate each model in turn
for name, model in models:
    print(name)
    evaluate_model(test_dataset,features,response,model)
'''Evaluated many models but turns out LinearRegression and BayesianRidge perrform better'''
    

LinearRegression
average MAE: 52.2756511879
BayesianRidge
average MAE: 51.572917865


'Evaluated many models but turns out LinearRegression and BayesianRidge perrform better'

In [11]:
model_list = []
model_list.append(create_models(test_dataset,features,response,LinearRegression(),"linear"))
model_list.append(create_models(test_dataset,features,response,BayesianRidge(),"bayridge"))

In [12]:
#Create 12 columns for 12 values predicted by 12 models
predicted_dataset = test_dataset.copy()
features = ["Adj Close","HL_Perc","CO_Perc","AROONOSC","CMO","ADOSC","ROCR100","WILLR"]
response = "PriceNextMonth"
X = predicted_dataset[features].values
y = np.array(predicted_dataset[response])

for i in range(len(model_list)):
    for name, model in model_list[i]:
        predicted_dataset[name] = model.predict(X)

In [13]:
predicted_dataset.head()

Unnamed: 0,Adj Close,HL_Perc,CO_Perc,AROONOSC,CMO,ADOSC,ROCR100,WILLR,PriceNextMonth,linear1,...,linear3,linear4,linear5,linear6,bayridge1,bayridge2,bayridge3,bayridge4,bayridge5,bayridge6
49,487.763977,1.300621,0.462465,40.0,-0.525828,8017593.0,102.130986,-65.607748,588.151672,588.64015,...,588.64015,588.64015,588.64015,588.64015,592.825884,592.825884,592.825884,592.825884,592.825884,592.825884
50,492.586487,1.455852,0.681152,40.0,15.894006,7928992.0,102.041704,-57.84376,595.097839,589.987235,...,589.987235,589.987235,589.987235,589.987235,593.097234,593.097234,593.097234,593.097234,593.097234,593.097234
51,490.935455,1.719765,-0.312672,40.0,8.32236,7594680.0,102.224123,-60.501766,597.906067,589.922992,...,589.922992,589.922992,589.922992,589.922992,592.24528,592.24528,592.24528,592.24528,592.24528,592.24528
52,476.992401,4.492496,-2.425861,40.0,-35.892551,7519924.0,100.391891,-82.949726,598.912903,596.332045,...,596.332045,596.332045,596.332045,596.332045,590.680966,590.680966,590.680966,590.680966,590.680966,590.680966
53,486.989105,2.590057,2.03121,40.0,0.504582,7157805.0,103.017678,-66.855121,605.78894,586.301479,...,586.301479,586.301479,586.301479,586.301479,587.903587,587.903587,587.903587,587.903587,587.903587,587.903587


In [14]:
'''Now we have 12 predicted values from 12 different models,one way to select final values is averaging output of 12 models
but a biased model can give outlier as result which in turn will afect averaging.
So new approach is considered:Treat output of 12 models as features'''

'Now we have 12 predicted values from 12 different models,one way to select final values is averaging output of 12 models\nbut a biased model can give outlier as result which in turn will afect averaging.\nSo new approach is considered:Treat output of 12 models as features'

In [15]:
lin_col = predicted_dataset.loc[: , "linear1":"linear6"]
predicted_dataset['lin_reg_mean'] = lin_col.mean(axis=1)

In [16]:
bay_col = predicted_dataset.loc[: , "bayridge1":"bayridge6"]
predicted_dataset['bay_ridge_mean'] = bay_col.mean(axis=1)

In [17]:
both_col = predicted_dataset.loc[: , ["lin_reg_mean","bay_ridge_mean"]]
predicted_dataset['both_model_mean'] = both_col.mean(axis=1)

In [18]:
lin_reg_mae = mean_absolute_error(predicted_dataset['PriceNextMonth'], predicted_dataset['lin_reg_mean'])
bay_ridge_mae = mean_absolute_error(predicted_dataset['PriceNextMonth'], predicted_dataset['bay_ridge_mean'])
both_model_mae = mean_absolute_error(predicted_dataset['PriceNextMonth'], predicted_dataset['both_model_mean'])

In [19]:
print(lin_reg_mae)
print(bay_ridge_mae)
print(both_model_mae)

50.5117516341
50.4201930409
50.4518512599


In [20]:

new_x = predicted_dataset.loc[: , "linear1":"bayridge6"]
new_y = predicted_dataset['PriceNextMonth']
model = Ridge()
model.fit(new_x,new_y)
mean_absolute_error(new_y, model.predict(new_x))

50.351955686701757

In [21]:
'''If we consider values predicted by 12 models as features and create new model then MAE improves'''

'If we consider values predicted by 12 models as features and create new model then MAE improves'