In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression,Ridge,BayesianRidge
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import TimeSeriesSplit
import talib as ta
import glob
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

In [121]:
def data_preprocessing(dataset):
    #dataset['Date'] = pd.to_datetime(dataset['Date'],format='%Y-%m')
    return_dataset = pd.DataFrame()
    return_dataset['Date'] = pd.to_datetime(dataset['Date'])
    return_dataset['Adj Close'] = pd.to_numeric(dataset['Adj Close'],errors='coerce')
    return_dataset['Volume'] = pd.to_numeric(dataset['Volume'],errors='coerce')
    return_dataset['Close'] = pd.to_numeric(dataset['Close'],errors='coerce')
    return_dataset['Low'] = pd.to_numeric(dataset['Low'],errors='coerce')
    return_dataset['High'] = pd.to_numeric(dataset['High'],errors='coerce')
    return_dataset['Open'] = pd.to_numeric(dataset['Open'],errors='coerce')
    
    '''remove rows and  with any null value'''
    return_dataset = return_dataset.dropna(axis=0,how='any')
       
    '''Create new column to show closing price after 30th day'''
    forecast_out = int(30) # predicting 30 days into future
    return_dataset['PriceNextMonth'] = return_dataset[['Adj Close']].shift(-forecast_out)
    #return_dataset = return_dataset[:-forecast_out]# remove last 30 from X
    
    return_dataset['PriceNextMonth'].iloc[-30:] = return_dataset['PriceNextMonth'].iloc[-31]
    
    return return_dataset

In [80]:
def read_process_data(path,stock_names):
    list_ = []
    for stock_name in stock_names:
        df = pd.read_csv(path + "/"+stock_name+".csv")
        processed_dataset = data_preprocessing(df)
        list_.append((stock_name,processed_dataset))
    return list_

In [86]:
def create_models(dataset,features,response,model,name):
    tscv = TimeSeriesSplit(n_splits=6)
    X = dataset[features].values
    y = np.array(dataset[response])
    index = 1
    model_list = []
    for train_index, test_index in tscv.split(dataset.values):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        model_list.append((name+str(index), model))
        index += 1
    return model_list

In [90]:
def create_featured_dataset(processed_dataset):
    '''Let us create numpy array for TA '''
    high = processed_dataset['High'].values
    low = processed_dataset['Low'].values
    close = processed_dataset['Close'].values
    open = processed_dataset['Open'].values
    volume = processed_dataset['Volume'].values
    feature_dataset = processed_dataset.copy()
    feature_dataset["HL_Perc"] = (high-low) / low * 100
    feature_dataset["CO_Perc"] = (close - open) / open * 100
    feature_dataset["AROONOSC"] = ta.AROONOSC(high, low, timeperiod=30)
    feature_dataset["CMO"] = ta.CMO(close, timeperiod=5)
    feature_dataset["ADOSC"] = ta.ADOSC(high, low, close, volume, fastperiod=50, slowperiod=10)
    feature_dataset["ROCR100"] = ta.ROCR100(close, timeperiod=20)
    feature_dataset["WILLR"] = ta.WILLR(high, low, close, timeperiod=20)
    feature_dataset = feature_dataset[["Adj Close","HL_Perc","CO_Perc","AROONOSC","CMO","ADOSC","ROCR100","WILLR","PriceNextMonth"]] 
    feature_dataset = feature_dataset.dropna(axis=0,how='any')
    return feature_dataset
    

In [None]:
def create_final_model():
    

In [122]:
dataset_path = "complete_data_set_v1"
stock_names = ["TCS.NS","WIPRO.NS","AXISBANK.NS","HCLTECH.NS","HDFCBANK.NS","ICICIBANK.NS","INDUSINDBK.NS","INFY.NS",
               "KOTAKBANK.NS","SBIN.NS","YESBANK.NS","TECHM.NS"]
stock_dataset_list = read_process_data(dataset_path,stock_names)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [161]:
features = ["Adj Close","HL_Perc","CO_Perc","AROONOSC","CMO","ADOSC","ROCR100","WILLR"]
response = "PriceNextMonth"
predicted =[]
for stock_name,stock_dataset in stock_dataset_list:
    featured_dataset =create_featured_dataset(stock_dataset)
    model_list = []
    model_list.append(create_models(featured_dataset,features,response,LinearRegression(),"linear"))
    model_list.append(create_models(featured_dataset,features,response,BayesianRidge(),"bayridge"))
    predicted_dataset = featured_dataset.copy()
    X = predicted_dataset[features].values
    y = np.array(predicted_dataset[response])
    for i in range(len(model_list)):
        for name, model in model_list[i]:
            predicted_dataset[name] = model.predict(X)
    new_x = predicted_dataset.loc[: , "linear1":"bayridge6"]
    new_y = predicted_dataset['PriceNextMonth']
    model = Ridge()
    model.fit(new_x,new_y)
    #print(mean_absolute_error(new_y, model.predict(new_x)))
    final_value_to_predict = new_x.iloc[-1,:]
    final_value_to_predict = final_value_to_predict.reshape(1,-1)
    print(model.predict(final_value_to_predict))
    
    predicted.append(model.predict(final_value_to_predict))
actual = pd.Series([3428,274,536,929,1977,293,1888,1184,1242,246,348,657])
print("MAE:",mean_absolute_error(actual,predicted))



[ 2566.19213082]
[ 279.40765887]
[ 467.8518827]
[ 933.27770708]
[ 1925.37288844]
[ 268.27954045]
[ 1836.62306298]
[ 1111.07631461]
[ 1103.23687938]
[ 244.0873884]
[ 303.48605583]
[ 593.43515037]
MAE: 115.753672665
