In [47]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression,Ridge,BayesianRidge
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from pandas.plotting import autocorrelation_plot
from sklearn.model_selection import TimeSeriesSplit
import talib as ta
import glob
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6

In [48]:
def data_preprocessing(dataset):
    return_dataset = pd.DataFrame()
    return_dataset['Date'] = pd.to_datetime(dataset['Date'])
    return_dataset['Adj Close'] = pd.to_numeric(dataset['Adj Close'],errors='coerce')
    return_dataset['Volume'] = pd.to_numeric(dataset['Volume'],errors='coerce')
    return_dataset['Close'] = pd.to_numeric(dataset['Close'],errors='coerce')
    return_dataset['Low'] = pd.to_numeric(dataset['Low'],errors='coerce')
    return_dataset['High'] = pd.to_numeric(dataset['High'],errors='coerce')
    return_dataset['Open'] = pd.to_numeric(dataset['Open'],errors='coerce')
    
    '''remove rows and  with any null value'''
    return_dataset = return_dataset.dropna(axis=0,how='any')
       
    '''Create new column to show closing price after 30th day'''
    forecast_out = int(30) # predicting 30 days into future
    return_dataset['PriceNextMonth'] = return_dataset[['Adj Close']].shift(-forecast_out)
    '''Put some value in last 30 rows for PriceNextMonth column'''    
    return_dataset['PriceNextMonth'].iloc[-30:] = return_dataset['PriceNextMonth'].iloc[-31]
    
    return return_dataset

In [49]:
def read_process_data(path,stock_names):
    list_ = []
    for stock_name in stock_names:
        df = pd.read_csv(path + "/"+stock_name+".csv")
        processed_dataset = data_preprocessing(df)
        list_.append((stock_name,processed_dataset))
    return list_

In [50]:
def create_models(dataset,features,response,model,name):
    tscv = TimeSeriesSplit(n_splits=6)
    X = dataset[features].values
    y = np.array(dataset[response])
    index = 1
    model_list = []
    for train_index, test_index in tscv.split(dataset.values):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        model_list.append((name+str(index), model))
        index += 1
    return model_list

In [51]:
def create_featured_dataset(processed_dataset):
    '''Let us create numpy array for TA '''
    high = processed_dataset['High'].values
    low = processed_dataset['Low'].values
    close = processed_dataset['Close'].values
    open = processed_dataset['Open'].values
    volume = processed_dataset['Volume'].values
    feature_dataset = processed_dataset.copy()
    feature_dataset["HL_Perc"] = (high-low) / low * 100
    feature_dataset["CO_Perc"] = (close - open) / open * 100
    feature_dataset["AROONOSC"] = ta.AROONOSC(high, low, timeperiod=30)
    feature_dataset["CMO"] = ta.CMO(close, timeperiod=5)
    feature_dataset["ADOSC"] = ta.ADOSC(high, low, close, volume, fastperiod=50, slowperiod=10)
    feature_dataset["ROCR100"] = ta.ROCR100(close, timeperiod=20)
    feature_dataset["WILLR"] = ta.WILLR(high, low, close, timeperiod=20)
    feature_dataset = feature_dataset[["Adj Close","HL_Perc","CO_Perc","AROONOSC","CMO","ADOSC","ROCR100","WILLR","PriceNextMonth"]] 
    feature_dataset = feature_dataset.dropna(axis=0,how='any')
    return feature_dataset
    

In [52]:
dataset_path = "complete_data_set_v1"
stock_names = ["TCS.NS","WIPRO.NS","AXISBANK.NS","HCLTECH.NS","HDFCBANK.NS","ICICIBANK.NS","INDUSINDBK.NS","INFY.NS",
               "KOTAKBANK.NS","SBIN.NS","YESBANK.NS","TECHM.NS"]
stock_dataset_list = read_process_data(dataset_path,stock_names)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [53]:
features = ["Adj Close","HL_Perc","CO_Perc","AROONOSC","CMO","ADOSC","ROCR100","WILLR"]
response = "PriceNextMonth"
columns_explore=["Date","Symbol","Predicted Return","4thApril","16thMay"]
columns_csv = ["Date","Symbol","Predicted Return"]#final csv output
final_output = {}
final_csv = pd.DataFrame(columns =columns_explore)
for stock_name,stock_dataset in stock_dataset_list:
    featured_dataset =create_featured_dataset(stock_dataset)#create featured dataset
    model_list = []#create 12 models
    model_list.append(create_models(featured_dataset,features,response,LinearRegression(),"linear"))
    model_list.append(create_models(featured_dataset,features,response,BayesianRidge(),"bayridge"))
    predicted_dataset = featured_dataset.copy()
    X = predicted_dataset[features].values
    y = np.array(predicted_dataset[response])
    for i in range(len(model_list)):
        for name, model in model_list[i]:
            predicted_dataset[name] = model.predict(X)#Run 12 models
    new_x = predicted_dataset.loc[: , "linear1":"bayridge6"]
    new_y = predicted_dataset['PriceNextMonth']
    model = Ridge()#create final model 
    model.fit(new_x,new_y)#final model has features as outcome of 12 models
    
    final_value_to_predict = new_x.iloc[-1,:]
    final_value_to_predict = final_value_to_predict.reshape(1,-1)
    predicted_value = model.predict(final_value_to_predict) #Run final model on 4th April data
    today_value = stock_dataset.iloc[-1,1]
    predicted_return = ((predicted_value-today_value)/today_value)*100
    
    df2 = pd.DataFrame([["20180516",stock_name,np.asscalar(predicted_return),today_value,np.asscalar(predicted_value)]],
                       columns=columns_explore)
    final_csv = final_csv.append(df2)



In [54]:
final_csv#final output

Unnamed: 0,Date,Symbol,Predicted Return,4thApril,16thMay
0,20180516,TCS.NS,-11.841966,2910.899902,2566.192131
0,20180516,WIPRO.NS,-0.778532,281.600006,279.407659
0,20180516,AXISBANK.NS,-4.956448,492.25,467.851883
0,20180516,HCLTECH.NS,-2.707562,959.25,933.277707
0,20180516,HDFCBANK.NS,2.236713,1883.25,1925.372888
0,20180516,ICICIBANK.NS,-0.137894,268.649994,268.27954
0,20180516,INDUSINDBK.NS,2.071475,1799.349976,1836.623063
0,20180516,INFY.NS,-1.167376,1124.199951,1111.076315
0,20180516,KOTAKBANK.NS,2.312606,1078.300049,1103.236879
0,20180516,SBIN.NS,-1.299076,247.300003,244.087388


In [55]:
final_csv.to_csv("Submission.csv",columns=columns_csv,index=False)#create Submission file