In [53]:
#!pip install bokeh
#!pip install nsepy

from nsepy import get_history
import pandas as pd
import numpy as np
from datetime import date
import bokeh.io
from bokeh.plotting import figure, output_file, show
from bokeh.palettes import brewer
from bokeh.models.glyphs import Segment
bokeh.io.reset_output()
bokeh.io.output_notebook()
from statsmodels.tsa.stattools import pacf
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing, cross_validation, svm

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


In [56]:
#Get daily data for NSE:TCS, INFY between 2015 and 2016
listOfStocks = ['TCS', 'INFY']
listOfIndices = ['NIFTY IT']
startDate = date(2015,1,1)
endDate = date(2016,12,31)

#Storing stocks' daily data in a Dictionary
dailyStockData = {}
dailyIndexData = {}
for stockSym in listOfStocks:
    dailyStockData[stockSym] = get_history(stockSym, startDate, endDate)
for indSym in listOfIndices:
    dailyIndexData[indSym] = get_history(indSym, startDate, endDate, index=True)

In [57]:
dstocksDf ={}
for s in listOfStocks:
    dstocksDf[s] = pd.DataFrame(dailyStockData[s])
dindexDf = {}
for i in listOfIndices:
    dindexDf[i] = pd.DataFrame(dailyIndexData[i])

In [58]:
#Functions for requisite operations:

def calculateMA(closeHistory, n):
    cumsum, moving_aves = [0], []
    for i, x in enumerate(closeHistory, 1):
        cumsum.append(cumsum[i-1] + x)
        if i>=n:
            moving_ave = (cumsum[i] - cumsum[i-n])/n
            #can do stuff with moving_ave here
            moving_aves.append(moving_ave)
    return moving_aves

def generateRollingWindow(closeHistory, n):
    movavg = calculateMA(closeHistory, n)
    nanarray = [np.nan for i in range(n-1)]
    rwindow = nanarray + movavg
    return rwindow

#VolumeShock-magnitude, VolumeShockDir - direction - 2% threshold
def genVolumeShocks(df):
    df['Prev Volume'] = df['Volume'].shift(1)
    volData = df[['Prev Volume', 'Volume']]
    volumeShocks = (volData['Volume'] >= 1.1*volData['Prev Volume']) | (volData['Volume'] <= 0.9*volData['Prev Volume'])
    volumeShocksDirections= (volData['Volume'] > volData['Prev Volume'])
    volumeShocks[0] = None
    volumeShocksDirections[0] = None
    
    return volumeShocks, volumeShocksDirections

#VolumeShock-magnitude, VolumeShockDir - direction
def genPriceShocks(df):
    priceData = df[['Prev Close', 'Close']]
    priceShocks = (priceData['Close'] >= 1.02*priceData['Prev Close']) | (priceData['Close'] <= 0.98*priceData['Prev Close'])
    priceShocksDirections = (priceData['Close'] > priceData['Prev Close'])
    return priceShocks, priceShocksDirections

In [59]:
#Generate Rolling Windows of different Moving Averages
windows = [4, 8, 12, 16, 20, 24, 28, 32, 48, 52 ]

for sym in listOfStocks:
    for win in windows:
        winStr = "Close"+str(win)+"W"
        dstocksDf[sym][winStr] = generateRollingWindow(dstocksDf[sym]['Close'], win*5)


In [60]:
#Generate dummy variables exercise - Volume Shock of +/-10%, Price Shock of +/-2%, Price Shock without Volume Shock
#Black Swan scenario - not described in the Question (copy of the Price Shock descript)
for i in listOfStocks:
    dstocksDf[i]['Volume Shock'], dstocksDf[i]['Volume Shock Direction']  = genVolumeShocks(dstocksDf[i])
    dstocksDf[i]['Price Shock'], dstocksDf[i]['Price Shock Direction']  = genPriceShocks(dstocksDf[i])
    dstocksDf[i]['Price Shock without Volume Shock'] = (dstocksDf[i]['Volume Shock']==True) & (dstocksDf[i]['Price Shock']==False)


In [63]:
dstocksDf['TCS'].tail()

Unnamed: 0_level_0,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,...,Close28W,Close32W,Close48W,Close52W,Prev Volume,Volume Shock,Volume Shock Direction,Price Shock,Price Shock Direction,Price Shock without Volume Shock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-12-26,TCS,EQ,2288.1,2266.0,2301.6,2266.0,2296.85,2293.1,2290.6,448704,...,2428.374643,2443.402188,2421.090208,2419.877692,559477.0,True,False,False,True,True
2016-12-27,TCS,EQ,2293.1,2281.5,2330.0,2281.5,2322.8,2323.4,2317.79,647346,...,2426.187857,2442.464688,2420.847708,2419.856154,448704.0,True,True,False,True,True
2016-12-28,TCS,EQ,2323.4,2326.7,2349.9,2303.15,2305.25,2313.4,2335.86,675209,...,2424.051429,2441.22,2420.606667,2419.6475,647346.0,False,True,False,False,False
2016-12-29,TCS,EQ,2313.4,2303.0,2359.9,2303.0,2355.0,2351.7,2346.25,981758,...,2422.055357,2440.15,2420.415417,2419.521154,675209.0,True,True,False,True,True
2016-12-30,TCS,EQ,2351.7,2354.9,2378.6,2345.65,2366.55,2365.55,2366.64,702874,...,2420.287143,2439.195313,2420.42875,2419.4425,981758.0,True,False,False,True,True


In [64]:
dstocksDf['INFY'].tail()

Unnamed: 0_level_0,Symbol,Series,Prev Close,Open,High,Low,Last,Close,VWAP,Volume,...,Close28W,Close32W,Close48W,Close52W,Prev Volume,Volume Shock,Volume Shock Direction,Price Shock,Price Shock Direction,Price Shock without Volume Shock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-12-26,INFY,EQ,988.45,988.0,994.0,975.1,983.35,982.75,983.28,1606914,...,1055.496429,1075.815313,1102.528958,1100.986923,3653018.0,True,False,False,False,True
2016-12-27,INFY,EQ,982.75,982.75,1001.9,980.0,996.9,998.95,993.09,2456408,...,1053.5825,1074.674375,1102.235625,1100.812115,1606914.0,True,True,False,True,True
2016-12-28,INFY,EQ,998.95,1002.95,1009.8,995.0,999.0,998.5,1003.15,2738408,...,1051.666429,1073.420938,1102.017708,1100.697885,2456408.0,True,True,False,False,True
2016-12-29,INFY,EQ,998.5,1003.75,1008.0,988.15,990.0,992.35,997.88,3120062,...,1049.773214,1072.043125,1101.722083,1100.486346,2738408.0,True,True,False,False,True
2016-12-30,INFY,EQ,992.35,998.0,1012.0,992.75,1011.0,1010.6,1006.91,3633884,...,1048.148214,1070.847813,1101.534167,1100.324038,3120062.0,True,True,False,True,True


In [275]:
#Exercise 2 - Visualization

In [347]:

#choose the stock - TCS or INFY
dfgraph = dstocksDf['TCS']

#1. Create timeseries plot of close prices of stocks/indices with the following features:
p1 = figure(plot_width=400, plot_height=400)

dfgraph['ind']= range(len(dstocksDf['INFY']))

#2. Color timeseries in simple blue color.
p1.line(dfgraph['ind'], dfgraph['Close'], line_width=2)

#3. Color timeseries between two volume shocks in a different color (Red)
for i in dfgraph['ind']:
    if i == len(dfgraph)-1:
        break;
    if dfgraph['Volume Shock'][i] == True & dfgraph['Volume Shock'][i+1] == True:
        p1.segment(x0=i, y0=dfgraph['Close'][i], x1=i+1, y1=dfgraph['Close'][i+1], line_color="red", line_width=3)


#5. Mark closing Pricing shock without volume shock to identify volumeless price movement.
markPoints = dfgraph.loc[dfgraph['Price Shock without Volume Shock'] == True]
p1.circle(markPoints['ind'], markPoints['Close'], size=5, line_color="green", fill_color="green", fill_alpha=0.5)

#6. Hand craft partial autocorrelation plot for each stock/index on upto all lookbacks on bokeh 
p2 = figure(plot_width=400, plot_height=400)
pcf = pacf(dfgraph['Close'], nlags=len(dfgraph)-5)
p2.line(range(len(dfgraph)-len(pcf), len(dfgraph)), pcf, line_width=2)

show(p1)
show(p2)

  return rho, np.sqrt(sigmasq)


In [348]:
#Exercise 3 - scikit learn

In [3]:
#Get recent daily data for NSE:TCS, INFY between 2015 and 2016
listOfStocks = ['TCS', 'INFY']
listOfIndices = ['NIFTY IT']
startDate = date(2017,1,1)
endDate = date.today()

#Storing stocks' daily data in a Dictionary
pdailyStockData = {}
pdailyIndexData = {}
for stockSym in listOfStocks:
    pdailyStockData[stockSym] = get_history(stockSym, startDate, endDate)
for indSym in listOfIndices:
    pdailyIndexData[indSym] = get_history(indSym, startDate, endDate, index=True)

In [5]:

pdstocksDf ={}
for s in listOfStocks:
    pdstocksDf[s] = pd.DataFrame(pdailyStockData[s])
pdindexDf = {}
for i in listOfIndices:
    pdindexDf[i] = pd.DataFrame(pdailyIndexData[i])

In [33]:

#pdstocksDf['TCS']['Avg Price']= pdstocksDf['TCS'][['Open','Low','High','Close']].mean(axis=1)
pdstocksDf['TCS']['Price Range']= pdstocksDf['TCS']['High'] - pdstocksDf['TCS']['Low'] #pdstocksDf['TCS'][['Open','Low','High','Close']].mean(axis=1)

#pdstocksDf['INFY']['Avg Price']=pdstocksDf['INFY'][['Open','Low','High','Close']].mean(axis=1)
pdstocksDf['INFY']['Price Range']= pdstocksDf['INFY']['High'] - pdstocksDf['INFY']['Low'] #pdstocksDf['TCS'][['Open','Low','High','Close']].mean(axis=1)


#Choose the stock - TCS or INFY
predDf = pdstocksDf['TCS'][['Close','Price Range']]

timeWindow = 1
predDf['Prediction'] = predDf['Close'].shift(-timeWindow)

X = np.array(predDf.drop(['Prediction'], 1))
X = preprocessing.scale(X)
X_pred = X[-timeWindow:] # set X_forecast equal to last 30
X = X[:-timeWindow] # remove last 30 from X

y = np.array(predDf['Prediction'])
y = y[:-timeWindow]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [40]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.2)

def MAPE(y, ypred):
    return round(np.mean(np.abs((y - ypred) / y)) * 100, 3)


# Training
#clf = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01, max_depth=1, random_state=0, loss='huber')
lr = LinearRegression()
gb = GradientBoostingRegressor(n_estimators=1000)

from sklearn.metrics import make_scorer
scr = make_scorer(MAPE)

#1. Quick Build - Grid Searching with 8 different permutation combinations
lrparameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
gbParameters = {'loss':('ls','huber'),'alpha':[0.97, 0.99], 'learning_rate':[0.1, 0.01]}

lmodel = GridSearchCV(lr, lrparameters)
lmodel.fit(X_train,y_train)

gbmodel = GridSearchCV(gb, gbParameters)
gbmodel.fit(X_train,y_train)

# Testing
lconfidence = lmodel.score(X_test, y_test)
gbconfidence = gbmodel.score(X_test, y_test)

if lconfidence>gbconfidence:
    print('Linear Regressions wins. Score:'+str(lconfidence) + "MAPE:"+str(MAPE(lmodel.predict(X_test), y_test))+"%")
    forecast_prediction = lmodel.predict(X_pred)
    print("Tomorrow's expected closing price: "+str(forecast_prediction))
else:
    print('Gradient Boosting Regressions wins. Score:'+str(round(gbconfidence, 2)) + " MAPE:"+str(MAPE(gbmodel.predict(X_test), y_test))+"%")
    forecast_prediction = gbmodel.predict(X_pred)
    print("Tomorrow's expected closing price: "+ str(forecast_prediction))


Linear Regressions wins. Score:0.9921084818307363MAPE:1.029%
Tomorrow's expected closing price: [2125.41757867]
