In [20]:
import numpy as np
import pandas as pd
import yfinance as yf
import os

In [21]:
# Folder that receives the generated CSV file
data_dir = 'stock_data'

# exist_ok=True replaces the separate existence check: the cell stays
# re-runnable and cannot fail if the folder appears between check and create.
os.makedirs(data_dir, exist_ok=True)

In [22]:
# Fetch the price history of the ^NSEI ticker from the Yahoo Finance API
# for the given start/end dates.
data = yf.download(
    "^NSEI",
    start="2011-01-01",
    end="2018-01-01",
)

# Target file for the finished feature table (written by the last cell)
data_path = os.path.join(data_dir, 'Raw.csv')

[*********************100%***********************]  1 of 1 completed


In [23]:
# Drop 'Adj Close' and 'Volume'; they are not used as features.
# errors='ignore' keeps the cell re-runnable: on a second run (or on a frame
# that lacks one of these columns) the drop is a no-op instead of a KeyError.
data = data.drop(columns=['Adj Close', 'Volume'], errors='ignore')

In [24]:
# Label each day with the direction of the NEXT day's open-to-close move.
# Raise `threshold` above 0 to count near-zero moves as negative
# (e.g. to account for transaction costs).

def clas(df, threshold=0.0):
    """Return one class label per row of ``df``.

    The label of row ``i`` describes row ``i + 1``: it is 1 when that row's
    open-to-close move, in percent of the open, is ``>= threshold`` and 0
    otherwise. The last row has no following row and gets the placeholder
    ``'N'``.

    Parameters
    ----------
    df : DataFrame with ``Open`` and ``Close`` columns.
    threshold : float, default 0.0
        Minimum percentage move that counts as positive. With the default,
        a perfectly flat day is labelled 1; pass a small positive value to
        label near-zero moves as 0.

    Returns
    -------
    list of length ``len(df)`` (empty for an empty frame).
    """
    # Positional arrays: integer keys on a date-indexed Series are deprecated.
    open_ = df['Open'].to_numpy(dtype=float)
    close = df['Close'].to_numpy(dtype=float)
    if len(open_) == 0:
        return []
    pct_move = 100 * (close[1:] - open_[1:]) / open_[1:]
    labels = [1 if move >= threshold else 0 for move in pct_move]
    labels.append('N')  # no next day for the final row
    return labels

# Store the labels under their own name: rebinding `clas` to the result would
# shadow the function and make this cell fail when it is run a second time.
class_labels = clas(data)

#Add Class to our dataframe
data['Class'] = class_labels

In [25]:
# Daily return: fractional change of Close relative to the previous row
data_pctchange = data['Close'].pct_change()
data['return'] = data_pctchange

In [26]:
# Lagged-return features: column ret_k holds the return shifted down by k rows
lags = 5

cols = ['ret_%d' % k for k in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data['return'].shift(lag)


In [27]:
# Remove every row that contains a NaN (e.g. the leading rows, where the
# return and its lags are undefined), then store the sign of each return.
data.dropna(inplace=True)
data['return_sign'] = np.sign(data['return'].to_numpy())

In [28]:
# calculate EMA for each day
# formula: EMA = (2/(n+1))*ClosePrice + (1-(2/(n+1)))*previousEMA

def EMA(df, n):
    """Exponential moving average of ``df['Close']`` with span ``n``.

    The recursion is seeded with the first close, so row 0 carries no EMA of
    its own and is returned as the placeholder ``'N'``.

    Parameters
    ----------
    df : DataFrame with a ``Close`` column.
    n : span; the smoothing factor is ``2 / (n + 1)``.

    Returns
    -------
    list of length ``len(df)``: ``'N'`` followed by one EMA value per row
    (empty for an empty frame).
    """
    # Positional array: integer keys on a date-indexed Series are deprecated.
    close = df['Close'].to_numpy()
    if len(close) == 0:
        return []
    alpha = 2 / (n + 1)
    values = ['N']
    prev = close[0]
    for price in close[1:]:
        prev = (alpha * price) + ((1 - alpha) * prev)
        values.append(prev)
    return values

# EMA of Close for the two spans 12 and 26 (the same spans MACD uses)
EMA_12, EMA_26 = EMA(data, 12), EMA(data, 26)

# store both series as dataframe columns
data['EMA_12'] = EMA_12
data['EMA_26'] = EMA_26

In [29]:
#ROI function

def ROI(df, n):
    """Return on investment of ``Close`` over a look-back of ``n`` rows.

    ``ROI[j] = (Close[j] - Close[j - n]) / Close[j - n]``. Rows without a
    value ``n`` rows earlier get the placeholder ``'N'``.

    Parameters
    ----------
    df : DataFrame with a ``Close`` column.
    n : look-back in rows.

    Returns
    -------
    list of length ``len(df)``.
    """
    # Positional array: integer keys on a date-indexed Series are deprecated.
    close = df['Close'].to_numpy()
    m = len(close)
    # Cap the placeholders so the result never outgrows the frame when n > m.
    warmup = min(n, m)
    values = ['N'] * warmup
    for j in range(warmup, m):
        base = close[j - n]
        values.append((close[j] - base) / base)
    return values

# ROI over look-backs of 10, 20 and 30 rows
ROI10, ROI20, ROI30 = (ROI(data, span) for span in (10, 20, 30))

# attach the three ROI series to the dataframe
data['10 Day ROI'] = ROI10
data['20 Day ROI'] = ROI20
data['30 Day ROI'] = ROI30

In [30]:
# calculate momentum for each day
# default: 5-day momentum

def momentum(df, period=5):
    """Price momentum: ``Close[j] - Close[j - period]`` for every row.

    Parameters
    ----------
    df : DataFrame with a ``Close`` column.
    period : look-back in rows, default 5.

    Returns
    -------
    list of length ``len(df)``; rows without a value ``period`` rows earlier
    hold the placeholder ``'N'``.
    """
    # Positional array: integer keys on a date-indexed Series are deprecated.
    close = df['Close'].to_numpy()
    m = len(close)
    # Cap the placeholders so short frames do not yield an over-long list.
    warmup = min(period, m)
    values = ['N'] * warmup
    values.extend(close[j] - close[j - period] for j in range(warmup, m))
    return values

# Store the result under its own name: rebinding `momentum` would shadow the
# function and break this cell on a second run.
momentum_5 = momentum(data)

# add momentum to data
data['Momentum'] = momentum_5

In [31]:
#MACD
# Difference EMA(fast) - EMA(slow) of Close for each row
# defaults: fast = 12 and slow = 26
def MACD(df, fast=12, slow=26):
    """MACD line: fast EMA minus slow EMA of ``Close``.

    Only the difference of the two EMAs is computed here; no signal line or
    further smoothing is applied.

    Parameters
    ----------
    df : DataFrame with a ``Close`` column (passed on to ``EMA``).
    fast, slow : EMA spans, default 12 and 26.

    Returns
    -------
    list of length ``len(df)``; row 0 holds the placeholder ``'N'`` because
    ``EMA`` yields no value there.
    """
    m = len(df)
    if m == 0:
        return []
    ema_fast = EMA(df, fast)
    ema_slow = EMA(df, slow)
    # Index 0 of both EMA lists is the 'N' placeholder, so start at 1.
    return ['N'] + [ema_fast[i] - ema_slow[i] for i in range(1, m)]

# Store the result under its own name: rebinding `MACD` would shadow the
# function and break this cell on a second run.
MACD_12_26 = MACD(data)

#Add MACD to our dataframe 
data['MACD_12_26'] = MACD_12_26

In [32]:
# calculate Williams %R oscillator for each day

def Williams(df, n):
    """Williams %R over a rolling window of ``n`` rows.

    ``%R[j] = -100 * (max(High) - Close[j]) / (max(High) - min(Low))`` where
    max/min are taken over rows ``j - n + 1 .. j``.

    Parameters
    ----------
    df : DataFrame with ``High``, ``Low`` and ``Close`` columns.
    n : window length in rows (must be >= 1).

    Returns
    -------
    list of length ``len(df)``; the first ``n - 1`` rows hold ``'N'``. A
    window whose high equals its low yields NaN.

    Raises
    ------
    ValueError if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive number of rows")
    # Read every series from the `df` argument (not from a notebook global)
    # so the function works on any frame it is given.
    high = df['High'].to_numpy()
    low = df['Low'].to_numpy()
    close = df['Close'].to_numpy()
    m = len(close)
    warmup = min(n - 1, m)
    values = ['N'] * warmup
    for j in range(warmup, m):
        maximum = high[j - n + 1:j + 1].max()
        minimum = low[j - n + 1:j + 1].min()
        spread = maximum - minimum
        if spread == 0:
            # Flat window: the ratio is undefined.
            values.append(float('nan'))
        else:
            values.append((-100) * (maximum - close[j]) / spread)
    return values


# Williams %R with a 14-row window
williams = Williams(df=data, n=14)

# store the oscillator as a dataframe column
data['Williams'] = williams

In [33]:
# True Range
# TR = MAX(high[today], close[yesterday]) - MIN(low[today], close[yesterday])
def TR(df, n):
    """True range of the row at position ``n``.

    Parameters
    ----------
    df : DataFrame with ``High``, ``Low`` and ``Close`` columns.
    n : positional row number.

    Returns
    -------
    ``max(High[n], Close[n-1]) - min(Low[n], Close[n-1])``. Row 0 has no
    previous close, so its true range is ``High[0] - Low[0]``.
    """
    # .iloc makes the positional access explicit; integer keys on a
    # date-indexed Series are deprecated.
    high = df['High'].iloc[n]
    low = df['Low'].iloc[n]
    if n == 0:
        # n - 1 would wrap around to the last row.
        return high - low
    prev_close = df['Close'].iloc[n - 1]
    return max(high, prev_close) - min(low, prev_close)

# Average True Range
# Same recursion as EMA, fed with TR in lieu of close (prevEMA = TR(dataframe, n))
def ATR(df, n):
    """Average true range: exponential smoothing of ``TR`` with span ``n``.

    The recursion is seeded with the true range of row ``n`` and values are
    produced from row ``n`` onwards; seeding from row ``n`` (rather than a
    later row) keeps every value free of future data and lets a frame of
    exactly ``n + 1`` rows work.

    Parameters
    ----------
    df : DataFrame with ``High``, ``Low`` and ``Close`` columns (read by ``TR``).
    n : span; the smoothing factor is ``2 / (n + 1)``.

    Returns
    -------
    list of length ``len(df)``; the first ``n`` rows hold ``'N'``.
    """
    m = len(df)
    # Cap the placeholders so short frames do not yield an over-long list.
    values = ['N'] * min(n, m)
    if m <= n:
        return values
    alpha = 2 / (n + 1)
    prev = TR(df, n)
    for j in range(n, m):
        prev = (alpha * TR(df, j)) + ((1 - alpha) * prev)
        values.append(prev)
    return values

# Store the result under its own name: rebinding `ATR` would shadow the
# function and break this cell on a second run.
ATR_14 = ATR(data, 14)

#Add ATR to our dataframe
data['ATR_14'] = ATR_14

In [34]:
# calculate Commodity Channel Index (CCI) for each day

import numpy as np
def CCI(df, n):
    """Commodity Channel Index over a rolling window of ``n`` rows.

    The typical price is ``(High + Low + Close) / 3``; each value is
    ``(tp[j] - mean(window)) / (0.015 * std(window))`` with the window
    covering rows ``j - n + 1 .. j``.

    NOTE(review): the deviation term is the population standard deviation;
    the textbook CCI uses the mean absolute deviation. Kept as is so existing
    feature values do not change; confirm which one is intended.

    Parameters
    ----------
    df : DataFrame with ``High``, ``Low`` and ``Close`` columns.
    n : window length in rows (must be >= 1).

    Returns
    -------
    list of length ``len(df)``; the first ``n - 1`` rows hold ``'N'``. A
    window with zero deviation yields NaN.

    Raises
    ------
    ValueError if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive number of rows")
    # Positional arrays: integer keys on a date-indexed Series are deprecated.
    typical = (df['High'].to_numpy() + df['Low'].to_numpy() + df['Close'].to_numpy()) / 3
    m = len(typical)
    warmup = min(n - 1, m)
    values = ['N'] * warmup
    for j in range(warmup, m):
        window = typical[j - n + 1:j + 1]
        deviation = window.std()
        if deviation == 0:
            # Flat window: the ratio is undefined.
            values.append(float('nan'))
        else:
            values.append((typical[j] - window.mean()) / (0.015 * deviation))
    return values

# CCI with a 20-row window
cci = CCI(df=data, n=20)

# store the index as a dataframe column
data['CCI'] = cci

In [35]:
# calculate RSI for each day


def RSI(df, period):
    """Relative Strength Index from plain averages over ``period`` rows.

    For row ``j`` the ``period`` most recent close-to-close changes are split
    into gains and losses; ``RS = avg_gain / avg_loss`` and
    ``RSI = 100 - 100 / (1 + RS)``.

    Parameters
    ----------
    df : DataFrame with a ``Close`` column.
    period : number of changes in each window (must be >= 1).

    Returns
    -------
    list of length ``len(df)``; the first ``period`` rows hold ``'N'``.
    A window without losses yields 100.0, and a completely flat window
    yields the neutral value 50.0.

    Raises
    ------
    ValueError if ``period`` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be a positive number of rows")
    # Positional array: integer keys on a date-indexed Series are deprecated.
    close = df['Close'].to_numpy(dtype=float)
    m = len(close)
    # Cap the placeholders so short frames do not yield an over-long list.
    warmup = min(period, m)
    values = ['N'] * warmup
    # changes[k] is Close[k + 1] - Close[k]
    changes = np.diff(close)
    for j in range(warmup, m):
        window = changes[j - period:j]
        avg_up = window[window > 0].sum() / period
        avg_down = -window[window < 0].sum() / period
        if avg_down == 0:
            # No losses: RS is unbounded (or 0/0 when nothing moved at all).
            values.append(100.0 if avg_up > 0 else 50.0)
            continue
        rs = avg_up / avg_down
        values.append(100 - (100 / (1 + rs)))
    return values


# RSI for windows of 14, 10 and 30 rows
RSI_14, RSI_10, RSI_30 = (RSI(data, span) for span in (14, 10, 30))

# store the three RSI series as dataframe columns
data['10_day_RSI'] = RSI_10
data['14_day_RSI'] = RSI_14
data['30_day_RSI'] = RSI_30



In [36]:
#SRSI: Stochastic RSI
#SRSI = (RSI_today - min(RSI_last_n)) / (max(RSI_last_n) - min(RSI_last_n))
def SRSI(df, n):
    """Stochastic RSI: position of today's RSI inside its recent range.

    The range is taken over a rolling window of the latest RSI values (at
    most ``n`` of them, today included, never the ``'N'`` warm-up entries).
    Because today's RSI is part of the window, the result always lies in
    [0, 1] and needs no clamping.

    Parameters
    ----------
    df : DataFrame passed on to ``RSI``.
    n : RSI period and rolling-window length.

    Returns
    -------
    list of length ``len(df)``; the first ``n`` rows hold ``'N'``. A window
    whose maximum equals its minimum yields 0.
    """
    m = len(df)
    rsi = RSI(df, n)
    # Cap the placeholders so short frames do not yield an over-long list.
    warmup = min(n, m)
    values = ['N'] * warmup
    for j in range(warmup, m):
        # Start no earlier than index n: everything before it is 'N'.
        window = rsi[max(n, j - n + 1):j + 1]
        lowest, highest = min(window), max(window)
        if highest == lowest:
            values.append(0)
        else:
            values.append((rsi[j] - lowest) / (highest - lowest))
    return values

# Stochastic RSI for windows of 10, 14 and 30 rows
SRSI_10, SRSI_14, SRSI_30 = (SRSI(data, span) for span in (10, 14, 30))

# store the three series as dataframe columns
data['SRSI_10'] = SRSI_10
data['SRSI_14'] = SRSI_14
data['SRSI_30'] = SRSI_30



In [37]:
# Trim the rows that cannot be used:
#  - the first 30 rows, where an indicator may still hold its 'N' placeholder
#  - the last row, whose Class label is 'N'
newdata = data.drop(data.index[:30])
newdata = newdata.drop(newdata.index[-1])

#Time series data
#----------------------------------------------------------------
# Drop the raw price columns ('Open', 'Close', 'High', 'Low') in one call
# and renumber the remaining rows from 0.
newdata = newdata.drop(columns=['Open', 'Close', 'High', 'Low'])
newdata = newdata.reset_index(drop=True)

In [38]:
# Write the finished feature table to the CSV file at `data_path`
newdata.to_csv(path_or_buf=data_path)