#  End-to-end AWS Quantitative Analysis

Reference article (Towards Data Science, "End-to-end quantitative trading, part 1"): https://towardsdatascience.com/end-to-end-quantitative-trading-part-1-798dcfeb165a

In [1]:
# One-time setup: uncomment the next line to install yfinance (imported below as `yf`).
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install yfinance

In [2]:
import yfinance as yf
import pandas as pd

In [3]:
# Fetch AAPL price history through yfinance for the fixed window
# 2015-02-13 .. 2020-02-13, with the download progress bar switched off.
stock_df = yf.download(
    'AAPL',
    start='2015-02-13',
    end='2020-02-13',
    progress=False,
)

In [4]:
# Inspect the downloaded frame (Open, High, Low, Close, Adj Close, Volume indexed by Date).
stock_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-13,127.279999,127.279999,125.650002,127.080002,116.946587,54272200
2015-02-17,127.489998,128.880005,126.919998,127.830002,117.636772,63152400
2015-02-18,127.629997,128.779999,127.449997,128.720001,118.455811,44891700
2015-02-19,128.479996,129.029999,128.330002,128.449997,118.207336,37362400
2015-02-20,128.619995,129.500000,128.050003,129.500000,119.173599,48948400
...,...,...,...,...,...,...
2020-02-06,322.570007,325.220001,320.260010,325.209991,324.440002,26356400
2020-02-07,322.369995,323.399994,318.000000,320.029999,320.029999,29421000
2020-02-10,314.179993,321.549988,313.850006,321.549988,321.549988,27337200
2020-02-11,323.600006,323.899994,318.709991,319.609985,319.609985,23580800


In [5]:
def get_stock_data(STOCK_ID, start_time, end_time):
    """Download the price history of a single ticker through yfinance.

    Parameters
    ----------
    STOCK_ID
        Ticker symbol forwarded to ``yf.download`` (the notebook uses 'AAPL').
    start_time, end_time
        Forwarded as the ``start`` and ``end`` arguments of ``yf.download``.

    Returns
    -------
    The object returned by ``yf.download`` for this request, unmodified.
    """
    # The progress bar is disabled so the call produces no console output.
    return yf.download(
        STOCK_ID,
        start=start_time,
        end=end_time,
        progress=False,
    )

In [6]:
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [7]:
# Build a five-year window that ends today.  The clock is read exactly once
# and the start is derived from that same instant; two independent now()
# calls could land on different calendar days when run around midnight.
end_date = datetime.now()
start_date = end_date - relativedelta(years=5)

# Render both endpoints as 'YYYY-M-D' strings (month/day are not zero-padded).
start_date = f'{start_date.year}-{start_date.month}-{start_date.day}'
end_date = f'{end_date.year}-{end_date.month}-{end_date.day}'

print(end_date)
print(start_date)

2020-3-30
2015-3-30


In [8]:
def get_start_and_end_dates(difference=5):
    """Return the start and end dates of a window that ends today.

    Parameters
    ----------
    difference : default 5
        Length of the window in years; passed to ``relativedelta(years=...)``.

    Returns
    -------
    tuple of (str, str)
        ``(start_date, end_date)`` formatted as 'YYYY-M-D' with no zero
        padding, e.g. ``('2015-3-30', '2020-3-30')``.
    """
    # Read the clock once so both endpoints come from the same instant;
    # separate now() calls could straddle midnight and yield a window that
    # is not exactly `difference` years long.
    today = datetime.now()
    window_start = today - relativedelta(years=difference)

    start_date = f'{window_start.year}-{window_start.month}-{window_start.day}'
    end_date = f'{today.year}-{today.month}-{today.day}'

    return start_date, end_date

In [9]:
# Quick check of the helper using its default five-year window.
get_start_and_end_dates()

('2015-3-30', '2020-3-30')

In [10]:
# The drop below is left disabled, so 'Adj Close' stays in the frame.
#stock_df.drop('Adj Close', axis=1, inplace=True)

# Same-day return: (Close - Open) / Open, i.e. the fractional move from the
# day's open to its close.
stock_df['Return'] = stock_df['Close'].sub(stock_df['Open']).div(stock_df['Open'])

In [11]:
# Re-display the frame to check the newly added 'Return' column.
stock_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-02-13,127.279999,127.279999,125.650002,127.080002,116.946587,54272200,-0.001571
2015-02-17,127.489998,128.880005,126.919998,127.830002,117.636772,63152400,0.002667
2015-02-18,127.629997,128.779999,127.449997,128.720001,118.455811,44891700,0.008540
2015-02-19,128.479996,129.029999,128.330002,128.449997,118.207336,37362400,-0.000233
2015-02-20,128.619995,129.500000,128.050003,129.500000,119.173599,48948400,0.006842
...,...,...,...,...,...,...,...
2020-02-06,322.570007,325.220001,320.260010,325.209991,324.440002,26356400,0.008184
2020-02-07,322.369995,323.399994,318.000000,320.029999,320.029999,29421000,-0.007259
2020-02-10,314.179993,321.549988,313.850006,321.549988,321.549988,27337200,0.023458
2020-02-11,323.600006,323.899994,318.709991,319.609985,319.609985,23580800,-0.012330


In [12]:
# Look at the 'Open' column on its own (a float64 Series indexed by Date).
stock_df['Open']

Date
2015-02-13    127.279999
2015-02-17    127.489998
2015-02-18    127.629997
2015-02-19    128.479996
2015-02-20    128.619995
                 ...    
2020-02-06    322.570007
2020-02-07    322.369995
2020-02-10    314.179993
2020-02-11    323.600006
2020-02-12    321.470001
Name: Open, Length: 1258, dtype: float64

In [13]:
import numpy as np

In [14]:
# Adding features
for column in ['Open', 'Close', 'High', 'Low', 'Volume']:
    stock_df[f'{column}_pct'] = stock_df[column].pct_change()
#    stock_df.drop(column, axis=1, inplace=True)

# Change the target to "tomorrow's target"
stock_df['Return'] = stock_df['Return'].shift(-1)
# Adding target
stock_df['Target'] = np.where(stock_df['Return'] > 0.0, 1.0, 0.0)

# Don't need return or the date
#stock_df.drop(['Return', 'Date'], axis=1, inplace=True)

stock_df.dropna(inplace=True) # Using pct_change introduces NA 
                              # values

In [15]:
# Inspect the frame after adding the *_pct features and 'Target'; rows with NaN were dropped.
stock_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Return,Open_pct,Close_pct,High_pct,Low_pct,Volume_pct,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-02-17,127.489998,128.880005,126.919998,127.830002,117.636772,63152400,0.008540,0.001650,0.005902,0.012571,0.010107,0.163623,1.0
2015-02-18,127.629997,128.779999,127.449997,128.720001,118.455811,44891700,-0.000233,0.001098,0.006962,-0.000776,0.004176,-0.289153,0.0
2015-02-19,128.479996,129.029999,128.330002,128.449997,118.207336,37362400,0.006842,0.006660,-0.002098,0.001941,0.006905,-0.167721,1.0
2015-02-20,128.619995,129.500000,128.050003,129.500000,119.173599,48948400,0.022920,0.001090,0.008174,0.003643,-0.002182,0.310098,1.0
2015-02-23,130.020004,133.000000,129.660004,133.000000,122.394508,70974100,-0.005792,0.010885,0.027027,0.027027,0.012573,0.449978,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-02-05,323.519989,324.760010,318.950012,321.450012,320.688934,29706700,0.008184,0.026038,0.008154,0.016018,0.016963,-0.130216,1.0
2020-02-06,322.570007,325.220001,320.260010,325.209991,324.440002,26356400,-0.007259,-0.002936,0.011697,0.001416,0.004107,-0.112779,0.0
2020-02-07,322.369995,323.399994,318.000000,320.029999,320.029999,29421000,0.023458,-0.000620,-0.015928,-0.005596,-0.007057,0.116275,1.0
2020-02-10,314.179993,321.549988,313.850006,321.549988,321.549988,27337200,-0.012330,-0.025406,0.004750,-0.005720,-0.013050,-0.070827,0.0
