In [10]:
from datetime import datetime as dt
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import yfinance as yf
from numpy.lib.stride_tricks import sliding_window_view

### Creating the Training Data (create labels)

In [40]:
# load the universe sheet and keep the first 50 ticker symbols
universe = pd.read_excel('Data/universe.xlsx')
stocks = universe['symbol'][:50]

# yf.download is handed every ticker in one space-separated string,
# so collapse the Series of symbols into that single string
string_format_stocks = stocks.str.cat(sep=' ')

# download prices from the start of 2010 up to today's date
download_end = dt.today().strftime('%Y-%m-%d')
prices = yf.download(string_format_stocks, start='2010-01-01', end=download_end)

[*********************100%***********************]  50 of 50 completed


In [41]:
# Move the date index into a regular 'Date' column so later cells can use prices['Date'].
# Rebinding (instead of inplace=True) plus the guard keeps this cell safe to re-run:
# a second in-place reset would push the fresh 0..n-1 index in as an extra column.
if 'Date' not in prices.columns:
    prices = prices.reset_index()
prices.head()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Unnamed: 0_level_0,Date,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,Unnamed: 1_level_1,ABBV,ACN,AEP,AIZ,ALLE,AMAT,AMP,AMZN,AVB,...,PYPL,RE,ROL,ROST,UNH,URI,V,VRSK,WRK,XOM
0,2010-01-04,,32.970917,21.340599,23.350733,,11.356115,29.852739,6.695,54.045593,...,,444800,839363,15743600,12199500,1692500,20180000,390000,,27809100
1,2010-01-05,,33.174686,21.096291,24.047543,,11.268762,30.548887,6.7345,53.639587,...,,327200,701663,9369600,11180700,1459200,25833600,430000,,30174700
2,2010-01-06,,33.52734,21.310064,23.977859,,11.244937,30.975569,6.6125,53.133747,...,,266400,841388,13144800,9761100,1072900,16254000,848900,,35044700
3,2010-01-07,,33.496006,21.493296,24.272064,,11.125815,31.207617,6.5,53.240238,...,,354600,536119,23984800,11789800,2052800,27841200,426600,,27192100
4,2010-01-08,,33.362774,21.74983,24.28755,,11.554647,31.319906,6.676,52.840893,...,,276900,330581,15926000,7228700,1399000,11907200,253200,,24891800


In [42]:
# extract the dates to use in the sliding window
dates = prices['Date']

# window geometry, counted in rows of `prices`
WINDOW_SIZE = 15  # number of consecutive dates in each window
WINDOW_STEP = 5   # number of rows between the starts of consecutive windows

# create windows of WINDOW_SIZE dates whose starts are WINDOW_STEP rows apart
# (with 15 and 5, consecutive windows share 10 dates)
windows = sliding_window_view(dates, window_shape=WINDOW_SIZE)[::WINDOW_STEP]

In [33]:
# Build the training set: one candlestick image per window and, as its label,
# the 5-row (one trading week) return that follows that window.

# make sure the output folder exists; saving the first image would otherwise fail
Path('pattern_images').mkdir(parents=True, exist_ok=True)

# NOTE(review): everything below treats prices['Open'], prices['Adj Close'], ... as a
# single series, i.e. one ticker. The download cell fetches 50 tickers, which makes each
# of these a 50-column frame -- confirm which single-ticker frame this cell is meant to
# run on before relying on a Restart & Run All.

# create an array to store the associated following week returns for each window
# (the last window has no following window, hence the -1)
associated_returns = np.zeros(len(windows)-1)

for i in range(len(windows)-1):

    # slice the dataframe down to the dates of the current window
    window_data = prices.loc[prices['Date'].isin(windows[i])]

    # make the figure
    fig = go.Figure(data=[go.Candlestick(x=window_data['Date'], open=window_data['Open'], high=window_data['High'],
                                         low=window_data['Low'], close=window_data['Close'])])

    # remove unnecessary stuff from the figure (tick labels and the range slider)
    fig.update_yaxes(showticklabels=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_layout(xaxis_rangeslider_visible=False)

    # save the figure
    fig.write_image(f'pattern_images/fig {i}.png')

    # find the associated next week return and write it to the associated_returns array.
    # The next window ends 5 rows after the current one, so the LAST 5-period return
    # inside it runs from the current window's final close to the close 5 rows later.
    # (Reading position 5 instead would measure rows 5 -> 10 of the current window,
    # i.e. price action that is already drawn in the image being labelled.)
    next_window_data = prices.loc[prices['Date'].isin(windows[i+1])]
    following_week_return = next_window_data['Adj Close'].pct_change(periods=5).iloc[-1].round(3)
    associated_returns[i] = following_week_return

### Create labeled data for the CNN (stock pattern + its return for the following week)