In [233]:
import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

import os
import random
import yfinance as yf
from datetime import datetime as dt

import plotly.graph_objects as go
import glob
import cv2

from torch.optim import Adam
from torchvision import transforms
import torch.nn.functional as F
from torch.utils import data as data_u
from torch.utils.data import DataLoader
from torch import nn, optim
import torchvision
import torch

### Creating the Training Data (create labels)

In [229]:
# read the stock names
stocks = pd.read_excel('Data/universe.xlsx')['symbol'][:50]

# convert pd Series of strings into one long string (that the format yf wants)
string_format_stocks = stocks.str.cat(sep=' ')

# fetch the data
prices = yf.download(string_format_stocks, start='2010-01-01', end=dt.today().strftime('%Y-%m-%d'))

[*********************100%***********************]  50 of 50 completedd

2 Failed downloads:
- CARR: No data found for this date range, symbol may be delisted
- CNC: No data found for this date range, symbol may be delisted


In [230]:
prices.reset_index(inplace=True)
prices.head()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



Unnamed: 0_level_0,Date,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,Unnamed: 1_level_1,ABBV,ACN,AEP,AIZ,ALLE,AMAT,AMP,AMZN,AVB,...,PYPL,RE,ROL,ROST,UNH,URI,V,VRSK,WRK,XOM
0,2010-01-04,,32.970909,21.340595,23.350735,,11.356114,29.852718,6.695,54.045597,...,,444800,839363,15743600,12199500,1692500,20180000,390000,,27809100
1,2010-01-05,,33.174675,21.096283,24.047535,,11.268759,30.548878,6.7345,53.639587,...,,327200,701663,9369600,11180700,1459200,25833600,430000,,30174700
2,2010-01-06,,33.527363,21.310066,23.977861,,11.244935,30.975563,6.6125,53.133728,...,,266400,841388,13144800,9761100,1072900,16254000,848900,,35044700
3,2010-01-07,,33.496006,21.493296,24.272068,,11.125817,31.207626,6.5,53.240242,...,,354600,536119,23984800,11789800,2052800,27841200,426600,,27192100
4,2010-01-08,,33.36277,21.749815,24.28755,,11.554646,31.319908,6.676,52.840878,...,,276900,330581,15926000,7228700,1399000,11907200,253200,,24891800


In [231]:
# extract the dates to use in the sliding window
dates = prices['Date']

# create windows of 15 days that jump every 5 days (5 days overlap)
windows = sliding_window_view(dates, window_shape = 15)[::5]

In [None]:
# create an array to store the associated following week returns for each window
counter = 0

# specify the test ratio
test_ratio = 0.2

# loop through each stock and then through the windows
for stock in stocks:
    # extract the current stock data
    current_stock = prices.iloc[:, (prices.columns.get_level_values(1)==stock) | (prices.columns.get_level_values(0)=='Date')]

    # drop the multiindex column names (stock name is uneccessary)
    current_stock = current_stock.droplevel(level=1, axis=1)
    
    # some stocks dont have historical data from year 2010, so drop na
    current_stock.dropna(inplace=True)
        
    for i in range(len(windows)-1):

        # slice the dataframe
        window_data = current_stock.loc[prices['Date'].isin(windows[i])]
        
        if len(window_data) == 15:
            
            # allocate test_ratio of images to the test folder, others to the train folder
            test = random.random() < test_ratio
            train_or_test = 'test' if test else 'train'

            # make the figure
            fig = go.Figure(data=[go.Candlestick(x=window_data['Date'], open=window_data['Open'], high=window_data['High'],
                                                 low=window_data['Low'], close=window_data['Close'])])

            # remove uneccessary stuff from the figure
            fig.update_yaxes(showticklabels=False)
            fig.update_xaxes(showticklabels=False)
            fig.update_layout(xaxis_rangeslider_visible=False)
            
            # find the associated next week return (if its -0.0 then take the absolute value)
            next_window_data = current_stock.loc[current_stock['Date'].isin(windows[i+1])]
            next_window_data.reset_index(inplace=True, drop=True) # to have the index always from 0 to 9
            following_week_return = next_window_data['Adj Close'].pct_change(periods=5)[5].round(3)
            associated_return = np.round(following_week_return, 1)
            abs(associated_return) if associated_return == -0.0 else associated_return
            
            
            path = f'pattern_images/{train_or_test}/{associated_return}/'

            if not os.path.exists(path):
                os.makedirs(path)
                
            # save the figure
            fig.write_image(path + f'fig {counter}.png')


            # increase the counter
            counter += 1
            

In [122]:
associated_returns_np = np.array(associated_returns)

In [123]:
np.round(associated_returns_np,2)

array([ 0.08,  0.03, -0.01, ...,  0.  , -0.03, -0.07])

In [226]:
len(np.unique(np.round(associated_returns_np,1)))

11

### Create labeled data for the CNN (stock pattern + its return for the following week)

In [150]:
num_images = len(glob.glob('pattern_images/*'))

In [204]:
data = []
for i in range(num_images):
    image = cv2.imread(f'pattern_images/fig {i}.png')
    image = cv2.resize(image, (150, 150))
    associated_return = associated_returns[i]
    data.append([image, associated_return])

In [215]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


model = Network()

In [216]:
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

In [214]:
for epoch in range(20):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

NameError: name 'trainloader' is not defined