
# Predictive Modeling w/ PyTorch
This notebook is a work-in-progress.

- Code to predict stock market movements by feeding large-scale technical indicator data into PyTorch layers.
- Model choice will likely be CNN but TBD.
- The original data is 'close' pricing and daily 'volume'. These are further extended with many technical indicators.
- The technical indicator data is then pulled for stocks whose returns are closely correlated with those of a given input ticker.
- Data for several of these tickers (and their most highly correlated peers) is then concatenated to construct the training dataset.
- Pending: add PyTorch layers and process the data for training and testing.

In [1]:
import os
import os.path
from datetime import datetime
from typing import Union, Any

import pandas as pd
from pandas.errors import EmptyDataError
from sqlalchemy import create_engine, text

import torch
from torch.utils.data import Dataset, random_split, DataLoader
pd.options.mode.chained_assignment = None

In [2]:
# Directory holding the per-symbol technical indicator CSV files. This is a relative
# path, so it is resolved against the working directory the notebook runs in.
BASE_DIR = '../../../../workspace/HelloPython/HistoricalMarketData/TechnicalIndicators'
# Name of the SQL table with historical equities data. NOTE(review): the symbol-universe
# query further down spells the table name out literally instead of using this constant.
TABLE_EQUITIES_DATA = 'equities_historic_data'
DURATIONS = (14, 30, 90, 200)  # Roughly for bi-weekly, monthly, quarterly, and 200 days running averages

In [3]:
# Database settings are read from the environment so that no credentials are
# stored in the notebook. All three variables are mandatory.
_REQUIRED_ENV_VARS = ("DB", "DB_USER", "DB_PWD")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if name not in os.environ]
if _missing_env_vars:
    # Report exactly which variables are missing (DB is required as well,
    # not only DB_USER and DB_PWD).
    raise Exception(
        "Required environment variables not set: " + ", ".join(_missing_env_vars)
    )
DB = os.environ["DB"]
DB_USER = os.environ["DB_USER"]
DB_PWD = os.environ["DB_PWD"]
# NOTE(review): user name and password are inserted into the URL verbatim, so any
# character that is special in a URL (e.g. '@', '/', ':') must already be escaped
# in the environment variable.
DB_URL = 'mysql+mysqlconnector://' + DB_USER + ':' + DB_PWD + '@localhost/' + DB
ENGINE = create_engine(DB_URL)

In [4]:
def generate_file_path(symbol, date=None):
    """
    Builds the path of the local CSV file that holds the calculated
    technical indicator data for a symbol. The path is only constructed
    here; whether the file actually exists is not checked.
    :param symbol: ticker (lower-cased to form the file name)
    :param date: optional date or datetime embedded in the file name as YYYYMMDD
    :return: tuple of (file path, file name)
    """
    file_stem = symbol.lower()
    if date is not None:
        # Calling strftime on the instance also accepts plain datetime.date objects
        file_stem += '_' + date.strftime('%Y%m%d')
    file_name = file_stem + '.csv'
    # os.path.join always returns a string, so there is no "not found" case to report
    return os.path.join(BASE_DIR, file_name), file_name

In [5]:
# def retrieve_technical_indicator_data_for_symbol_old(symbol):
#     """
#     Retrieves data from SQL. Superseded as we are now
#     retrieving historical data from CSV
#     :param symbol:
#     :return:
#     """
#     dt_last_entry = None
#     with ENGINE.connect() as conn:
#         res = conn.execute(text('select * from equities_historic_data where \
#                     symbol like \'' + symbol + '\''))
#     dfrm_existing = pd.DataFrame(res.mappings().all())
#     if dfrm_existing is not None and len(dfrm_existing) > 0:  # Don't do anything if nothing exists for symbol
#         dfrm_existing.set_index('date', inplace=True)
#         dfrm_existing.drop(columns=['netChange', 'pcntChange', 'open', 'high', 'low', 'symbol'], inplace=True)
#         reqd_columns = ['close', 'volume']
#         dfrm_existing.columns = [
#             symbol.upper()+'_'+column
#             for column in dfrm_existing.columns
#             if column in reqd_columns
#         ]
#         dfrm_existing.sort_index(inplace=True, ascending=True)
#     else:
#         print(f"No technical indicators data in DB for '{symbol}'.")
#     return dfrm_existing
#
# tmp_df = retrieve_technical_indicator_data_for_symbol_old('AAPL')
# tmp_df.tail(10)

In [6]:
def retrieve_technical_indicator_data_for_symbol(symbol, columns = None, feature=None) -> Union[pd.DataFrame, None]:
    """
    Retrieves Technical Indicator Data for a given symbol from its
    local CSV file, optionally restricted to given columns, and prefixes
    every column name with a label: Y (dependent), X_i (input feature)
    or, when no label is given, the upper-cased symbol.
    :param symbol: Symbol for which to retrieve Technical Indicator Data
    :param columns: Subset of Technical Indicator Data to retrieve (None keeps all columns)
    :param feature: Y or X_i label where 'i' is feature number (None uses the symbol)
    :return: dataframe indexed by 'date' in ascending order with columns named
             '<LABEL>_<column>', or None when the CSV file is missing or empty
    """
    file_path, _ = generate_file_path(symbol)
    if file_path is None:
        print(f"No technical indicators data in DB for '{symbol}'.")
        return None
    try:
        dfrm = pd.read_csv(file_path)
    except FileNotFoundError:
        print('Exception reading input data for symbol {}. Generating metadata starting from baseline date.'.format(symbol.upper()))
        return None
    except EmptyDataError:
        print(f'No technical indicators found for {symbol.upper()}. Generating metadata starting from baseline date.')
        print('Generating all records.')
        return None

    dfrm['date'] = pd.to_datetime(dfrm['date'])
    # sort_index returns a new frame; its result must be kept for the sort to take effect
    dfrm = dfrm.set_index('date').sort_index(ascending=True)

    prefix = (symbol if feature is None else feature).upper()
    if columns is not None:
        dfrm = dfrm[columns]
    dfrm.columns = [prefix + '_' + column for column in dfrm.columns]
    return dfrm

# Smoke check: load two columns for a single ticker and show the most recent rows
tmp_df = retrieve_technical_indicator_data_for_symbol('AAPL', columns=['volume', 'close'])
tmp_df.tail(n=10)

Unnamed: 0_level_0,AAPL_volume,AAPL_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2025-04-16,59732423,194.27
2025-04-17,52164675,196.98
2025-04-21,46742537,193.16
2025-04-22,52976371,199.74
2025-04-23,52929165,204.6
2025-04-24,47310989,208.37
2025-04-25,38222258,209.28
2025-04-28,37626816,210.14
2025-04-29,36827633,211.21
2025-04-30,52286454,212.5


In [7]:
def find_symbols_w_highest_correlations(symbol, correlations, count):
    """
    Finds the tickers whose close series has the highest correlation
    with the close series of the given symbol
    :param symbol: ticker to find matches for (case-insensitive)
    :param correlations: long-format correlation table with the columns
                         'level_0', 'level_1' (both '<TICKER>_<field>') and 0 (the value)
    :param count: maximum number of tickers to return
    :return: list of tickers ordered from highest to lowest correlation,
             or None when the table has no rows for the symbol
    """
    close_suffix = '_close'
    matches = correlations[correlations['level_0'] == symbol.upper() + close_suffix]
    if matches.empty:
        print("No correlations found for symbol:{}.".format(symbol.upper()))
        print("Likely data does not go far enough back.")
        return None
    # The table may also contain pairs against other fields (e.g. '<TICKER>_volume');
    # only close-vs-close pairs name a ticker, so everything else is left out.
    matches = matches[matches['level_1'].str.endswith(close_suffix)]
    # sort_values without inplace leaves the caller's table and the filtered slice untouched
    ranked = matches.sort_values(0, ascending=False)
    return_symbols = [name[:-len(close_suffix)] for name in ranked['level_1'].tolist()]
    return return_symbols[0:count]

In [8]:
def gen_corrs(dfrm):
    """
    Builds a long-format table of the absolute pairwise correlations
    between all columns of the input dataframe, sorted by ascending
    value and without the pairing of a column with itself
    :param dfrm: dataframe whose columns are correlated against each other
    :return: dataframe with the columns 'level_0', 'level_1' and 0 (absolute correlation)
    """
    abs_corr_matrix = dfrm[dfrm.columns].corr().abs()
    pairs = abs_corr_matrix.unstack().sort_values(kind="quicksort").reset_index()
    is_self_pair = pairs['level_0'] == pairs['level_1']
    return pairs[~is_self_pair]

In [9]:
from functools import reduce
if __name__ == "__main__":
    MIN_ROW_COUNT = 5000 # Minimum number of rows a symbol must have in the DB to join the universe
    COMP_COUNT = 10 # Number of stocks to find high correlations with

    # The threshold is passed as a bound parameter instead of being concatenated into the SQL
    query = 'SELECT symbol FROM equities_historic_data GROUP BY symbol HAVING COUNT(*) > :min_row_count'
    with ENGINE.connect() as conn:
        res = conn.execute(text(query), {'min_row_count': MIN_ROW_COUNT})
        # Fetch the rows while the connection is still open
        dfrm_symbols_list = pd.DataFrame(res.mappings().all())
    symbols = dfrm_symbols_list['symbol'].tolist() # this is our universe of symbol tickers

    # First calculate universal correlations among ALL these symbols.
    # Both the 'close' and the 'volume' column of every symbol take part.
    dfrm_list_daily_closes = list()
    for symbol in symbols:
        dfrm = retrieve_technical_indicator_data_for_symbol(symbol, ['close', 'volume'])
        if dfrm is None:
            # No usable CSV for this symbol (already reported by the helper);
            # a None entry would break the merge below.
            continue
        dfrm_list_daily_closes.append(dfrm)
    # Inner join on the date index: only dates present for every symbol survive
    merged_dfrm_daily_close_values = reduce(lambda left, right: pd.merge(left, right, on='date'), dfrm_list_daily_closes)
    correlations = gen_corrs(merged_dfrm_daily_close_values)

    # Now merge a symbol's data with the data of the symbols
    # it has the highest correlations with. Then stack (concat)
    # all these frames across symbols to come up with our
    # training dataset

    # Data for a symbol and its corrs matches
    # Think of this as a single line in a matrix
    dfrm_list_single_row_data = list()
    # Data for all symbol and their corrs matches
    # Think of this as a matrix
    dfrm_list_full_matrix_data = list()
    tgt_symbols_list = ['JPM', 'C', 'MS', 'GS', 'WFC', 'BAC'] # Generate training data for this set only
    for symbol in tgt_symbols_list:
        # Look up the matches before collecting any data, so that a skipped symbol
        # leaves nothing behind in the per-symbol list for the next iteration.
        symbols_w_highest_corrs = find_symbols_w_highest_correlations(symbol, correlations, COMP_COUNT)
        if symbols_w_highest_corrs is None:
            continue # Likely not enough data for a symbol. Continue with rest
        print(f'Symbols with highest correlations with {symbol.upper()} are {symbols_w_highest_corrs}')
        # Our dependent variable Y - generalize the name so we can stack up many symbols for training dataset
        dfrm_symbol = retrieve_technical_indicator_data_for_symbol(symbol, None, 'Y')
        dfrm_list_single_row_data.append(dfrm_symbol)
        for counter, comp_symbol in enumerate(symbols_w_highest_corrs, start=1):
            feature = 'X_'+str(counter)
            dfrm_tmp = retrieve_technical_indicator_data_for_symbol(comp_symbol, None, feature)
            dfrm_list_single_row_data.append(dfrm_tmp)
        # Now merge the symbol and all its corrs data against the same date
        merged_df = reduce(lambda left, right: pd.merge(left, right, on='date'), dfrm_list_single_row_data)
        dfrm_list_full_matrix_data.append(merged_df)
        dfrm_list_single_row_data.clear()
    # Now concatenate data. This means dates are repeated
    dfrm_aggr_training_data = pd.concat(dfrm_list_full_matrix_data)

Symbols with highest correlations with JPM are ['TXN', 'PNC', 'MAR', 'SPGI', 'SCHW', 'SIVB', 'SYK', 'WM', 'RJF', 'SNPS']
Symbols with highest correlations with C are ['AIG', 'RF', 'KEY', 'ZION', 'HIG', 'HBAN', 'ARNC', 'FITB', 'KIM', 'XRX']
Symbols with highest correlations with MS are ['HIG', 'DRE', 'ZION', 'KEY', 'FITB', 'LNC', 'MGM', 'RF', 'XRX', 'MET']
Symbols with highest correlations with GS are ['BK', 'STT', 'PRU', 'NTRS', 'CMA', 'CME', 'LNC', 'CBRE', 'SCHW', 'PFG']
Symbols with highest correlations with WFC are ['USB', 'DIS', 'PPG', 'UPS', 'SNA', 'IPG', 'OMC', 'MDLZ', 'AMGN', 'WHR']
No correlations found for symbol:BAC.
Likely data does not go far enough back.


In [10]:
# Select the merged frame of one target symbol by its Y_symbol value rather than by
# list position: the position shifts whenever an earlier target symbol is skipped.
single_symbol = 'WFC'
dfrm_single_symbol_training_data = next(
    (frame for frame in dfrm_list_full_matrix_data
     if (frame['Y_symbol'] == single_symbol).any()),
    None,
)
if dfrm_single_symbol_training_data is None:
    raise ValueError(f"No merged training data available for symbol '{single_symbol}'.")
dfrm_single_symbol_training_data


Unnamed: 0_level_0,Y_symbol,Y_close,Y_volume,Y_mean_200,Y_stddev_200,Y_pcntleStdDevs_200,Y_pcntleVolume_200,Y_pcntleClosing_200,Y_oscillator_200,Y_accu_dist_200,...,X_10_stddev_90,X_10_accu_dist_200,X_10_bollingerLower_200,X_10_bollingerUpper_200,X_10_mean_200,X_10_oscillator_200,X_10_pcntleClosing_200,X_10_pcntleStdDevs_200,X_10_pcntleVolume_200,X_10_stddev_200
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-02-20,WFC,22.970,8890000,21.957782,1.087244,48.120301,53.759398,75.939850,79.662921,5.616507e+06,...,2.900307,192787.319606,54.530495,76.725144,65.627820,63.616558,43.609023,47.368421,51.127820,5.548662
2002-02-21,WFC,22.875,8520000,21.964627,1.086044,46.268657,47.761194,73.880597,77.528090,5.208397e+06,...,2.905524,222302.823790,54.577250,76.689914,65.633582,65.708061,47.761194,46.268657,67.910448,5.528166
2002-02-22,WFC,23.030,7470000,21.972519,1.085862,45.925926,28.888889,78.518519,81.011236,5.858275e+06,...,2.895570,199716.880497,54.621371,76.651814,65.636593,64.139434,45.185185,45.925926,46.666667,5.507611
2002-02-25,WFC,23.300,6900000,21.959586,1.089206,50.375940,21.052632,87.593985,87.078652,7.053475e+06,...,2.886939,340260.394450,54.521475,76.671306,65.596391,73.769063,64.661654,48.872180,92.857143,5.537458
2002-02-26,WFC,23.315,7350000,21.963571,1.093211,52.631579,27.067669,87.969925,87.415730,7.121648e+06,...,2.897748,667194.117647,54.494894,76.809166,65.652030,96.209150,97.744361,54.135338,97.744361,5.578568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-24,WFC,69.560,18420882,71.256277,5.741431,27.007299,90.510949,31.386861,51.112943,1.742183e+05,...,12.959162,-537789.698593,77.652943,132.443115,105.048029,3.440285,5.109489,100.000000,98.540146,13.697543
2025-04-25,WFC,69.730,17775914,71.245217,5.721914,26.811594,89.130435,31.884058,51.813685,2.865265e+05,...,12.297850,-577349.404689,77.149854,132.545508,104.847681,0.641711,2.898551,100.000000,92.753623,13.848913
2025-04-28,WFC,69.430,16694076,71.538750,5.372290,,86.764706,29.411765,49.345163,-1.056172e+05,...,11.482734,-591375.952469,76.361517,132.970689,104.666103,0.035651,1.470588,100.000000,86.029412,14.152293
2025-04-29,WFC,71.100,24563554,71.636912,5.238784,,93.382353,42.647059,49.510763,-8.045819e+04,...,10.567088,-598296.095588,75.759582,133.181007,104.470294,0.000000,0.735294,100.000000,91.176471,14.355356


In [11]:
# Configuration for splitting the single-symbol data into the independent
# feature matrix (X) and the dependent vector (y), and into train and test rows
tgt_field_column_name, tgt_symbol_column_name = 'Y_close', 'Y_symbol'
batch_size = 5
train_to_total_ratio = 0.8
test_to_total_ratio = 1 - train_to_total_ratio
len_data = dfrm_single_symbol_training_data.shape[0]

# Both the train and the test dataset must be a perfect multiple of the batch
# size, so each offset is rounded down to the nearest multiple of batch_size.
# Try to find some API to simplify this.
test_row_count = int(test_to_total_ratio * len_data)
offset_test_data_start = (len_data - test_row_count) // batch_size * batch_size
offset_test_date_end = (offset_test_data_start + test_row_count) // batch_size * batch_size

In [12]:
# Rows [batch_size, offset_test_data_start) become the training set and rows
# [offset_test_data_start, offset_test_date_end) the test set; all columns are kept
dfrm_training = dfrm_single_symbol_training_data.iloc[batch_size:offset_test_data_start]
dfrm_test = dfrm_single_symbol_training_data.iloc[offset_test_data_start:offset_test_date_end]
for split_frame in (dfrm_training, dfrm_test):
    print(len(split_frame))

4650
1160


In [13]:
# Neither the target value nor the ticker label is an input feature: the label
# column holds strings and cannot be fed to a model as a number.
non_feature_columns = {tgt_field_column_name, tgt_symbol_column_name}
x_columns = [column for column in dfrm_training.columns if column not in non_feature_columns]
Y_train = dfrm_training[tgt_field_column_name]
X_train = dfrm_training[x_columns]
Y_test = dfrm_test[tgt_field_column_name]
X_test = dfrm_test[x_columns]

# Summary statistics of every split as a quick sanity check
print(Y_train.describe())
print(Y_test.describe())
print(X_train.describe())
print(X_test.describe())

count    4650.000000
mean       37.244594
std        11.469257
min         8.120000
25%        28.250000
50%        33.275000
75%        48.697500
max        65.930000
Name: Y_close, dtype: float64
count    1160.000000
mean       48.072129
std        11.865386
min        21.140000
25%        41.355000
50%        46.130000
75%        54.852500
max        81.420000
Name: Y_close, dtype: float64
           Y_volume   Y_mean_200  Y_stddev_200  Y_pcntleStdDevs_200  \
count  4.650000e+03  4650.000000   4650.000000          4650.000000   
mean   2.557886e+07    37.179850      2.234084            54.304659   
std    2.877056e+07    11.173868      1.759607            37.009937   
min    1.770000e+06    19.912029      0.429302             0.704225   
25%    1.079000e+07    28.583507      1.124180            18.705036   
50%    1.738000e+07    33.170504      1.736417            57.142857   
75%    3.000750e+07    48.738753      2.693612            93.525180   
max    4.787400e+08    57.500949    

In [14]:
# Create DataLoaders for training and testing
def _to_loader(data, loader_batch_size):
    """
    Wraps a pandas Series or DataFrame in a DataLoader over a float32 tensor.

    A DataLoader fetches samples with integer indexing. On a DataFrame that is a
    column lookup rather than a row lookup, and on a date-indexed Series it relies
    on a deprecated positional fallback, so the pandas object is converted to a
    tensor first.
    :param data: Series (one value per row) or DataFrame (one feature vector per row)
    :param loader_batch_size: number of rows per batch
    :return: DataLoader yielding tensors with loader_batch_size rows, in original row order
    """
    if isinstance(data, pd.DataFrame):
        # Only numeric columns can be turned into a tensor (e.g. a ticker label cannot)
        data = data.select_dtypes(include='number')
    # NOTE(review): missing values are carried over as NaN - confirm how they should be handled
    values = torch.tensor(data.to_numpy(dtype='float32'))
    return DataLoader(values, batch_size=loader_batch_size, shuffle=False)

Y_train_loader = _to_loader(Y_train, batch_size)
Y_test_loader = _to_loader(Y_test, batch_size)
X_train_loader = _to_loader(X_train, batch_size)
X_test_loader = _to_loader(X_test, batch_size)