### Pickle adjustments

Since the pickle files were too large for the free version of Heroku/Docker, the "No Industry" classification is removed for the public version of the website, along with the "Life Sciences", "Technology" and "Real Estate & Construction" classifications.

In [52]:
import pickle
import pandas as pd

In [53]:
# Read the two pickled inputs from the working directory:
# the industry lists and the sorted covariance matrix.
with open("sorted_industries.pkl", "rb") as industries_file:
    industry_lists = pickle.load(industries_file)
sigma_sorted = pd.read_pickle("sorted_covariance.pkl")

### Required size reduction

To use the free versions of Heroku and Docker, the size must be reduced. According to my personal project dashboard, even after cutting both the "Real Estate & Construction" and "No Industry" categories, 213.3% of the available memory is still being used. Since the covariance matrix is square, its size grows with the square of the number of stocks; the following simple calculations therefore show that, with some leeway, the covariance matrix loaded to the website can have a maximum of about 3461 stocks.

In [54]:
# Show the industry names that serve as keys of industry_lists
industry_lists.keys()

dict_keys(['Energy', 'Manufacturing', 'Industrial Applications & Services', 'Financials', 'Life Sciences', 'Real Estate & Construction', 'Trade & Services', 'Technology', 'No Industry'])

In [55]:
# Report how many entries each industry's list contains
for key in industry_lists:
    print(f"{key} has length: {len(industry_lists[key])}")

Energy has length: 726
Manufacturing has length: 830
Industrial Applications & Services has length: 541
Financials has length: 450
Life Sciences has length: 925
Real Estate & Construction has length: 774
Trade & Services has length: 714
Technology has length: 889
No Industry has length: 3933


In [56]:
# Current size, meaning after trying to remove "No Industry" and
# "Real Estate & Construction". The covariance matrix is square, so its
# number of entries is the squared count of the remaining tickers.
TOTAL_TICKERS = 9782
NO_INDUSTRY_TICKERS = 3933
REAL_ESTATE_TICKERS = 774
current = (TOTAL_TICKERS - NO_INDUSTRY_TICKERS - REAL_ESTATE_TICKERS)**2
print(f"Current size at 213.3%: {current}")

# Dividing by 2.15 (slightly more than the 2.133 usage ratio) leaves some leeway.
# Named max_entries instead of max so the builtin max() is not shadowed
# for the rest of the notebook.
max_entries = current/2.15
print(f"Max can be: {max_entries}")

# Side length of the largest square matrix that fits, i.e. the ticker count
print(f"Required matrix size: {max_entries**0.5}")

Current size at 213.3%: 25755625
Max can be: 11979360.46511628
Required matrix size: 3461.121272812653


In [57]:
# Tickers remaining once four categories are dropped from the total:
# No industry, life sciences, technology, real estate
9782 - (3933 + 925 + 889 + 774)

3261

In [58]:
# Remove categories for Docker and Heroku feasibility using free license.
# pop(..., None) is used instead of `del` so the cell is idempotent:
# re-running it does not raise KeyError for categories already removed.
for removed_industry in (
    "No Industry",
    "Life Sciences",
    "Technology",
    "Real Estate & Construction",
):
    industry_lists.pop(removed_industry, None)

# Remaining keys
industry_lists.keys()

dict_keys(['Energy', 'Manufacturing', 'Industrial Applications & Services', 'Financials', 'Trade & Services'])

In [59]:
# Flatten the per-industry ticker lists currently in industry_lists
# into one list, following the dict's industry order.
tickers_in_industries = [
    symbol
    for industry_tickers in industry_lists.values()
    for symbol in industry_tickers
]

# Keep only the rows and columns of sigma_sorted labelled with those tickers
sigma_reduced = sigma_sorted.loc[tickers_in_industries, tickers_in_industries]

sigma_reduced

Unnamed: 0,AAL,AAU,AAV,AAWW,ABX,ADSW,AEE,AEM,AEP,AES,...,YOGA,YQ,YUM,YUMC,YUME,ZCMD,ZEUS,ZOES,ZUMZ,ZVO
AAL,0.002362,0.000261,0.0,0.000441,0.0,0.000061,0.000259,0.000010,0.000211,0.000571,...,-0.000006,-0.000005,0.000381,0.000243,0.0,0.000116,0.000581,0.0,0.000629,0.000434
AAU,0.000261,0.003115,0.0,0.000224,0.0,0.000024,0.000207,0.000173,0.000161,0.000274,...,0.000019,0.000019,0.000181,0.000097,0.0,0.000130,0.000189,0.0,0.000211,0.000274
AAV,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
AAWW,0.000441,0.000224,0.0,0.001543,0.0,0.000033,0.000180,0.000093,0.000135,0.000314,...,0.000030,0.000008,0.000209,0.000192,0.0,0.000117,0.000332,0.0,0.000303,0.000347
ABX,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZCMD,0.000116,0.000130,0.0,0.000117,0.0,0.000009,0.000073,0.000072,0.000055,0.000115,...,0.000020,0.000002,0.000055,0.000052,0.0,0.000970,0.000108,0.0,0.000076,0.000158
ZEUS,0.000581,0.000189,0.0,0.000332,0.0,0.000030,0.000146,0.000039,0.000118,0.000339,...,0.000013,-0.000004,0.000185,0.000159,0.0,0.000108,0.001536,0.0,0.000372,0.000278
ZOES,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
ZUMZ,0.000629,0.000211,0.0,0.000303,0.0,0.000041,0.000210,0.000037,0.000171,0.000388,...,-0.000008,0.000013,0.000270,0.000164,0.0,0.000076,0.000372,0.0,0.001338,0.000309


In [60]:
# Write the updated industry_lists to a new pickle file
with open("new_sorted_industries.pkl", "wb") as industries_out:
    pickle.dump(industry_lists, industries_out)

# Write the reduced covariance matrix to a new pickle file
sigma_reduced.to_pickle("new_sorted_covariance.pkl")