In [1]:
import pandas as pd
import numpy as np

from numpy.linalg import inv

from itertools import product

from scipy.stats import zscore

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
import yfinance as yf

stocks = pd.read_excel('Data/universe.xlsx')['symbol']

# Convert pd Series of strings into one long string (that the format yf wants)
string_format_stocks = stocks.str.cat(sep=' ')

# Fetch the price data
prices = yf.download(string_format_stocks, start='2022-10-31', end='2022-11-04')

# Extract only the adj close data
adj_close = prices['Adj Close']
adj_close.drop(cryptos, axis=1, inplace=True)

next_week_returns = adj_close.pct_change(periods=4).loc['2022-11-04']
next_week_returns = next_week_returns.to_frame().sort_index()

[*********************100%***********************]  110 of 110 completed


In [4]:
next_week_returns = np.array(next_week_returns['2022-11-04'])

### Read the Data

In [5]:
data = pd.read_csv('Data/data.csv', header=[0, 1], index_col=0)
data.head()

Unnamed: 0_level_0,ABBV,ABBV,ABBV,ABBV,ABBV,ABBV,ABBV,ABBV,ABBV,ABBV,...,XRP-USD,XRP-USD,XRP-USD,XRP-USD,XRP-USD,XRP-USD,XRP-USD,XRP-USD,XRP-USD,XRP-USD
Unnamed: 0_level_1,Adj Close,Close,High,Low,MACD signal,OBV ema,Open,RSI,Volume,economy_fiscal_news_sentiment_label,...,Volume,economy_fiscal_news_sentiment_label,economy_fiscal_news_sentiment_score,economy_macro_news_sentiment_label,economy_macro_news_sentiment_score,economy_monetary_news_sentiment_label,economy_monetary_news_sentiment_score,financial_market_news_sentiment_label,financial_market_news_sentiment_score,news_sentiment
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-01-04,,105.410004,107.349998,103.860001,0.0,0.0,107.18,,9523400.0,,...,6141918380,,,,,,,,,
2021-01-05,0.010341,106.5,107.019997,104.629997,0.012478,1901450.0,105.410004,,6823800.0,,...,5378491929,,,,,,,,,
2021-01-06,-0.008639,105.580002,107.190002,104.18,0.008355,374077.9,104.75,,11017500.0,,...,7552726810,,,,,,,,,
2021-01-07,0.010703,106.709999,107.059998,105.57,0.015928,1212874.0,106.110001,,8196000.0,,...,15955049240,,,,,,,,,
2021-01-08,0.005248,107.269997,107.529999,105.760002,0.030796,2979660.0,106.839996,,5345900.0,,...,9825829516,,,,,,,,,


In [6]:
options = pd.read_csv('Data/options_data.csv')
options.columns = ['Ticker', 'Ratio']
options['Ratio'] = options['Ratio'].fillna(0)
options['Ratio'] = options['Ratio'].replace(np.inf, -1)
options.head()

Unnamed: 0,Ticker,Ratio
0,ABBV,0.624447
1,ACN,0.622912
2,AEP,1.009615
3,AIZ,2.444444
4,ALLE,0.541667


In [2]:
# Get the names of crypto currencies in the data to drop them
cryptos = list(pd.read_excel('Data/universe.xlsx')['symbol'][100:])

### Join the data and convert to a numpy array

In [7]:
# Get the names of tickers in the options_data
tickers = list(options['Ticker'])

# Add put-call ratio to the main data
for row, ticker in enumerate(tickers):

    # Get a df for one topic at a time
    ticker_ratio = options.loc[options['Ticker'] == ticker]['Ratio'][row]
    
    # Make a df with same index as data dataframe and add the ratio
    temp_df = pd.DataFrame(index=data.index)
    temp_df['Ratio'] = ticker_ratio
    
    # Convert columns to multiindex
    temp_df.columns = pd.MultiIndex.from_tuples([(ticker, 'PCR')])

    # Join with the data DataFrame
    data = data.merge(temp_df, left_index=True, right_index=True, how='left')

In [8]:
# Sort the columns
data.sort_index(axis=1, level=0, inplace=True)

In [9]:
### TEMPORARY ###

data = data.loc[:'2022-10-28'] # take data before end of last week for training

In [10]:
# Drop label columns
to_drop = np.array(['news_sentiment',
                    'Adj Close',
                    'Close',
                    'Open',
                    'economy_fiscal_news_sentiment_label', 
                    'economy_macro_news_sentiment_label', 
                    'economy_monetary_news_sentiment_label', 
                    'financial_market_news_sentiment_label'])

data.drop(to_drop, axis=1, level=1, inplace=True)

# Drop all cryptos
data.drop(cryptos, axis=1, level=0, inplace=True)

In [11]:
# Check that all stocks have the same number of columns
np.unique(data.columns.get_level_values(0), return_counts=True)[1]

array([11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11])

In [12]:
# Convert data to a numpy array
data = data.mean(skipna=True, axis=0).to_numpy()

# Convert the numpy array to a matrix
data = data.reshape(11, -1).T

# Convert data to z-scores
data = zscore(data, axis=0)

In [13]:
np.matmul(inv(np.matmul(data.T, data)), np.matmul(data.T, next_week_returns))

array([ 0.00247295,  0.00167409,  0.00048987, -0.00038759,  0.00870224,
       -0.00406843, -0.00322375, -0.00234222, -0.00228927, -0.0014536 ,
       -0.00011032])

In [19]:
next_week_returns

array([-1.35245615e-02, -9.51743205e-02,  8.64424650e-03, -9.16384285e-02,
       -2.31936660e-02, -2.25393345e-02, -3.88189122e-03, -1.28270198e-01,
       -2.86659956e-02, -1.17959302e-02, -6.15695492e-02, -6.34402727e-02,
       -2.70587697e-02,  1.30373321e-02, -1.07623578e-01, -9.80883818e-03,
        1.15758768e-04, -1.69580195e-02, -6.35710664e-02, -5.87283261e-04,
       -2.09717011e-02,  6.13055057e-02, -3.88492242e-02, -1.62359728e-02,
       -2.27798372e-02,  5.96858225e-02,  0.00000000e+00, -1.05739162e-01,
       -1.93050015e-02, -2.08900591e-02, -3.07057784e-02,  4.40993210e-02,
       -1.80036306e-03, -2.76144673e-02, -1.98720156e-02, -2.39998117e-04,
       -1.42653892e-02,  4.24623432e-03,  1.16314995e-02, -2.30047134e-02,
       -1.18001324e-01, -4.55412997e-03,  1.46452947e-02,  5.66214628e-03,
       -4.24770620e-03, -1.00105904e-01, -1.02529048e-02, -6.45592578e-04,
        6.88200489e-03,  8.23327989e-04, -6.23101854e-03,  1.74712498e-02,
        1.46399709e-02, -

In [21]:
np.matmul(data, arr)

array([[-3.10062236e-03],
       [-4.13159831e-03],
       [-7.32179607e-04],
       [ 3.13819534e-03],
       [-3.33039840e-03],
       [ 7.40980745e-03],
       [ 1.99619830e-02],
       [-1.76735067e-02],
       [-3.69563996e-03],
       [ 2.07786709e-02],
       [-2.14728342e-02],
       [-3.38015932e-03],
       [-6.59815316e-03],
       [ 2.37992681e-03],
       [-1.09479879e-03],
       [-2.91056281e-03],
       [-5.92669870e-03],
       [-2.89178870e-03],
       [-3.19723885e-03],
       [-5.38662001e-03],
       [-5.13311983e-03],
       [ 7.48644290e-02],
       [-3.07597736e-03],
       [-1.65495369e-03],
       [ 4.31411952e-03],
       [-1.01847966e-04],
       [-2.48507718e-03],
       [-2.59463692e-03],
       [-3.79664208e-03],
       [ 6.55618189e-03],
       [-2.56793832e-03],
       [ 1.69546683e-02],
       [-6.59187064e-03],
       [-1.56755831e-03],
       [-5.88724818e-04],
       [-1.95239098e-03],
       [-2.48974844e-03],
       [-2.15130293e-03],
       [ 3.0

In [15]:
arr = np.array([ 0.00247295,  0.00167409,  0.00048987, -0.00038759,  0.00870224,
       -0.00406843, -0.00322375, -0.00234222, -0.00228927, -0.0014536 ,
       -0.00011032])

In [16]:
arr = arr.reshape(-1,1)