In [1]:
import talib
from talib import RSI, BBANDS, MACD, ATR
import joblib
import numpy as np
from datetime import datetime
import yfinance as yf
import pandas as pd



### Get tickers: scrape the S&P Merval constituents from Wikipedia

In [2]:
import requests
from bs4 import BeautifulSoup

# Collect the ticker of every S&P Merval constituent listed on the Spanish
# Wikipedia page, from both the "Panel Líder" table and the general-panel table.
url = "https://es.wikipedia.org/wiki/S%26P_Merval"

# Fail fast on a hung connection or an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')


def tickers_from_section(page, section_id):
    """Return the ``.BA``-suffixed tickers from the table that follows a heading.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed page to search.
    section_id : str
        HTML ``id`` of the element that precedes the wanted table.

    Returns
    -------
    list of str
        Stripped text of the first cell of every data row, with ``.BA`` appended.

    Raises
    ------
    ValueError
        If the element with ``section_id`` or the table after it cannot be found
        (for example because the page layout changed).
    """
    section = page.find(id=section_id)
    if section is None:
        raise ValueError(f"Section with id {section_id!r} not found in {url}")
    table = section.find_next("table")
    if table is None:
        raise ValueError(f"No table found after section {section_id!r} in {url}")

    tickers = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        if not cols:
            # Rows made only of <th> cells (or empty rows) carry no ticker.
            continue
        ticker = cols[0].text.strip()
        if ticker:
            tickers.append(ticker + ".BA")
    return tickers


ba_tickers = (
    tickers_from_section(soup, "Empresas_que_participan_Panel_Líder")
    + tickers_from_section(soup, "Empresas_panel_general")
)

len(ba_tickers)

62

### Download daily price history from Yahoo Finance (yfinance)

In [51]:
import yfinance as yf
import pandas as pd

# Column order of the combined frame built below.
PRICE_COLUMNS = ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume',
				'ex-dividend', 'split_ratio', 'adj_open', 'adj_high', 'adj_low',
				'adj_close', 'adj_volume']

# Collect one frame per ticker and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows every time.
frames = []
for symbol in ba_tickers:
	try:
		raw = yf.download(symbol, start='2024-05-18', end='2024-07-20', progress=False)
		if raw.empty:
			# Nothing was returned for this ticker, so there are no rows to keep.
			continue
		raw = raw.reset_index()
		stock_data = pd.DataFrame({
			'ticker': symbol,
			'date': raw['Date'],
			'open': raw['Open'],
			'high': raw['High'],
			'low': raw['Low'],
			'close': raw['Close'],
			'volume': raw['Volume'],
			'ex-dividend': 0,  # Placeholder as Yahoo Finance does not provide this
			'split_ratio': 1,  # Placeholder as Yahoo Finance does not provide this
			# Only the close has a separate adjusted series; the other adj_*
			# columns reuse the unadjusted values.
			'adj_open': raw['Open'],
			'adj_high': raw['High'],
			'adj_low': raw['Low'],
			'adj_close': raw['Adj Close'],
			'adj_volume': raw['Volume'],
		}, columns=PRICE_COLUMNS)
		frames.append(stock_data)
	except Exception as e:
		# Best effort: report the ticker and carry on with the remaining ones.
		print(f"Failed to download data for {symbol}: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no ticker produced data.
all_data = pd.concat(frames) if frames else pd.DataFrame(columns=PRICE_COLUMNS)



1 Failed download:
['ESME.BA']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')

1 Failed download:
['PGR.BA']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')

1 Failed download:
['RICH.BA']: YFInvalidPeriodError("%ticker%: Period 'max' is invalid, must be one of ['1d', '5d']")

1 Failed download:
['TGLT.BA']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


### Feature engineering: technical indicators per symbol

In [52]:
# Build the (ticker, date)-indexed OHLCV frame from the downloaded data.
prices = all_data.set_index(['ticker', 'date']).sort_index()
prices = prices[['open', 'close', 'high', 'low', 'volume']]
DATA_STORE = '../notebooks/data/assets.h5'
# Open read-only: the default mode ('a') would create an empty file when the
# path is wrong and opens the store for writing although it is only read here.
with pd.HDFStore(DATA_STORE, mode='r') as store:
	metadata = (store['merval/stocks'].loc[:, ['marketcap', 'sector']])

prices.volume /= 1e3 # make vol figures a bit smaller
# Give both frames a 'symbol' index name.
prices.index.names = ['symbol', 'date']
metadata.index.name = 'symbol'


In [53]:
# Show the (symbol, date)-indexed OHLCV frame assembled in the previous cell.
prices

Unnamed: 0_level_0,Unnamed: 1_level_0,open,close,high,low,volume
symbol,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ALUA.BA,2024-05-20,937.0,969.0,972.0,925.0,1080.973
ALUA.BA,2024-05-21,972.0,1027.5,1031.0,972.0,1878.298
ALUA.BA,2024-05-22,1035.0,1051.0,1053.0,1018.0,998.396
ALUA.BA,2024-05-23,1055.0,1026.5,1055.0,1000.0,871.274
ALUA.BA,2024-05-24,1002.5,1001.5,1015.0,964.5,2.299
...,...,...,...,...,...,...
YPFD.BA,2024-07-15,30000.0,27350.0,30000.0,27225.0,356.566
YPFD.BA,2024-07-16,27200.0,26675.0,27200.0,25425.0,426.408
YPFD.BA,2024-07-17,26700.0,26100.0,26725.0,25600.0,280.663
YPFD.BA,2024-07-18,26325.0,26400.0,26750.0,26125.0,205.158


In [54]:
# RSI over the whole `close` column at once.
# NOTE(review): `prices` holds several symbols stacked in one (symbol, date)
# index, so this call runs across symbol boundaries; the per-symbol version is
# the groupby-based computation in the following cell.
RSI(prices.close)


symbol   date      
ALUA.BA  2024-05-20          NaN
         2024-05-21          NaN
         2024-05-22          NaN
         2024-05-23          NaN
         2024-05-24          NaN
                         ...    
YPFD.BA  2024-07-15    55.240434
         2024-07-16    51.839437
         2024-07-17    49.068053
         2024-07-18    50.553344
         2024-07-19    54.041444
Length: 2378, dtype: float64

In [55]:
# Rebuild `prices` from `all_data` so the feature columns added further down
# always start from a fresh OHLCV frame when this cell is re-run.
prices = all_data.set_index(['ticker', 'date']).sort_index()
prices = prices[['open', 'close', 'high', 'low', 'volume']]
DATA_STORE = '../notebooks/data/assets.h5'
# Open read-only: the default mode ('a') would create an empty file when the
# path is wrong and opens the store for writing although it is only read here.
with pd.HDFStore(DATA_STORE, mode='r') as store:
	metadata = (store['merval/stocks'].loc[:, ['marketcap', 'sector']])

prices.volume /= 1e3 # make vol figures a bit smaller
# Give both frames a 'symbol' index name; the metadata join below relies on it.
prices.index.names = ['symbol', 'date']
metadata.index.name = 'symbol'

# RSI, computed separately for each symbol.
# group_keys=False keeps the original (symbol, date) index on the result, so the
# assignment aligns on index labels instead of depending on the row order of
# the groupby output (which a positional `.values` assignment would).
rsi = prices.groupby(level='symbol', group_keys=False).close.apply(RSI)
prices['rsi'] = rsi

# BB
def compute_bb(close):
	"""Return the outer Bollinger bands of ``close`` as a two-column frame.

	``BBANDS`` is called with ``timeperiod=20``; of its three outputs only the
	first and the last are kept, as columns ``bb_high`` and ``bb_low``. The
	result is indexed like ``close``.
	"""
	upper, _middle, lower = BBANDS(close, timeperiod=20)
	bands = {'bb_high': upper, 'bb_low': lower}
	return pd.DataFrame(bands, index=close.index)
# Per-symbol Bollinger bands. group_keys=False keeps the original
# (symbol, date) index, so the band columns align with `prices` by label rather
# than by row position.
bb = prices.groupby(level='symbol', group_keys=False).close.apply(compute_bb)
# Store the bands as log1p of the relative gap between the close and each band:
#   bb_high = log1p((band_high - close) / band_high)
#   bb_low  = log1p((close - band_low) / close)
prices['bb_high'] = bb.bb_high.sub(prices.close).div(bb.bb_high).apply(np.log1p)
prices['bb_low'] = prices.close.sub(bb.bb_low).div(prices.close).apply(np.log1p)

# NATR, evaluated on each symbol's own high/low/close history.
prices['NATR'] = (
	prices
	.groupby(level='symbol', group_keys=False)
	.apply(lambda grp: talib.NATR(grp.high, grp.low, grp.close))
)
def compute_atr(stock_data):
	"""Return the 14-period ATR of ``stock_data`` as a z-score.

	``stock_data`` must expose ``high``, ``low`` and ``close``. The ATR series
	is centred on its own mean and divided by its own standard deviation
	(the ``.std()`` default of the returned series).
	"""
	atr = ATR(stock_data.high, stock_data.low, stock_data.close, timeperiod=14)
	centered = atr - atr.mean()
	return centered / atr.std()
# Standardised ATR for each symbol; group_keys=False leaves the result on the
# original (symbol, date) index so it aligns with `prices`.
atr_by_symbol = prices.groupby('symbol', group_keys=False).apply(compute_atr)
prices['ATR'] = atr_by_symbol

# PPO of each symbol's close, using talib.PPO's default arguments.
by_ticker = prices.groupby('symbol', group_keys=False)
prices['ppo'] = by_ticker['close'].apply(talib.PPO)

# MACD
def compute_macd(close):
	"""Return the first ``MACD`` output for ``close`` as a z-score.

	The series is centred on its mean and divided by its population standard
	deviation (``ddof=0``, the same value ``np.std`` yields).
	"""
	macd_line, _signal, _hist = MACD(close)
	centered = macd_line - macd_line.mean()
	return centered / macd_line.std(ddof=0)
# Standardised MACD for each symbol's close; group_keys=False keeps the
# original (symbol, date) index on the result.
close_by_symbol = prices.groupby('symbol', group_keys=False).close
prices['MACD'] = close_by_symbol.apply(compute_macd)

# Combine price and metadata: sector labels are replaced by the integer codes
# from pd.factorize, then joined onto the price frame.
sector_codes, _sector_labels = pd.factorize(metadata.sector)
metadata.sector = sector_codes.astype(int)
prices = prices.join(metadata.loc[:, ['sector']])

# Calendar columns taken from the 'date' level of the index.
date_level = prices.index.get_level_values('date')
prices['year'] = date_level.year
prices['month'] = date_level.month
prices['weekday'] = date_level.weekday

In [65]:
# Number of NaN values in each column for one symbol, plus one.
# NOTE(review): presumably meant as the 1-based row at which each indicator
# first has a value, which holds only if all NaNs sit at the start of the series.
prices.xs(key='ALUA.BA', level='symbol').isna().sum() + 1


open        1
close       1
high        1
low         1
volume      1
rsi        15
bb_high    20
bb_low     20
NATR       15
ATR        15
ppo        26
MACD       34
sector      1
year        1
month       1
weekday     1
dtype: int64

In [63]:
# Show every column (prices, indicators, sector, calendar fields) for one symbol.
prices.xs(key='ALUA.BA', level='symbol')

Unnamed: 0_level_0,open,close,high,low,volume,rsi,bb_high,bb_low,NATR,ATR,ppo,MACD,sector,year,month,weekday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2024-05-20,937.0,969.0,972.0,925.0,1080.973,,,,,,,,0,2024,5,0
2024-05-21,972.0,1027.5,1031.0,972.0,1878.298,,,,,,,,0,2024,5,1
2024-05-22,1035.0,1051.0,1053.0,1018.0,998.396,,,,,,,,0,2024,5,2
2024-05-23,1055.0,1026.5,1055.0,1000.0,871.274,,,,,,,,0,2024,5,3
2024-05-24,1002.5,1001.5,1015.0,964.5,2.299,,,,,,,,0,2024,5,4
2024-05-27,990.0,1029.0,1034.0,953.0,953.186,,,,,,,,0,2024,5,0
2024-05-28,1022.0,1031.0,1035.0,991.5,939.374,,,,,,,,0,2024,5,1
2024-05-29,1034.0,998.5,1035.0,995.5,583.097,,,,,,,,0,2024,5,2
2024-05-30,1002.0,1011.5,1021.5,997.0,1199.265,,,,,,,,0,2024,5,3
2024-05-31,1009.0,1011.5,1020.0,992.0,1174.637,,,,,,,,0,2024,5,4
