<a href="https://colab.research.google.com/github/rvargas42/Value_Growth_Challenge_ETSFactory/blob/main/data/extract_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description
This notebook pulls data from several sources, filters it, and writes the files used for model training.

In [87]:
import requests as req
import pandas as pd
import os
import dotenv

Load directories, environment file, and API keys

In [88]:
# Resolve the working directories relative to where the notebook is run from:
# the kernel's current directory is treated as the data folder and its parent
# as the project root.
data_dir = os.getcwd()
root_dir = os.path.dirname(data_dir)
dotenv_file = os.path.join(root_dir, '.env')

# Load the project-root .env explicitly when it exists; otherwise fall back to
# python-dotenv's default search so a .env placed elsewhere is still picked up.
if os.path.isfile(dotenv_file):
    dotenv.load_dotenv(dotenv_file)
else:
    dotenv.load_dotenv()

# API keys are read from the environment; os.getenv yields None for unset keys.
FMP_KEY = os.getenv("FMP_KEY")
TE_KEY = os.getenv("TE_KEY")
FRED_KEY = os.getenv("FRED_KEY")

In [89]:
# Directory listing of the data folder.
all_data = os.listdir(data_dir)

# Read the ETF universe from etf_data/fmp_data_etf_list.csv.
# index_col=False tells pandas not to promote any column to the index.
etf_list = pd.read_csv(
    os.path.join(data_dir, "etf_data", "fmp_data_etf_list.csv"),
    index_col=False,
)
etf_list

Unnamed: 0,symbol,name,price,exchange,exchangeShortName,type
0,ULTR,IQ Ultra Short Duration ETF,47.96000,New York Stock Exchange Arca,AMEX,etf
1,RXL,ProShares Ultra Health Care,102.61300,New York Stock Exchange Arca,AMEX,etf
2,FMNY,First Trust New York High Income Municipal ETF,26.66290,New York Stock Exchange Arca,AMEX,etf
3,UIMP.DE,UBS (Lux) Fund Solutions – MSCI USA Socially R...,196.32000,Frankfurt Stock Exchange,XETRA,etf
4,DANC.TO,Desjardins Alt Long/Short Equity Market Neutra...,22.26000,Toronto Stock Exchange,TSX,etf
...,...,...,...,...,...,...
11965,TQSM.TO,TD Q U.S. Small-Mid-Cap Equity ETF,22.43000,Toronto Stock Exchange,TSX,etf
11966,SSIL.L,WisdomTree Silver 1x Daily Short,6.45625,London Stock Exchange,LSE,etf
11967,GURU,Global X Guru Index ETF,42.95410,New York Stock Exchange Arca,AMEX,etf
11968,HMJI.TO,BetaPro Marijuana Companies Inverse ETF,23.61000,Toronto Stock Exchange,TSX,etf


#### Data Reading and Cleaning

In [90]:
# Keep only ETFs whose exchange short name contains "AMEX". na=False makes rows
# with a missing exchange count as non-matches instead of putting NaN in the mask.
etfs_us = etf_list[etf_list["exchangeShortName"].str.contains("AMEX", na=False)]

# NOTE(review): dropna() returns a new frame that is only displayed here; it is
# not assigned, so etfs_us itself still contains any rows with missing values.
etfs_us.dropna()


Unnamed: 0,symbol,name,price,exchange,exchangeShortName,type
0,ULTR,IQ Ultra Short Duration ETF,47.9600,New York Stock Exchange Arca,AMEX,etf
1,RXL,ProShares Ultra Health Care,102.6130,New York Stock Exchange Arca,AMEX,etf
2,FMNY,First Trust New York High Income Municipal ETF,26.6629,New York Stock Exchange Arca,AMEX,etf
6,IDRV,iShares Self-Driving EV and Tech ETF,30.3700,New York Stock Exchange Arca,AMEX,etf
7,SPYB,SPDR S&P 500 Buyback ETF,58.9059,New York Stock Exchange Arca,AMEX,etf
...,...,...,...,...,...,...
11951,NACP,Impact Shares NAACP Minority Empowerment ETF,38.9000,New York Stock Exchange Arca,AMEX,etf
11957,BECO,BlackRock Future Climate and Sustainable Econo...,21.6881,New York Stock Exchange Arca,AMEX,etf
11958,XC,WisdomTree Emerging Markets ex-China Fund,31.0599,New York Stock Exchange Arca,AMEX,etf
11962,IVOL,Quadratic Interest Rate Volatility and Inflati...,18.3200,New York Stock Exchange Arca,AMEX,etf


In [91]:
# Split the US ETFs by style with a case-sensitive match on the fund name;
# na=False treats missing names as non-matches.
etfs_us_growth = etfs_us[etfs_us["name"].str.contains("Growth", na=False)]
etfs_us_value = etfs_us[etfs_us["name"].str.contains("Value", na=False)]

# One empty record per ticker symbol, grouped by style. Each record gets its
# own fresh lists, so records never share mutable state.
etfs_growth_value = {
    style: {
        symbol: {"description": "", "price_history": [], "sector_exposure": []}
        for symbol in frame.symbol
    }
    for style, frame in (("Growth", etfs_us_growth), ("Value", etfs_us_value))
}

# Single quotes inside the f-strings: reusing the outer quote character inside
# a replacement field is a SyntaxError before Python 3.12.
print(f"-> {len(etfs_growth_value['Growth'])} US Growth ETFs")
print(f"-> {len(etfs_growth_value['Value'])} US Value ETFs")
etfs_us_value

-> 94 US Growth ETFs
-> 102 US Value ETFs


Unnamed: 0,symbol,name,price,exchange,exchangeShortName,type
24,RZV,Invesco S&P SmallCap 600 Pure Value ETF,102.2810,New York Stock Exchange Arca,AMEX,etf
57,IWN,iShares Russell 2000 Value ETF,151.8200,New York Stock Exchange Arca,AMEX,etf
60,IVLU,iShares Edge MSCI Intl Value Factor ETF,28.7600,New York Stock Exchange Arca,AMEX,etf
83,JVAL,JPMorgan U.S. Value Factor ETF,40.5600,New York Stock Exchange Arca,AMEX,etf
140,JPSV,Jpmorgan Active Small Cap Value ETF,54.0500,New York Stock Exchange Arca,AMEX,etf
...,...,...,...,...,...,...
11363,IVE,iShares S&P 500 Value ETF,183.3000,New York Stock Exchange Arca,AMEX,etf
11397,GVLU,Gotham 1000 Value ETF,23.1700,New York Stock Exchange Arca,AMEX,etf
11497,IHYV,Invesco Corporate Income Value ETF,24.8350,New York Stock Exchange Arca,AMEX,etf
11591,VOOV,Vanguard S&P 500 Value Index Fund,177.1700,New York Stock Exchange Arca,AMEX,etf


#### Populate ETF Dictionary with relevant data

In [92]:
# Main features we will request from the APIs: the field names of an ETF record.
# The first Growth record is used instead of a hard-coded ticker so this cell
# does not raise KeyError when a specific symbol is missing from the list; an
# empty Growth group yields an empty feature list.
main_features = list(next(iter(etfs_growth_value["Growth"].values()), {}))
main_features

['description', 'price_history', 'sector_exposure']

In [98]:
import yfinance as yf
import random, time

# Fill every ETF record (Growth first, then Value) with its Yahoo Finance
# description and full monthly price history. One Ticker object per symbol is
# reused for both requests.
for style_records in etfs_growth_value.values():
    for symbol, record in style_records.items():
        ticker = yf.Ticker(symbol)
        history = ticker.history(period="max", interval="1mo")
        description = ticker.info
        # Not every ticker exposes a business summary; fall back to an empty string.
        record["description"] = description.get("longBusinessSummary", "")
        record["price_history"] = history.to_dict()

In [99]:
import json
import pickle

# Persist the ETF dictionary to etf_data/us_etfs_list.pkl. The with-block
# closes the file even if pickling raises.
with open(os.path.join(data_dir, "etf_data", "us_etfs_list.pkl"), 'wb') as afile:
    pickle.dump(etfs_growth_value, afile)

#### OTHER ASSETS

In [100]:
currency_tickers = ["EURUSD=X"]
futures = ["GC=F", "CL=F", "HO=F", "ZB=F", "ZC=F"]

# Download 10 years of monthly history per ticker and write one CSV per symbol.
# Each entry is (output folder, tickers, ticker suffix, replacement used in the
# file name): "EURUSD=X" -> currencies/EURUSD.csv, "GC=F" -> commodities/GC_F.csv.
for folder, tickers, suffix, replacement in (
    ("currencies", currency_tickers, "=X", ""),
    ("commodities", futures, "=F", "_F"),
):
    out_dir = os.path.join(data_dir, folder)
    # Create the output folder if it is missing so to_csv does not fail.
    os.makedirs(out_dir, exist_ok=True)
    for symbol in tickers:
        # Plain concatenation keeps the file name valid on Python < 3.12, where
        # reusing the outer quote inside an f-string is a SyntaxError.
        file_name = symbol.replace(suffix, replacement) + ".csv"
        yf.Ticker(symbol).history(period='10y', interval="1mo").to_csv(
            os.path.join(out_dir, file_name)
        )

### Macro Data

In [96]:
from datetime import datetime
from fredapi import Fred

# Fetch each FRED series by ID and keep the results in a dict keyed by series ID.
fred = Fred(api_key=FRED_KEY)
macro_series = ["FEDFUNDS", "IEABC"]
macro_data = {}
for series in macro_series:
    macro_data[series] = fred.get_series(series)

# Persist the series to macro_data/macro_data.pkl. The folder is created if it
# is missing, and the with-block guarantees the file is flushed and closed.
macro_dir = os.path.join(data_dir, "macro_data")
os.makedirs(macro_dir, exist_ok=True)
with open(os.path.join(macro_dir, "macro_data.pkl"), 'wb') as afile:
    pickle.dump(macro_data, afile)

In [97]:
# Display the fetched macro series (a dict keyed by FRED series ID).
macro_data

{'FEDFUNDS': 1954-07-01    0.80
 1954-08-01    1.22
 1954-09-01    1.07
 1954-10-01    0.85
 1954-11-01    0.83
               ... 
 2024-01-01    5.33
 2024-02-01    5.33
 2024-03-01    5.33
 2024-04-01    5.33
 2024-05-01    5.33
 Length: 839, dtype: float64,
 'IEABC': 1999-01-01    -60838.0
 1999-04-01    -67957.0
 1999-07-01    -76129.0
 1999-10-01    -81685.0
 2000-01-01    -94772.0
                 ...   
 2022-10-01   -216154.0
 2023-01-01   -212655.0
 2023-04-01   -214980.0
 2023-07-01   -196378.0
 2023-10-01   -194810.0
 Length: 100, dtype: float64}