Reproducible steps to create a corpus from [FinnHub](https://finnhub.io/).

# Pseudocode

Below is a list of the steps we take.
Keep in mind that these steps are a 10,000-foot view.
The implementation below is commented in more detail.

1. Get the tickers from the [SEC](https://www.sec.gov/file/company-tickers)
2. For every ticker retrieved, download the available congressional trading data from FinnHub and save it as JSON.
3. Convert the JSON files to CSV and merge them into a single corpus file.

In [None]:
import requests
import pandas as pd
import json
import csv
import os
import finnhub
import time

from finnhub.exceptions import FinnhubAPIException
from pathlib import Path
from tqdm.notebook import tqdm
from dotenv import load_dotenv, dotenv_values 

In [None]:
# Where the ticker list is downloaded from, and the User-Agent header sent
# with that request.
# NOTE(review): SEC may expect a User-Agent that identifies the requester
# (including a contact address) - confirm this value is accepted.
tickers_url = 'https://www.sec.gov/files/company_tickers.json'
user_agent = 'FinnHub-Data-Ingestion'
# NOTE(review): `limit` is not referenced by the cells visible here - confirm
# whether it is still needed.
limit = 20

# On-disk layout; every path hangs off `data_folder`:
#   data/tickers.csv        cached ticker list
#   data/raw/json_folder/   one JSON file per ticker, as returned by the API
#   data/raw/csv_folder/    the same data converted to CSV
#   data/corpus/            the merged corpus
data_folder = Path('./data/')
raw_folder = data_folder / './raw/'
raw_json_folder = raw_folder / './json_folder/'
raw_csv_folder = raw_folder / './csv_folder/'
corpus_folder = data_folder / './corpus/'
tickers_file = data_folder / './tickers.csv'

# Step 1

1. Get the list of tickers from the SEC
2. Convert the tickers into an array, then sort it.
3. Save the tickers to a CSV

In [None]:
def get_tickers(tickers_file: Path, tickers_url: str, user_agent: str, timeout: float = 30.0) -> pd.DataFrame:
    """Return the ticker table, downloading and caching it on first use.

    If ``tickers_file`` does not exist yet, the JSON document at
    ``tickers_url`` is fetched, its entries are sorted by ticker symbol and
    written to ``tickers_file`` as a CSV with the columns ``CIK``, ``Ticker``
    and ``Name``. In every case the table is then read back from the CSV, so
    the first run and later (cached) runs return the same data.

    Args:
        tickers_file: Location of the cached CSV file.
        tickers_url: URL of the JSON document listing the tickers.
        user_agent: Value sent in the ``User-Agent`` request header.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with the columns ``CIK``, ``Ticker`` and ``Name``.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
    """
    if not tickers_file.exists():
        with requests.Session() as session:
            session.headers['User-Agent'] = user_agent
            # Without a timeout a stalled connection would block forever.
            with session.get(tickers_url, timeout=timeout) as result:
                if result.status_code != 200:
                    raise RuntimeError(
                        f'Error retrieving tickers (HTTP {result.status_code})'
                    )
                payload = json.loads(result.text)

        # The payload maps arbitrary keys to entries; only the entries matter.
        entries = sorted(payload.values(), key=lambda entry: entry['ticker'])
        rows = [(entry['cik_str'], entry['ticker'], entry['title']) for entry in entries]
        df = pd.DataFrame(rows, columns=['CIK', 'Ticker', 'Name'])

        tickers_file.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(tickers_file, index=False)

    # keep_default_na=False: pandas would otherwise read symbols such as
    # "NA" or "NULL" as missing values (NaN) instead of as ticker strings.
    return pd.read_csv(tickers_file, keep_default_na=False)  # type: ignore

# Load the ticker table (fetched on the first run, read from the cached CSV
# afterwards).
tickers_df = get_tickers(
    tickers_file=tickers_file,
    tickers_url=tickers_url,
    user_agent=user_agent,
)

# Step 2

1. Iterate through each ticker in the `tickers.csv` file and download the congressional trading data FinnHub has for it between `start_date` and `end_date`.
2. Save data of each ticker to JSON files.

*Initialize API variables*

In [None]:
# Make the variables from a local .env file available via the environment.
load_dotenv()

# NOTE(review): `base_url` is not referenced by the cells visible here -
# confirm whether it is still needed.
base_url = '/api/v1'

# Date window passed to every API request.
start_date = '2000-01-01'
end_date = '2023-12-31'

# Symbols to query, taken from the ticker table.
tickers = tickers_df['Ticker']

# The key is read from the API_KEY environment variable; if it is unset the
# client is created with api_key=None.
finnhub_client = finnhub.Client(api_key=os.environ.get("API_KEY"))

*Download the trading data in JSON format from the API and save it to the `data/raw/json_folder` folder as `{ticker}.json`*

In [None]:
# Make sure the target folder for the raw API responses exists.
raw_json_folder.mkdir(parents=True, exist_ok=True)

for ticker in tqdm(tickers):
    try:
        # Ask the API for this ticker's congressional trades in the date window
        response = finnhub_client.congressional_trading(ticker, start_date, end_date)
    except FinnhubAPIException as e:
        # An API error only costs us this ticker; the run carries on.
        print(f'Skipping {ticker}: {e}')
    else:
        if not ('data' in response and response['data']):
            print(f'No trade data available for {ticker}. Skipping...')
        else:
            # Keep the complete response, not just its 'data' entry.
            filename = raw_json_folder / f'{ticker}.json'
            with filename.open('w') as jsonfile:
                json.dump(response, jsonfile)
                print(f'Response for {ticker} saved to {filename}')

    # Pause after every ticker (success or not) to throttle the API calls.
    # NOTE(review): 1/6 s allows about six calls per second - confirm this
    # matches the rate limit of the API plan in use.
    time.sleep(1/6)

print("Data download completed.")

# Step 3
*Convert the JSON files to CSV and merge all CSV files*

In [None]:
# Make sure both output folders exist.
corpus_folder.mkdir(parents=True, exist_ok=True)
raw_csv_folder.mkdir(parents=True, exist_ok=True)

# glob() yields files in no particular order; sorting keeps the row order of
# the merged corpus identical from run to run.
json_files = sorted(raw_json_folder.glob('*.json'))

# Per-file frames are collected here and merged once after the loop.
# Concatenating inside the loop would re-copy all accumulated rows on every
# iteration.
frames = []

for json_file in tqdm(json_files):
    try:
        # Load JSON data from file
        with open(json_file, 'r') as f:
            json_data = json.load(f)

        # The records sit under the 'data' key of the saved response
        data = pd.DataFrame(json_data['data'])

        # Remember the frame for the merged corpus
        frames.append(data)

        # Write the per-ticker CSV next to its siblings, named after the JSON file
        csv_file = raw_csv_folder.joinpath(json_file.stem + '.csv')
        data.to_csv(csv_file, index=False)

    except Exception as e:
        # Best effort: a broken file is reported and skipped, not fatal.
        print(f"Error processing {json_file}: {e}")

# pd.concat() rejects an empty list, so fall back to an empty frame when no
# file could be converted (an empty corpus file is still written).
concatenated_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Write concatenated data to a single CSV file
concatenated_csv_file = corpus_folder.joinpath('all_available_transactions.csv')
concatenated_data.to_csv(concatenated_csv_file, index=False)

print("All JSON files converted to CSV and concatenated into one big CSV file.")