### We demonstrate how the data collection pipeline works in a few different scenarios

#### Case 1: first time running the pipeline

In [1]:
from data_collection.stock import StockMetadataManager, StockDataCollector
from data_collection.config import constants as Config

import pandas as pd
import logging

# Configure the root logger at INFO level so the pipeline's progress messages
# appear in the cell outputs below. With no `stream`/`handlers` argument,
# basicConfig attaches a StreamHandler that writes to stderr (not stdout).
logging.basicConfig(level=logging.INFO)

# Build the pipeline objects once at module level; trigger_pipeline() below
# reuses this shared data_collector on every call.
metadata_manager = StockMetadataManager(config=Config)
data_collector = StockDataCollector(metadata_manager=metadata_manager)

def trigger_pipeline():
    """Run one ingestion pass and persist its result, reporting the outcome.

    Uses the module-level ``data_collector``. No exception escapes this
    function: every failure is logged and turned into a status message.

    Returns:
        dict: a single ``"message"`` key describing what happened -- success,
        an ingestion that returned nothing, an ingestion error, or a save error.
    """
    try:
        frame, meta = data_collector.run_ingestion_pipeline_localy()

        # Guard clause: the pipeline ran but produced nothing to save.
        if frame is None or meta is None:
            logging.error("Data ingestion completed, but no data was returned.")
            return {"message": "Data ingestion completed, but no data was returned."}

        logging.info("Data ingestion completed successfully.")

        # Saving is guarded separately so a save failure is reported as such
        # rather than as an ingestion failure.
        try:
            data_collector.save_updates(data=frame, metadata=meta)
            logging.info("Data saved successfully.")
        except Exception as save_exception:
            logging.exception("Error occurred during data save.")
            return {"message": f"An error occurred during data save: {save_exception}"}

        return {"message": "Data ingestion and save completed successfully."}
    except Exception as e:
        logging.exception("Error occurred during data ingestion.")
        return {"message": f"An error occurred during data ingestion: {e}"}

In [2]:
Config

{'remote': {'bucket': 'stock_data_lake',
  'ticker_file': 'stock_data_lake/tickers.csv',
  'data_folder': 'stock_data_lake/data',
  'metadata_file': 'stock_data_lake/data/metadata.csv',
  'data_file': 'stock_data_lake/data/stock_history.csv'}}

The 'bucket' contains no metadata or data file yet, so every ticker listed in the ticker file will be ingested from scratch.

In [3]:
tickers = pd.DataFrame({'Ticker': ['AAPL', 'MSFT']})
tickers.to_csv(Config['remote']['ticker_file'], index = False)

tickers

Unnamed: 0,Ticker
0,AAPL
1,MSFT


In [4]:
trigger_pipeline()

INFO:root:Ticker AAPL initialized
INFO:root:Ticker MSFT initialized


All tickers will be ingested. No metadata csv file found stock_data_lake/data/metadata.csv.


INFO:root:Ticker MSFT history fetched
INFO:root:Ticker AAPL history fetched
INFO:root:Data ingestion completed successfully.
INFO:root:No existing data or metadata found. Writing new data and metadata
INFO:root:Data saved successfully.


First time ingestion: No data csv file found stock_data_lake/data/stock_history.csv.
All tickers will be ingested. No metadata csv file found stock_data_lake/data/metadata.csv.


{'message': 'Data ingestion and save completed successfully.'}

In [5]:
data = pd.read_csv(Config['remote']['data_file'])
metadata = pd.read_csv(Config['remote']['metadata_file'])

In [6]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
0,1980-12-12,0.099319,0.099750,0.099319,0.099319,469033600,0.0,0.0,AAPL
1,1980-12-15,0.094569,0.094569,0.094137,0.094137,175884800,0.0,0.0,AAPL
2,1980-12-16,0.087659,0.087659,0.087228,0.087228,105728000,0.0,0.0,AAPL
3,1980-12-17,0.089387,0.089818,0.089387,0.089387,86441600,0.0,0.0,AAPL
4,1980-12-18,0.091978,0.092410,0.091978,0.091978,73449600,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...,...
20357,2023-12-11,368.480011,371.600006,366.100006,371.299988,27708800,0.0,0.0,MSFT
20358,2023-12-12,370.850006,374.420013,370.459991,374.380005,24838300,0.0,0.0,MSFT
20359,2023-12-13,376.019989,377.640015,370.769989,374.369995,30955500,0.0,0.0,MSFT
20360,2023-12-14,373.309998,373.760010,364.130005,365.929993,43277500,0.0,0.0,MSFT


In [7]:
metadata

Unnamed: 0,ticker,ingestion_date,first_day,last_day,timespan,final_df_length,input_null_values,processed_null_values
0,AAPL,2023-12-17,1980-12-12,2023-12-15,15709,10844,0,0
1,MSFT,2023-12-17,1986-03-13,2023-12-15,13792,9518,0,0


#### Case 1.1: First time running the pipeline with an invalid ticker symbol

In [8]:
tickers = pd.DataFrame({'Ticker': ['AAPL', 'MSFT', 'FB']})
tickers.to_csv(Config['remote']['ticker_file'], index = False)

In [9]:
trigger_pipeline()

INFO:root:Ticker AAPL initialized
INFO:root:Ticker MSFT initialized
INFO:root:Ticker FB initialized


All tickers will be ingested. No metadata csv file found stock_data_lake/data/metadata.csv.


ERROR:yfinance:FB: No timezone found, symbol may be delisted
ERROR:root:Error loading history of FB. Retrieved data is not valid
INFO:root:Ticker MSFT history fetched
INFO:root:Ticker AAPL history fetched
INFO:root:Data ingestion completed successfully.
INFO:root:No existing data or metadata found. Writing new data and metadata
INFO:root:Data saved successfully.


First time ingestion: No data csv file found stock_data_lake/data/stock_history.csv.
All tickers will be ingested. No metadata csv file found stock_data_lake/data/metadata.csv.


{'message': 'Data ingestion and save completed successfully.'}

In [10]:
data = pd.read_csv(Config['remote']['data_file'])
metadata = pd.read_csv(Config['remote']['metadata_file'])

In [11]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
0,1980-12-12,0.099319,0.099750,0.099319,0.099319,469033600,0.0,0.0,AAPL
1,1980-12-15,0.094569,0.094569,0.094137,0.094137,175884800,0.0,0.0,AAPL
2,1980-12-16,0.087659,0.087659,0.087228,0.087228,105728000,0.0,0.0,AAPL
3,1980-12-17,0.089387,0.089818,0.089387,0.089387,86441600,0.0,0.0,AAPL
4,1980-12-18,0.091978,0.092410,0.091978,0.091978,73449600,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...,...
20357,2023-12-11,368.480011,371.600006,366.100006,371.299988,27708800,0.0,0.0,MSFT
20358,2023-12-12,370.850006,374.420013,370.459991,374.380005,24838300,0.0,0.0,MSFT
20359,2023-12-13,376.019989,377.640015,370.769989,374.369995,30955500,0.0,0.0,MSFT
20360,2023-12-14,373.309998,373.760010,364.130005,365.929993,43277500,0.0,0.0,MSFT


In [12]:
metadata

Unnamed: 0,ticker,ingestion_date,first_day,last_day,timespan,final_df_length,input_null_values,processed_null_values
0,AAPL,2023-12-17,1980-12-12,2023-12-15,15709,10844,0,0
1,MSFT,2023-12-17,1986-03-13,2023-12-15,13792,9518,0,0


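#### Case 2: running the pipeline when an update is not needed

The ticker file still contains the invalid `FB` symbol from Case 1.1, so its fetch error is logged again below.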

In [13]:
trigger_pipeline()

INFO:root:Ticker AAPL initialized
INFO:root:Ticker MSFT initialized
INFO:root:Ticker FB initialized
ERROR:yfinance:FB: No timezone found, symbol may be delisted
ERROR:root:Error loading history of FB. Retrieved data is not valid
INFO:root:Ticker MSFT history fetched


INFO:root:Ticker AAPL history fetched
INFO:root:Data ingestion completed successfully.
INFO:root:Data saved successfully.


{'message': 'Data ingestion and save completed successfully.'}

In [14]:
data = pd.read_csv(Config['remote']['data_file'])
metadata = pd.read_csv(Config['remote']['metadata_file'])

In [15]:
data[data['Date'] == '2023-11-24']

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
10828,2023-11-24,190.869995,190.899994,189.25,189.970001,24048300,0.0,0.0,AAPL
20346,2023-11-24,377.329987,377.970001,375.140015,377.429993,10176600,0.0,0.0,MSFT


In [16]:
metadata

Unnamed: 0,ticker,ingestion_date,first_day,last_day,timespan,final_df_length,input_null_values,processed_null_values
0,AAPL,2023-12-17,1980-12-12,2023-12-15,15709,10844,0,0
1,MSFT,2023-12-17,1986-03-13,2023-12-15,13792,9518,0,0
2,AAPL,2023-12-17,2023-12-15,2023-12-15,1,1,0,0
3,MSFT,2023-12-17,2023-12-15,2023-12-15,1,1,0,0


#### Case 2.1: adding new ticker of interest in future iterations of the pipeline

In [17]:
tickers = pd.DataFrame({'Ticker': ['AAPL', 'MSFT', 'NVDA']})
tickers.to_csv(Config['remote']['ticker_file'], index = False)

tickers

Unnamed: 0,Ticker
0,AAPL
1,MSFT
2,NVDA


In [18]:
trigger_pipeline()

INFO:root:Ticker AAPL initialized
INFO:root:Ticker MSFT initialized
INFO:root:Ticker NVDA initialized
INFO:root:Ticker MSFT history fetched
INFO:root:Ticker AAPL history fetched


INFO:root:Ticker NVDA history fetched
INFO:root:Data ingestion completed successfully.
INFO:root:Data saved successfully.


{'message': 'Data ingestion and save completed successfully.'}

In [19]:
data = pd.read_csv(Config['remote']['data_file'])
metadata = pd.read_csv(Config['remote']['metadata_file'])

In [20]:
data[data['Date'] == '2023-11-24']

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
10828,2023-11-24,190.869995,190.899994,189.25,189.970001,24048300,0.0,0.0,AAPL
20346,2023-11-24,377.329987,377.970001,375.140015,377.429993,10176600,0.0,0.0,MSFT
26616,2023-11-24,484.65741,489.166993,477.408047,477.718018,29464500,0.0,0.0,NVDA


In [21]:
metadata

Unnamed: 0,ticker,ingestion_date,first_day,last_day,timespan,final_df_length,input_null_values,processed_null_values
0,AAPL,2023-12-17,1980-12-12,2023-12-15,15709,10844,0,0
1,MSFT,2023-12-17,1986-03-13,2023-12-15,13792,9518,0,0
2,AAPL,2023-12-17,2023-12-15,2023-12-15,1,1,0,0
3,MSFT,2023-12-17,2023-12-15,2023-12-15,1,1,0,0
4,NVDA,2023-12-17,1999-01-22,2023-12-15,9094,6267,0,0


#### Case 2.2: updating the data

In [22]:
# Build a deliberately outdated data file and matching metadata so that the
# next pipeline run has something to update.
data = pd.read_csv(Config['remote']['data_file']).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Keep only the rows dated strictly between 2023-11-01 and 2023-11-10.
data = data.loc[
    (data['Date'] > pd.to_datetime('2023-11-01'))
    & (data['Date'] < pd.to_datetime('2023-11-10'))
]

# One metadata row per ticker: the latest retained date stands in for the
# ingestion date / last day, and the remaining fields are placeholder values.
metadata = (
    data.groupby('Ticker')
    .max()
    .reset_index()
    .rename(columns={'Ticker': 'ticker', 'Date': 'ingestion_date'})
    .drop(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], axis=1)
    .assign(
        first_day=lambda frame: frame['ingestion_date'] - pd.DateOffset(days=10),
        last_day=lambda frame: frame['ingestion_date'],
        timespan=10,
        final_df_length=10,
        input_null_values=0,
        processed_null_values=0,
    )
)

metadata

Unnamed: 0,ticker,ingestion_date,first_day,last_day,timespan,final_df_length,input_null_values,processed_null_values
0,AAPL,2023-11-09,2023-10-30,2023-11-09,10,10,0,0
1,MSFT,2023-11-09,2023-10-30,2023-11-09,10,10,0,0
2,NVDA,2023-11-09,2023-10-30,2023-11-09,10,10,0,0


In [23]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
10813,2023-11-02,175.289074,177.546095,175.229156,177.33638,77334800,0.0,0.0,AAPL
10814,2023-11-03,174.010754,176.587362,173.121926,176.417572,79763700,0.0,0.0,AAPL
10815,2023-11-06,176.147945,179.19392,175.978171,178.994186,63841300,0.0,0.0,AAPL
10816,2023-11-07,178.944239,182.199959,178.734524,181.58078,70530000,0.0,0.0,AAPL
10817,2023-11-08,182.110085,183.208629,181.351076,182.649368,49340300,0.0,0.0,AAPL
10818,2023-11-09,182.719278,183.87774,181.570782,182.169998,53763500,0.0,0.0,AAPL
20331,2023-11-02,346.536642,348.123418,344.071644,347.614471,24348100,0.0,0.0,MSFT
20332,2023-11-03,348.921826,353.672194,346.626467,352.085388,23624000,0.0,0.0,MSFT
20333,2023-11-06,352.734083,356.815795,352.634279,355.807831,23828300,0.0,0.0,MSFT
20334,2023-11-07,358.672028,361.725827,356.905624,359.799744,25833900,0.0,0.0,MSFT


In [24]:
metadata.to_csv(Config['remote']['metadata_file'], index = False)
data.to_csv(Config['remote']['data_file'], index = False)

In [25]:
trigger_pipeline()

INFO:root:Ticker AAPL initialized
INFO:root:Ticker MSFT initialized
INFO:root:Ticker NVDA initialized


INFO:root:Ticker NVDA history fetched
INFO:root:Ticker MSFT history fetched
INFO:root:Ticker AAPL history fetched
INFO:root:Data ingestion completed successfully.
INFO:root:Data saved successfully.


{'message': 'Data ingestion and save completed successfully.'}

In [26]:
data = pd.read_csv(Config['remote']['data_file'])
metadata = pd.read_csv(Config['remote']['metadata_file'])

In [27]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker
0,2023-11-02,175.289074,177.546095,175.229156,177.336380,77334800,0.0,0.0,AAPL
1,2023-11-03,174.010754,176.587362,173.121926,176.417572,79763700,0.0,0.0,AAPL
2,2023-11-06,176.147945,179.193920,175.978171,178.994186,63841300,0.0,0.0,AAPL
3,2023-11-07,178.944239,182.199959,178.734524,181.580780,70530000,0.0,0.0,AAPL
4,2023-11-08,182.110085,183.208629,181.351076,182.649368,49340300,0.0,0.0,AAPL
...,...,...,...,...,...,...,...,...,...
88,2023-12-11,474.910004,475.309998,458.299988,466.269989,50972800,0.0,0.0,NVDA
89,2023-12-12,460.459991,476.660004,460.459991,476.570007,37238700,0.0,0.0,NVDA
90,2023-12-13,476.290009,485.940002,476.079987,480.880005,44779200,0.0,0.0,NVDA
91,2023-12-14,483.899994,486.700012,474.220001,483.500000,39123200,0.0,0.0,NVDA


In [28]:
metadata

Unnamed: 0,ticker,ingestion_date,first_day,last_day,timespan,final_df_length,input_null_values,processed_null_values
0,AAPL,2023-11-09,2023-10-30,2023-11-09,10,10,0,0
1,MSFT,2023-11-09,2023-10-30,2023-11-09,10,10,0,0
2,NVDA,2023-11-09,2023-10-30,2023-11-09,10,10,0,0
3,AAPL,2023-12-17,2023-11-10,2023-12-15,36,25,0,0
4,MSFT,2023-12-17,2023-11-10,2023-12-15,36,25,0,0
5,NVDA,2023-12-17,2023-11-10,2023-12-15,36,25,0,0
