### We demonstrate how the data collection pipeline works in a few different scenarios

#### Case 1: first time running the pipeline

In [1]:
from data_collection.stock import StockMetadataManager, StockDataCollector
from data_collection.config import constants as Config

import pandas as pd
import logging

# Configure the root logger at INFO level. With no `stream`/`filename` argument,
# basicConfig attaches a handler that writes to stderr (not stdout); these are
# the `INFO:root:...` lines shown in the cell outputs below.
logging.basicConfig(level=logging.INFO)

# Build the pipeline objects once at notebook level so that `trigger_pipeline`
# (defined below) can reuse the same `data_collector` on every call.
metadata_manager = StockMetadataManager(config=Config)
data_collector = StockDataCollector(metadata_manager=metadata_manager)

def trigger_pipeline():
    """Run the local ingestion pipeline and persist whatever it returns.

    Uses the notebook-level ``data_collector``. Exceptions are never propagated
    to the caller: they are logged with a traceback and reported in the result.

    Returns:
        dict: A single-key mapping ``{"message": <status text>}`` describing
        one of four outcomes: ingestion and save succeeded, ingestion returned
        nothing, the save step failed, or the ingestion step failed.
    """
    try:
        ingested, ingested_meta = data_collector.run_ingestion_pipeline_localy()

        # Guard clause: either half missing means there is nothing to save.
        if ingested is None or ingested_meta is None:
            logging.error("Data ingestion completed, but no data was returned.")
            return {"message": "Data ingestion completed, but no data was returned."}

        logging.info("Data ingestion completed successfully.")

        # A failing save is reported separately from a failing ingestion.
        try:
            data_collector.save_updates(data=ingested, metadata=ingested_meta)
            logging.info("Data saved successfully.")
            return {"message": "Data ingestion and save completed successfully."}
        except Exception as save_error:
            logging.exception("Error occurred during data save.")
            return {"message": f"An error occurred during data save: {save_error}"}
    except Exception as ingestion_error:
        logging.exception("Error occurred during data ingestion.")
        return {"message": f"An error occurred during data ingestion: {ingestion_error}"}

In [2]:
# Display the configuration object that was passed to StockMetadataManager.
Config

{'remote': {'bucket': 'stock_data_lake',
  'ticker_file': 'stock_data_lake/tickers.csv',
  'data_folder': 'stock_data_lake/data',
  'metadata_file': 'stock_data_lake/data/metadata.csv',
  'data_file': 'stock_data_lake/data/stock_history.csv'}}

There is no metadata file or data file in the 'bucket' yet, so every ticker listed in the ticker file will be ingested.

In [3]:
# Write a two-symbol ticker list (single 'Ticker' column, no index column)
# to the configured ticker file, then display the frame.
tickers = pd.DataFrame(['AAPL', 'MSFT'], columns=['Ticker'])
tickers.to_csv(Config['remote']['ticker_file'], index=False)

tickers

Unnamed: 0,Ticker
0,AAPL
1,MSFT


In [4]:
# Run the pipeline; the returned status dict is displayed as the cell output.
trigger_pipeline()

All tickers will be ingested. No metadata csv file found stock_data_lake/data/metadata.csv.


INFO:root:Data ingestion completed successfully.
INFO:root:No existing data or metadata found. Writing new data and metadata


First time ingestion: No data csv file found stock_data_lake/data/stock_history.csv.
All tickers will be ingested. No metadata csv file found stock_data_lake/data/metadata.csv.


INFO:root:Data saved successfully.


{'message': 'Data ingestion and save completed successfully.'}

In [5]:
# Read the CSV files at the configured data and metadata paths back into
# DataFrames so the result of the pipeline run can be inspected.
data = pd.read_csv(Config['remote']['data_file'])
metadata = pd.read_csv(Config['remote']['metadata_file'])

In [6]:
# Display the frame read from the configured data file.
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker
0,1980-12-12,0.099319,0.099750,0.099319,0.099319,469033600,AAPL
1,1980-12-13,0.099319,0.099750,0.099319,0.099319,469033600,AAPL
2,1980-12-14,0.099319,0.099750,0.099319,0.099319,469033600,AAPL
3,1980-12-15,0.094569,0.094569,0.094137,0.094137,175884800,AAPL
4,1980-12-16,0.087659,0.087659,0.087228,0.087228,105728000,AAPL
...,...,...,...,...,...,...,...
29454,2023-11-20,371.220001,378.869995,371.000000,377.440002,52465100,MSFT
29455,2023-11-21,375.670013,376.220001,371.119995,373.070007,28423100,MSFT
29456,2023-11-22,378.000000,379.790009,374.970001,377.850006,23345300,MSFT
29457,2023-11-23,378.000000,379.790009,374.970001,377.850006,23345300,MSFT


In [7]:
# Display the frame read from the configured metadata file.
metadata

Unnamed: 0,ticker,ingestion_date,first_day,last_day,timespan,final_df_length,input_null_values,processed_null_values
0,AAPL,2023-11-26,1980-12-12,2023-11-24,15688,15688,0,0
1,MSFT,2023-11-26,1986-03-13,2023-11-24,13771,13771,0,0
