In [15]:
import os

# Point the Google Cloud client libraries at the service-account key file.
# setdefault keeps credentials that were already configured outside the
# notebook (shell, devcontainer, CI) and only falls back to the workspace
# key file when nothing is set, instead of always overwriting the variable.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/workspaces/financial_data_scraping/jaber-financial-b20f23e23588.json',
)

# Show the effective value so a wrong path is visible before any client is created
print("GOOGLE_APPLICATION_CREDENTIALS:", os.getenv('GOOGLE_APPLICATION_CREDENTIALS'))


GOOGLE_APPLICATION_CREDENTIALS: /workspaces/financial_data_scraping/jaber-financial-b20f23e23588.json


In [16]:
from google.cloud import storage

def test_gcs_access(bucket_name):
    """Print whether the GCS bucket called ``bucket_name`` exists.

    Creates a fresh storage client, so the call also exercises the
    credentials picked up from the environment.
    """
    client = storage.Client()
    found = client.bucket(bucket_name).exists()
    print(f"Bucket '{bucket_name}' exists: {found}")

# Smoke-test storage access against the 'companies_details' bucket
test_gcs_access(bucket_name='companies_details')

Bucket 'companies_details' exists: True


In [17]:
from google.cloud import bigquery, storage

# Shared service clients reused by every load cell below:
# gcs_client lists objects in the source buckets, bq_client runs the load jobs.
gcs_client, bq_client = storage.Client(), bigquery.Client()

### Load Companies

In [18]:
# Parameters for the companies load
# Project and dataset identifiers
project_id, dataset_id = "jaber-financial", "financial_data"
# Source bucket and the object-name prefix scanned for parquet files
bucket_name = "companies_details"
history_dir_prefix = "data/"

def list_parquet_files(bucket_name, prefix):
    """Return the names of the objects under ``prefix`` in ``bucket_name`` that end with '.parquet'.

    Uses the module-level ``gcs_client`` to list the bucket contents.
    """
    parquet_names = []
    for blob in gcs_client.bucket(bucket_name).list_blobs(prefix=prefix):
        if blob.name.endswith('.parquet'):
            parquet_names.append(blob.name)
    return parquet_names

# Load every parquet file found under the configured prefix of the source
# bucket into the single BigQuery table 'companies_details'.
companies_details_files = list_parquet_files(bucket_name, history_dir_prefix)
if not companies_details_files:
    # Fail here with a clear message rather than submitting a load job with no source URIs
    raise FileNotFoundError(
        f"No parquet files found under gs://{bucket_name}/{history_dir_prefix}"
    )

# Client.dataset() is deprecated; build the reference explicitly against the
# client's own project, which is what Client.dataset() resolved to.
companies_table_ref = bigquery.DatasetReference(bq_client.project, dataset_id).table("companies_details")
companies_job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
    autodetect=True,
    # Load jobs append by default, so re-running this cell would duplicate
    # every row. Replace the table contents to keep the cell idempotent.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Start the load job for the companies files
load_job = bq_client.load_table_from_uri(
    [f"gs://{bucket_name}/{file}" for file in companies_details_files],
    companies_table_ref,
    job_config=companies_job_config
)

# Block until the job finishes; result() raises if the load failed
load_job.result()

# Report the table's row count (equal to the rows just loaded, since the table was replaced)
companies_details_table = bq_client.get_table(companies_table_ref)
print(f"Loaded {companies_details_table.num_rows} rows into {dataset_id}:companies_details.")


Loaded 70 rows into financial_data:companies_details.


### Load Historical_data

In [19]:
# Parameters for the historical-data load
# Project and dataset identifiers
project_id, dataset_id = "jaber-financial", "financial_data"
# Source bucket and the object-name prefix scanned for parquet files
bucket_name = "historical_data_details"
history_dir_prefix = "data/history/"

# NOTE: same helper as defined in the earlier load cell; this redefinition is
# redundant when the notebook is run top to bottom.
def list_parquet_files(bucket_name, prefix):
    """Collect the '.parquet' object names found under ``prefix`` in a GCS bucket.

    Listing goes through the module-level ``gcs_client``.
    """
    source_bucket = gcs_client.bucket(bucket_name)
    all_names = (blob.name for blob in source_bucket.list_blobs(prefix=prefix))
    return [name for name in all_names if name.endswith('.parquet')]

# Load every parquet file found under the configured prefix of the source
# bucket into the single BigQuery table 'historical_data_details'.
historical_details_files = list_parquet_files(bucket_name, history_dir_prefix)
if not historical_details_files:
    # Fail here with a clear message rather than submitting a load job with no source URIs
    raise FileNotFoundError(
        f"No parquet files found under gs://{bucket_name}/{history_dir_prefix}"
    )

# Client.dataset() is deprecated; build the reference explicitly against the
# client's own project, which is what Client.dataset() resolved to.
history_table_ref = bigquery.DatasetReference(bq_client.project, dataset_id).table("historical_data_details")
historical_job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
    autodetect=True,
    # Load jobs append by default, so re-running this cell would duplicate
    # every row. Replace the table contents to keep the cell idempotent.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Start the load job for the history files
load_job = bq_client.load_table_from_uri(
    [f"gs://{bucket_name}/{file}" for file in historical_details_files],
    history_table_ref,
    job_config=historical_job_config
)

# Block until the job finishes; result() raises if the load failed
load_job.result()

# Report the table's row count (equal to the rows just loaded, since the table was replaced)
historical_details_table = bq_client.get_table(history_table_ref)
print(f"Loaded {historical_details_table.num_rows} rows into {dataset_id}:historical_data_details.")


Loaded 314049 rows into financial_data:historical_data_details.


### Load Analyst price target

In [20]:
# Parameters for the analyst price target load
# Project and dataset identifiers
project_id, dataset_id = "jaber-financial", "financial_data"
# Source bucket and the object-name prefix scanned for parquet files
bucket_name = "analyst_price_target"
analyst_dir_prefix = "data/"

# NOTE: same helper as defined in the earlier load cells; this redefinition is
# redundant when the notebook is run top to bottom.
def list_parquet_files(bucket_name, prefix):
    """Return object names ending in '.parquet' under ``prefix`` of ``bucket_name``.

    The bucket is listed with the module-level ``gcs_client``.
    """
    listing = gcs_client.bucket(bucket_name).list_blobs(prefix=prefix)
    matches = []
    for entry in listing:
        object_name = entry.name
        if object_name.endswith('.parquet'):
            matches.append(object_name)
    return matches

# Load every parquet file found under the configured prefix of the source
# bucket into the single BigQuery table 'analyst_price_target'.
analyst_details_files = list_parquet_files(bucket_name, analyst_dir_prefix)
if not analyst_details_files:
    # Fail here with a clear message rather than submitting a load job with no source URIs
    raise FileNotFoundError(
        f"No parquet files found under gs://{bucket_name}/{analyst_dir_prefix}"
    )

# Client.dataset() is deprecated; build the reference explicitly against the
# client's own project, which is what Client.dataset() resolved to.
analyst_table_ref = bigquery.DatasetReference(bq_client.project, dataset_id).table("analyst_price_target")
analyst_job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
    autodetect=True,
    # Load jobs append by default, so re-running this cell would duplicate
    # every row. Replace the table contents to keep the cell idempotent.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Start the load job for the analyst price target files
load_job = bq_client.load_table_from_uri(
    [f"gs://{bucket_name}/{file}" for file in analyst_details_files],
    analyst_table_ref,
    job_config=analyst_job_config
)

# Block until the job finishes; result() raises if the load failed
load_job.result()

# Report the table's row count (equal to the rows just loaded, since the table was replaced)
analyst_details_table = bq_client.get_table(analyst_table_ref)
print(f"Loaded {analyst_details_table.num_rows} rows into {dataset_id}:analyst_price_target.")


Loaded 70 rows into financial_data:analyst_price_target.


### Load Market Cap

In [21]:
# Parameters for the market cap load
# Project and dataset identifiers
project_id, dataset_id = "jaber-financial", "financial_data"
# Source bucket and the object-name prefix scanned for parquet files
bucket_name = "marketcap_details"
market_dir_prefix = "data/"

# NOTE: same helper as defined in the earlier load cells; this redefinition is
# redundant when the notebook is run top to bottom.
def list_parquet_files(bucket_name, prefix):
    """List the '.parquet' object names stored under ``prefix`` in ``bucket_name``.

    Relies on the module-level ``gcs_client`` for the bucket listing.
    """
    blob_iter = gcs_client.bucket(bucket_name).list_blobs(prefix=prefix)
    return [item.name for item in blob_iter if item.name.endswith('.parquet')]

# Load every parquet file found under the configured prefix of the source
# bucket into the single BigQuery table 'marketcap_details'.
marketcap_details_files = list_parquet_files(bucket_name, market_dir_prefix)
if not marketcap_details_files:
    # Fail here with a clear message rather than submitting a load job with no source URIs
    raise FileNotFoundError(
        f"No parquet files found under gs://{bucket_name}/{market_dir_prefix}"
    )

# Client.dataset() is deprecated; build the reference explicitly against the
# client's own project, which is what Client.dataset() resolved to.
market_table_ref = bigquery.DatasetReference(bq_client.project, dataset_id).table("marketcap_details")
market_job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
    autodetect=True,
    # Load jobs append by default, so re-running this cell would duplicate
    # every row. Replace the table contents to keep the cell idempotent.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Start the load job for the market cap files
load_job = bq_client.load_table_from_uri(
    [f"gs://{bucket_name}/{file}" for file in marketcap_details_files],
    market_table_ref,
    job_config=market_job_config
)

# Block until the job finishes; result() raises if the load failed
load_job.result()

# Report the table's row count (equal to the rows just loaded, since the table was replaced)
market_details_table = bq_client.get_table(market_table_ref)
print(f"Loaded {market_details_table.num_rows} rows into {dataset_id}:marketcap_details.")


Loaded 70 rows into financial_data:marketcap_details.
