In [1]:
from google.cloud import storage
import pandas as pd
import io
import yfinance as yf
from datetime import datetime
import os
import time
import numpy as np

In [2]:
from google.cloud import storage
from google.oauth2 import service_account

In [3]:
import os

# Register the Google Cloud credentials file under the
# GOOGLE_APPLICATION_CREDENTIALS environment variable for this process.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS='/workspaces/financial_data_scraping/jaber-financial-b20f23e23588.json'
)

# Read the value back out of the environment and echo it as a sanity check.
print("GOOGLE_APPLICATION_CREDENTIALS:", os.environ.get('GOOGLE_APPLICATION_CREDENTIALS'))


GOOGLE_APPLICATION_CREDENTIALS: /workspaces/financial_data_scraping/jaber-financial-b20f23e23588.json


In [4]:
from google.cloud import storage

def test_gcs_access(bucket_name):
    """Print whether the named Google Cloud Storage bucket exists.

    Builds a default ``storage.Client``, obtains a handle for ``bucket_name``
    and prints the result of that handle's ``exists()`` check.

    Args:
        bucket_name: Name of the bucket to check.

    Returns:
        None. The outcome is only printed.
    """
    bucket_handle = storage.Client().bucket(bucket_name)
    print(f"Bucket '{bucket_name}' exists: {bucket_handle.exists()}")

# Check that the 'companies_details' bucket is reachable with the configured credentials
test_gcs_access('companies_details')

Bucket 'companies_details' exists: True


In [5]:
# Google Cloud Storage bucket and blob path of the companies parquet file.
# NOTE: `bucket_name` is a notebook-level global that a later cell reassigns
# to 'marketcap_details'; re-run this cell before downloading again.
bucket_name = 'companies_details'
source_blob_name = 'data/companies_details.parquet'

def download_from_gcs(bucket_name, source_blob_name):
    """Fetch a blob from Google Cloud Storage and return its contents.

    Args:
        bucket_name: Name of the bucket that holds the blob.
        source_blob_name: Path of the blob inside the bucket.

    Returns:
        Whatever ``download_as_bytes()`` returns for the requested blob.
    """
    source_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    return source_blob.download_as_bytes()

# Download the raw parquet bytes for the configured bucket/blob from GCS
file_content = download_from_gcs(bucket_name, source_blob_name)

# Wrap the bytes in an in-memory buffer and parse them into a pandas DataFrame
all_companies = pd.read_parquet(io.BytesIO(file_content))

In [7]:
# Function to fetch market cap and scraping details
def fetch_market_cap(ticker, isin, exchange_suffix='.MC'):
    """Look up a company's market capitalisation through yfinance.

    ``exchange_suffix`` is appended to ``ticker`` to form the symbol handed to
    ``yf.Ticker``; the ``marketCap`` entry of its ``info`` mapping is returned
    together with bookkeeping fields describing the lookup.

    Args:
        ticker: Ticker symbol without the exchange suffix.
        isin: ISIN of the company; copied into the result unchanged.
        exchange_suffix: Suffix appended to ``ticker`` (default ``'.MC'``).

    Returns:
        A dict with keys ``ISIN``, ``Ticker``, ``Exchange``, ``MarketCap``,
        ``scraping_url_analysis`` and ``scraping_timestamp``, or ``None`` when
        any exception is raised during the lookup (the error is printed).
    """
    result = None
    try:
        ticker_with_suffix = ticker + exchange_suffix
        info = yf.Ticker(ticker_with_suffix).info

        result = {
            'ISIN': isin,
            'Ticker': ticker,
            # Drop the dots around the suffix so '.MC' is stored as 'MC'.
            'Exchange': exchange_suffix.strip('.'),
            # ``.get`` yields None when the provider reports no market cap.
            'MarketCap': info.get('marketCap'),
            'scraping_url_analysis': f"https://finance.yahoo.com/quote/{ticker_with_suffix}/analysis/",
            'scraping_timestamp': datetime.now().isoformat()
        }
    except Exception as e:
        # Best-effort scraping: report the failure and fall through to return None.
        print(f"Error fetching data for {ticker}: {e}")
    return result

# Function to upload DataFrame to GCS
def upload_to_gcs(bucket_name, df, destination_blob_name):
    """Serialise a DataFrame to parquet in memory and upload it to GCS.

    Args:
        bucket_name: Name of the destination bucket.
        df: Object providing ``to_parquet``; written without its index.
        destination_blob_name: Path of the blob to write inside the bucket.

    Returns:
        None. A confirmation line is printed after the upload call returns.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)

    # Write the parquet bytes into memory, then rewind so the upload
    # reads the buffer from its first byte.
    parquet_buffer = io.BytesIO()
    df.to_parquet(parquet_buffer, index=False)
    parquet_buffer.seek(0)

    target_blob.upload_from_file(parquet_buffer, content_type='application/octet-stream')
    print(f"Data uploaded to {destination_blob_name} in bucket {bucket_name}.")

# Function to save DataFrame locally
def save_locally(df, local_path):
    """Write a DataFrame to a local parquet file and print a confirmation.

    Args:
        df: Object providing ``to_parquet``; written without its index.
        local_path: Destination handed straight to ``df.to_parquet``. This
            function does not create missing parent directories itself.

    Returns:
        None.
    """
    df.to_parquet(local_path, index=False)
    print(f"Data saved locally to {local_path}.")

# (Ticker, ISIN) pairs taken from the companies DataFrame
tickers = all_companies[['Ticker', 'ISIN']].values.tolist()

# Fetch market cap for each ticker with a delay between calls.
# fetch_market_cap returns None on failure, so failed tickers are skipped.
marketcap_details = []
for ticker, isin in tickers:
    data = fetch_market_cap(ticker, isin)
    if data:
        marketcap_details.append(data)

    # Introduce random delay between API calls (1 to 3 seconds)
    random_int = np.random.choice([1, 2, 3])
    time.sleep(random_int)

# Convert the collected records into a DataFrame (one row per successful ticker)
marketcap_details_df = pd.DataFrame(marketcap_details)

# Define Google Cloud Storage bucket name and file path for the results.
# NOTE: this rebinds the notebook-level `bucket_name` global.
bucket_name = 'marketcap_details'
destination_blob_name = 'data/marketcap_details.parquet'

# Define local file path
local_file_path = '/workspaces/financial_data_scraping/data/marketcap_details.parquet'

# Ensure the local directory exists. `exist_ok=True` avoids the
# check-then-create race of a separate os.path.exists() test, and the guard
# skips the call when the path has no directory component.
local_dir = os.path.dirname(local_file_path)
if local_dir:
    os.makedirs(local_dir, exist_ok=True)

# Save DataFrame locally
save_locally(marketcap_details_df, local_file_path)

# Upload to GCS
upload_to_gcs(bucket_name, marketcap_details_df, destination_blob_name)

Data saved locally to /workspaces/financial_data_scraping/data/marketcap_details.parquet.
Data uploaded to data/marketcap_details.parquet in bucket marketcap_details.
