# In [8]:
# 1️⃣ Authenticate
# Runs Colab's user-authentication call first; the Drive service and the
# BigQuery client in this script are only created after it returns.
# NOTE(review): presumably those clients pick up the credentials established
# here — confirm against the Colab auth documentation.
from google.colab import auth
auth.authenticate_user()

import io
import os
from googleapiclient.discovery import build
from google.cloud import bigquery

# 2️⃣ Configuration: BigQuery destination and the top-level shared Drive folder
PROJECT_ID = "ncc-data-bigquery"
BQ_DATASET = "graphs"
SHARED_FOLDER_ID = "1vnDC82IoWqNL8wnYHxCYBBHRXIegOjcL"

# 3️⃣ Clients: a BigQuery client bound to PROJECT_ID and a Drive API v3 service
bq_client = bigquery.Client(project=PROJECT_ID)
drive_service = build(serviceName="drive", version="v3")

# 4️⃣ Helper: list files in a folder
def list_folder_files(folder_id):
    """Return every non-trashed item directly inside a Drive folder.

    The Drive ``files.list`` call is paginated, so a single request may not
    return all children. This follows the result pages until the API reports
    that there are no more, instead of returning only the first page.

    Args:
        folder_id: ID of the Drive folder whose direct children are listed.

    Returns:
        A list with the ``files`` entries of every response page, in the
        order the API returned them. Empty if the folder has no children.
    """
    query = f"'{folder_id}' in parents and trashed=false"
    files_api = drive_service.files()

    children = []
    request = files_api.list(q=query, pageSize=1000)
    # list_next() builds the request for the following page and returns None
    # once the previous response carries no nextPageToken.
    while request is not None:
        response = request.execute()
        children.extend(response.get('files', []))
        request = files_api.list_next(request, response)
    return children

# 5️⃣ Recursive function to traverse folder tree
def traverse_and_load(folder_id, path_prefix=""):
    """Recursively load every CSV found under a Drive folder into BigQuery.

    Sub-folders are traversed depth-first. Each file whose name ends in
    ``.csv`` (case-insensitive) is downloaded fully into memory and loaded
    into the table ``PROJECT_ID.BQ_DATASET.<table name>`` with schema
    autodetection, skipping the first (header) row. Other files are ignored.

    The table name is the file name without its extension, with every
    character that is not a letter, digit or underscore replaced by ``_``
    (so a name such as ``a.b.csv`` no longer produces a table id with extra
    dot-separated parts). The folder path is only used for the progress
    message, not for the table name.

    Args:
        folder_id: ID of the Drive folder to start from.
        path_prefix: Human-readable path of ``folder_id`` relative to the
            starting folder; used only in the printed progress message.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Imported once per call instead of once per CSV inside the loop.
    from googleapiclient.http import MediaIoBaseDownload

    for item in list_folder_files(folder_id):
        item_name = item['name']

        if item['mimeType'] == 'application/vnd.google-apps.folder':
            # Folder: recurse, extending the display path.
            traverse_and_load(item['id'], os.path.join(path_prefix, item_name))
            continue

        if not item_name.lower().endswith(".csv"):
            continue

        # CSV file: derive a safe table name from the file stem.
        stem = os.path.splitext(item_name)[0]
        table_name = "".join(
            ch if ch.isalnum() or ch == "_" else "_" for ch in stem
        )
        table_id = f"{PROJECT_ID}.{BQ_DATASET}.{table_name}"

        print(f"Loading CSV: {os.path.join(path_prefix, item_name)} → BigQuery table: {table_id}")

        # Download the CSV into memory, chunk by chunk, using the shared
        # Drive service (no new service object is built per file).
        request = drive_service.files().get_media(fileId=item['id'])
        buffer = io.BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)
        done = False
        while not done:
            _, done = downloader.next_chunk()

        # Rewind so the load job reads the buffer from the beginning.
        buffer.seek(0)

        # Load into BigQuery.
        # NOTE(review): no write_disposition is set, so the library default
        # applies; re-running may add rows to existing tables — confirm that
        # this is intended.
        job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.CSV,
            skip_leading_rows=1,
            autodetect=True,
        )
        load_job = bq_client.load_table_from_file(buffer, table_id, job_config=job_config)
        load_job.result()  # Block until the load job finishes (raises on failure).
        print(f"✅ Loaded {table_id}")

# 6️⃣ Kick off the recursive load, starting at the top-level shared folder
traverse_and_load(folder_id=SHARED_FOLDER_ID)
print("🎉 All CSVs loaded from shared folder → BigQuery!")


# KeyboardInterrupt: