In [2]:
import pandas as pd
from google.oauth2 import service_account
from googleapiclient.discovery import build

# --- Configuration ---
# Service-account key file (JSON) used to authenticate against the Drive API.
SERVICE_ACCOUNT_FILE = 'turing-genai-ws-58339643dd3f.json'
# ID of the Google Drive folder that the scan starts from.
FOLDER_ID = '1C384l5Cmm2-4bs8D-4F-6P42d5jwYaKY'
# Only the read-only Drive scope is requested.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# Authenticate: load the key file, then build a Drive v3 client with it.
credentials = service_account.Credentials.from_service_account_file(
    filename=SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)
drive_service = build(
    serviceName='drive',
    version='v3',
    credentials=credentials,
)

# Recursively traverse folders to find all PDFs
def list_all_pdfs_recursive(parent_folder_id, parent_path="", service=None):
    """Collect every PDF under a Drive folder, descending into subfolders.

    Args:
        parent_folder_id: ID of the Drive folder whose children are listed.
        parent_path: Slash-joined names of the folders walked so far. It is
            only used to build the "Entering folder" progress message.
        service: Drive v3 service object to query. When omitted, the
            module-level ``drive_service`` is used.

    Returns:
        A list of ``{"pdf_name": ..., "pdf_link": ...}`` dicts. ``pdf_name``
        is the file name with a trailing ``.pdf`` removed and ``pdf_link`` is
        a ``drive.google.com`` view URL built from the file ID. Entries keep
        the order the API returned them in, with a subfolder's PDFs inserted
        at the position where that subfolder was encountered.
    """
    if service is None:
        service = drive_service

    folder_mime = 'application/vnd.google-apps.folder'
    pdf_mime = 'application/pdf'
    pdf_suffix = '.pdf'

    output_rows = []

    # List all non-trashed children of this folder, one page at a time.
    query = f"'{parent_folder_id}' in parents and trashed=false"
    page_token = None
    while True:
        response = service.files().list(
            q=query,
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType)',
            pageSize=1000,  # documented maximum; fewer round trips per folder
            pageToken=page_token,
        ).execute()

        for item in response.get('files', []):
            file_id = item['id']
            file_name = item['name']
            mime_type = item['mimeType']

            if mime_type == folder_mime:
                # Recurse into subfolder
                new_path = f"{parent_path}/{file_name}" if parent_path else file_name
                print(f"Entering folder: {new_path}")
                output_rows += list_all_pdfs_recursive(file_id, new_path, service)
            elif mime_type == pdf_mime:
                # PDFs are recognised by MIME type, so the name may carry no
                # ".pdf" extension at all. Strip only that suffix; cutting at
                # the last dot would mangle names such as "Report v1.2".
                pdf_base_name = file_name
                if file_name[-len(pdf_suffix):].lower() == pdf_suffix:
                    # Keep the full name if stripping would leave nothing.
                    pdf_base_name = file_name[:-len(pdf_suffix)] or file_name
                output_rows.append({
                    "pdf_name": pdf_base_name,
                    "pdf_link": f"https://drive.google.com/file/d/{file_id}/view?usp=sharing",
                })

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return output_rows

# --- Run ---
# Walk the configured folder and report how many PDFs were discovered.
print("Scanning Drive folder structure...")
all_pdf_entries = list_all_pdfs_recursive(FOLDER_ID)
print(f"Found {len(all_pdf_entries)} total PDF files.")

# Write the catalog as CSV, restricted to the pdf_name and pdf_link columns.
output_csv_path = 'pdf_catalog.csv'
output_df = pd.DataFrame(
    data=all_pdf_entries,
    columns=["pdf_name", "pdf_link"],
)
output_df.to_csv(
    path_or_buf=output_csv_path,
    encoding='utf-8',
    index=False,
)
print(f"Successfully created CSV with {len(output_df)} entries at {output_csv_path}")


Scanning Drive folder structure...
Found 4638 total PDF files.
Successfully created CSV with 4638 entries at pdf_catalog.csv
