In [3]:
%pwd

'/workspaces/AI-Research-Assistant/notebook'

In [4]:
import os
os.chdir("../")
%pwd

'/workspaces/AI-Research-Assistant'

In [7]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass
class DataIngestionConfig:
    root_dir: Path
    FOLDER_ID_NP: str
    FOLDER_ID_P: List[str]
    DOWNLOAD_PATH_NP: Path
    DOWNLOAD_PATH_P: Path
    CREDENTIALS_FILE: Path

In [8]:
from src.constants import *
from src.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])

        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            FOLDER_ID_NP=config.FOLDER_ID_NP,
            FOLDER_ID_P=config.FOLDER_ID_P,
            DOWNLOAD_PATH_NP=config.DOWNLOAD_PATH_NP,
            DOWNLOAD_PATH_P=config.DOWNLOAD_PATH_P,
            CREDENTIALS_FILE=config.CREDENTIALS_FILE
        )
        return data_ingestion_config

In [10]:
import pathway as pw
from google.oauth2.service_account import Credentials as ServiceCredentials
import os

In [None]:
# Initialize the GDrive client
def initialize_gdrive_client(credentials_file, file_name_pattern):
    credentials = ServiceCredentials.from_service_account_file(credentials_file)
    client = pw.io.gdrive._GDriveClient(
        credentials=credentials,
        file_name_pattern=file_name_pattern
    )
    return client

# Download all PDF files from the specified folder
def download_pdfs_from_folder(client, folder_id, download_path):
    # Ensure the download directory exists
    os.makedirs(download_path, exist_ok=True)

    # Retrieve the list of PDF files in the folder
    pdf_files = client._ls(folder_id)

    # Download each PDF file
    for file in pdf_files:
        file_id = file['id']
        file_name = file['name']
        file_path = os.path.join(download_path, file_name)

        # Download the file content
        file_content = client.download(file)

        if file_content:
            # Save the file to the specified directory
            with open(file_path, 'wb') as f:
                f.write(file_content)
            print(f"Downloaded: {file_name}")
        else:
            print(f"Failed to download: {file_name}")
# Replace with your folder ID and path to the credentials file
FOLDER_ID_NP = '1_xFmMlrNDR0wzzPsv6wXXdGz0eX6vaYb'
CREDENTIALS_FILE = 'credentials.json'
DOWNLOAD_PATH_NP = 'artifacts/data_ingestion/Non-Publishable'
FOLDER_ID_P = ['1RifJJBjm5tA8E20808RjvkIAiWnFbceb', '1JVzabziJf4d2drCTXFssFr_wZMnjr8oT', '1sJKv0o5ySrigZewU_wtTxysx9j0kO_nV', '1ZgkbpvhoNKUuH0b4uCv30lyWg3-5ijTC', '13eDgt0YghQU2qlogGrTrXJzfD0h0F2Iw']
DOWNLOAD_PATH_P= 'artifacts/data_ingestion/Publishable'
# Initialize the client
gdrive_client = initialize_gdrive_client(CREDENTIALS_FILE, '*.pdf')
# Download the PDFs
download_pdfs_from_folder(gdrive_client, FOLDER_ID_NP, DOWNLOAD_PATH_NP)
# Corrected loop for downloading PDFs from multiple folders
for folder_id in FOLDER_ID_P:
    download_pdfs_from_folder(gdrive_client, folder_id, DOWNLOAD_PATH_P)


In [38]:
class DataIngestion: 
    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def initialize_gdrive_client(self):
        """Initialize the Google Drive client."""
        credentials = ServiceCredentials.from_service_account_file(str(self.config.CREDENTIALS_FILE))
        client = pw.io.gdrive._GDriveClient(
            credentials=credentials
        )
        return client

    def download_pdfs_from_folder(self, folder_id: str, download_path: Path):
        """Download PDF files from a specific Google Drive folder."""
        try:
            download_path = Path(download_path)
            gdrive_client = self.initialize_gdrive_client()
            

            os.makedirs(download_path, exist_ok=True)
            pdf_files = gdrive_client._ls(folder_id)
            

            for file in pdf_files:
                file_id = file['id']
                file_name = file['name']
                file_path = download_path / file_name
                file_content = gdrive_client.download(file)
                
                if file_content:
                    with open(file_path, 'wb') as f:
                        f.write(file_content)
                    print(f"Downloaded: {file_name}")
                else:
                    print(f"Failed to download: {file_name}")
        except Exception as e:
            print(f"Error downloading files from folder {folder_id}: {e}")
            raise e

    def download(self):
        """Download PDFs from both non-publishable and publishable folders."""
        try:
            self.download_pdfs_from_folder(self.config.FOLDER_ID_NP, self.config.DOWNLOAD_PATH_NP)
            for folder_id in self.config.FOLDER_ID_P:
                self.download_pdfs_from_folder(folder_id, self.config.DOWNLOAD_PATH_P)
        except Exception as e:
            print(f"Error during the download process: {e}")
            raise e


In [39]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download()

except Exception as e:
    raise e

[2025-01-05 07:52:03,222: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-01-05 07:52:03,223: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-05 07:52:03,225: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-01-05 07:52:03,225: INFO: common: created directory at: artifacts]
[2025-01-05 07:52:03,226: INFO: common: created directory at: artifacts/data_ingestion]
[2025-01-05 07:52:03,229: INFO: __init__: file_cache is only supported with oauth2client<4.0.0]
Downloaded: R004.pdf
Downloaded: R003.pdf
Downloaded: R002.pdf
Downloaded: R001.pdf
Downloaded: R005.pdf
[2025-01-05 07:52:18,840: INFO: __init__: file_cache is only supported with oauth2client<4.0.0]
Downloaded: R007.pdf
Downloaded: R006.pdf
[2025-01-05 07:52:25,831: INFO: __init__: file_cache is only supported with oauth2client<4.0.0]
Downloaded: R008.pdf
Downloaded: R009.pdf
[2025-01-05 07:52:32,591: INFO: __init__: file_cache is only supported with oauth2client<4.0.0]
Downloa