<a href="https://colab.research.google.com/github/rsabilio/ia024-projeto-rag/blob/main/0_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing packages

In [None]:
!pip install -q pydrive2

# Downloading the Files

In [None]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive2 client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
from os import makedirs
from os.path import dirname, join

def download_drive_item(drive, item_id, dest_path):
    """
    Downloads a file or folder from Google Drive to the specified local destination.

    Args:
      drive (GoogleDrive): Authenticated GoogleDrive instance.
      item_id (str): The ID of the Google Drive item (file or folder) to download.
      dest_path (str): The local destination path to download the item.

    """
    item = drive.CreateFile({'id': item_id})
    item.FetchMetadata()

    # Determine if the item is a folder or a file
    if item['mimeType'] == 'application/vnd.google-apps.folder':
        makedirs(dest_path, exist_ok=True)
        file_list = drive.ListFile({'q': f"'{item_id}' in parents and trashed=false"}).GetList()
        for file in file_list:
            file_path = join(dest_path, file['title'])
            if file['mimeType'] == 'application/vnd.google-apps.folder':
                download_drive_item(drive, file['id'], file_path)
            else:
                makedirs(dirname(file_path), exist_ok=True)
                print(f"Downloading {file['title']} to {file_path}")
                file.GetContentFile(file_path)
    else:
        makedirs(dirname(dest_path), exist_ok=True)
        print(f"Downloading {item['title']} to {dest_path}")
        item.GetContentFile(dest_path)

#### IFSP Boituva Portarias

In [None]:
item_id = '1NX5kRKEGV3t5X0k_wb3RJHNbCLASiiiw'
destination_path = '/content/shared_item'
download_drive_item(drive, item_id, destination_path)

Downloading Mai_Port_0033_Fiscal de contrato Vigilância.pdf to /content/shared_item/Portarias 2024/MAIO/Mai_Port_0033_Fiscal de contrato Vigilância.pdf
Downloading Abr_Port_0032_ENADE ADS_BTV.pdf to /content/shared_item/Portarias 2024/ABRIL/Abr_Port_0032_ENADE ADS_BTV.pdf
Downloading Abr_Port_0031_Revogar e Designar CPA DO CAMPUS_BTV.pdf to /content/shared_item/Portarias 2024/ABRIL/Abr_Port_0031_Revogar e Designar CPA DO CAMPUS_BTV.pdf
Downloading Abr_Port_0030_Designar Orientadores de Estágio para o ano letivo de 2024.pdf to /content/shared_item/Portarias 2024/ABRIL/Abr_Port_0030_Designar Orientadores de Estágio para o ano letivo de 2024.pdf
Downloading Abr_Port_0029_Aprovar ad referendum Projeto Pedagógico do Curso Superior de Tecnologia em Gestão da Produção Industrial.pdf to /content/shared_item/Portarias 2024/ABRIL/Abr_Port_0029_Aprovar ad referendum Projeto Pedagógico do Curso Superior de Tecnologia em Gestão da Produção Industrial.pdf
Downloading Abr_Port_0028_Aprovar ad referen

### Estatuto

In [None]:
file_id = '1XzjBDrh0N3TxUbKc4Jjs5pEaONxj7lx2'
downloaded_file = drive.CreateFile({'id': file_id})
downloaded_file.GetContentFile('ESTATUTO IFSP-VERSÃO VIGENTE-Resolução 08-2014--ALERAÇÃO da Resolução 872-2013.pdf')

# Stats

In [None]:
import os
import pandas as pd

In [None]:
# Instalar e configurar o Git
#!apt-get install git

In [1]:
# Clonar o repositório GitHub
!git clone https://github.com/rsabilio/ia024-projeto-rag.git

Cloning into 'ia024-projeto-rag'...
remote: Enumerating objects: 5005, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 5005 (delta 1), reused 10 (delta 1), pack-reused 4995[K
Receiving objects: 100% (5005/5005), 1.11 GiB | 21.76 MiB/s, done.
Resolving deltas: 100% (1390/1390), done.
Updating files: 100% (4582/4582), done.


In [2]:
%cd ia024-projeto-rag
!ls

/content/ia024-projeto-rag
0-downloaded-files  0_download.ipynb  1_extracao.ipynb	1-files-content  LICENSE  README.md


In [17]:
# TODO: adicionar a contagem de arquivos das outras pastas
def count_files_by_directory(path):
    data = []

    for root, dirs, files in os.walk(path):
        relative_path = os.path.relpath(root, path)
        parts = relative_path.split(os.sep)

        # Verificar se há informações suficientes nos diretórios
        if len(parts) >= 4:
            campus = parts[0].split('-')[1]
            tipo_documento = parts[1]
            ano = parts[2]
            mes = parts[3].split(' - ')[0].strip()
            quantidade = len(files)

            data.append({
                'campus': campus,
                'tipo_documento': tipo_documento,
                'ano': ano,
                'mes': mes,
                'quantidade': quantidade
            })

    return pd.DataFrame(data)

# Caminho para o repositório clonado
repo_name = '/content/ia024-projeto-rag/0-downloaded-files'

# Contar arquivos por diretório e criar o DataFrame
df = count_files_by_directory(repo_name)

# Exibir o DataFrame
display(df)


Unnamed: 0,campus,tipo_documento,ano,mes,quantidade
0,capivari,portarias,2013,09,1
1,capivari,portarias,2013,12,2
2,capivari,portarias,2013,11 Novembro,4
3,capivari,portarias,2013,08,1
4,capivari,portarias,2013,10,4
...,...,...,...,...,...
366,boituva,portarias,2021,10,11
367,boituva,portarias,2021,01,9
368,boituva,portarias,2021,06,11
369,boituva,portarias,2021,04,8


In [19]:
df.groupby(by=['campus']).agg({'quantidade': 'sum'})

Unnamed: 0_level_0,quantidade
campus,Unnamed: 1_level_1
araraquara,1656
boituva,1180
capivari,1737
