# Objective

This notebook updates CSV files in a Google Cloud Storage bucket by adding a new column to every CSV file that is missing it.

This notebook performs the following steps:
1. Backs up all CSV files before making any changes.
2. For each CSV file: downloads it to a temporary folder, adds the new column and reorders the columns, then uploads the updated file back to the bucket, replacing the old one.

In [1]:
import pandas as pd
from google.oauth2.service_account import Credentials
from google.cloud import storage
import tqdm

# 1. Set Parameters

In [1]:
# GOOGLE CLOUD
# Path to the service-account JSON key used to build the storage client,
# and the name of the bucket whose CSV files will be processed.
key_file_path = "/Users/.credentials/google.json"
bucket_name = "bucket-name"

# Specify the prefixes (folders) you want to scan.
# Every object whose name starts with one of these prefixes and ends with
# ".csv" is backed up and then transformed.
prefixes = [
    "staging/brutos_prontuario_vitai/estoque_posicao/",
]

# LOCAL
# Local folder that receives the backup copies of the CSV files.
backup_folder = "/Users/tmp/"
# Local scratch folder for the file being transformed; the temporary file
# name is appended directly to this string, so keep the trailing slash.
tmp_folder = "/Users/projects/pipelines_rj_sms/data/raw/"

# PAYLOAD
# Column added (filled with empty strings) to files that do not have it yet.
new_column = "ultimaAtualizacao"
# Final column layout of every rewritten CSV. The transform step selects
# exactly these columns in this order, so it must include `new_column`.
columns_order = [
    "id",
    "produtoId",
    "estabelecimentoId",
    "cnes",
    "sigla",
    "produtoCodigo",
    "descricao",
    "grupo",
    "subGrupo",
    "categoria",
    "apresentacao",
    "lote",
    "secao",
    "dataVencimento",
    "controlado",
    "saldo",
    "valorMedio",
    "dataHora",
    "ultimaAtualizacao",
    "_data_carga",
]

# 2. Backup Step

In [None]:
# Back up every CSV object under `prefixes` to `backup_folder` before any
# file in the bucket is modified.
#
# Cell-local imports keep this cell runnable on its own: a plain
# `import tqdm` is not guaranteed to load the `tqdm.notebook` submodule,
# so it is imported explicitly here (this still binds the name `tqdm` to
# the top-level package).
from pathlib import Path

import tqdm.notebook

credentials = Credentials.from_service_account_file(key_file_path)
client = storage.Client(credentials=credentials)
bucket = client.get_bucket(bucket_name)

# List the CSV objects once, so the progress bar total matches exactly the
# items being iterated and the bucket is not listed twice.
csv_blobs = []
for prefix in prefixes:
    for blob in bucket.list_blobs(prefix=prefix):
        if blob.name.endswith(".csv"):
            csv_blobs.append(blob)

# Calculate the total number of files (also read by the transform step)
total_files = len(csv_blobs)

print(f"Total files: {total_files}")


# Start backing up the files
for blob in tqdm.notebook.tqdm(csv_blobs, total=total_files):
    # Mirror the full object path under the backup folder. Using only the
    # base name would let files with the same name in different sub-folders
    # overwrite each other, silently losing backups.
    destination = Path(backup_folder, *blob.name.split("/"))
    destination.parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(str(destination))

# 3. Transform Step

In [None]:
# Add `new_column` to every CSV object under `prefixes`, reorder its columns
# to `columns_order`, and upload the result over the original object.
#
# A plain `import tqdm` is not guaranteed to load the `tqdm.notebook`
# submodule, so import it explicitly (the name `tqdm` still refers to the
# top-level package afterwards).
import tqdm.notebook

tmp_path = f"{tmp_folder}temp.csv"

# Collect the CSV objects before modifying anything: the progress bar total
# then matches the work actually done, and the listing is not consumed while
# the listed objects are being overwritten.
csv_blobs = []
for prefix in prefixes:
    for blob in bucket.list_blobs(prefix=prefix):
        if blob.name.endswith(".csv"):
            csv_blobs.append(blob)

for blob in tqdm.notebook.tqdm(csv_blobs, total=len(csv_blobs)):
    blob.download_to_filename(tmp_path)

    # Read everything as text and keep empty cells as empty strings so the
    # round trip does not alter existing values.
    df = pd.read_csv(
        tmp_path,
        sep=";",
        dtype=str,
        keep_default_na=False,
        encoding="utf-8",
    )

    # Already in the target layout: skip the rewrite so re-running the
    # notebook does not needlessly re-upload untouched files.
    if list(df.columns) == columns_order:
        continue

    if new_column not in df.columns:
        df[new_column] = ""

    # Fail with the object name when an expected column is absent, instead
    # of a bare pandas KeyError that does not say which file is at fault.
    missing_columns = [col for col in columns_order if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{blob.name} is missing expected columns: {missing_columns}"
        )

    # NOTE: this keeps only the columns listed in `columns_order`; any other
    # column present in the file is dropped.
    df = df[columns_order]

    # Save the updated DataFrame back to a CSV file
    df.to_csv(tmp_path, index=False, sep=";", encoding="utf-8")

    # Upload the updated CSV file, replacing the old one
    blob.upload_from_filename(tmp_path)

print("All CSV files have been updated.")