## Working with GCS Blob Storage
**Run this Notebook on GCP with Python Kernel**

In [1]:
import time
import datetime
import os
import pandas as pd

#### Set-up COS functions for GCS

In [2]:
from google.cloud import storage

In [3]:
def list_blobs(bucket_name, folder_name):
    """List all files in given COS directory."""    
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))
    for blob in blobs:
        print(blob.name + '\t' + str(blob.size))

In [4]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""    
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)

In [5]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from COS bucket."""
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

In [6]:
def delete_folder(bucket_name, folder_name):
    """Delete folder from COS bucket."""    
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))
    for blob in blobs:
        blob.delete()

In [7]:
def list_blobs_pd(bucket_name, folder_name):
    """List all files in given COS directory."""       
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))

    blob_name = []
    blob_size = []
    blob_time = []
    
    for blob in blobs:
        blob_name.append(blob.name)
        blob_size.append(blob.size)
        blob_time.append(blob.time_created)

    blobs_df = pd.DataFrame(list(zip(blob_name, blob_size, blob_time)), columns=['Name', 'Size', 'Time_Stamp'])

#     blobs_df = blobs_df.style.format({"Size": "{:,.0f}"}) 
    
    return blobs_df

### Accessing data using Pandas from GCS

In [8]:
list_blobs(bucket_name='msca-bdp-data-open', 
           folder_name='austin/')

austin/	0
austin/Austin_311.json	214537534
austin/Austin_311_Data.json	214524951
austin/Municipal_Court_Caseload_Information.csv	877080187
austin/Municipal_Court_Caseload_Information.zip	94212181
austin/PO_Commodity_Goods.csv	68316711
austin/PO_Commodity_Goods.parquet	21978945
austin/Water_Quality_Sampling_Data.csv	320434750
austin/Water_Quality_Sampling_Data.zip	18928325
austin/violation_type_desc.csv	152


In [9]:
df = pd.read_csv('gs://msca-bdp-data-open/austin/PO_Commodity_Goods.csv')
df.head(1)

Unnamed: 0,COMMODITY,COMMODITY_DESCRIPTION,EXTENDED_DESCRIPTION,QUANTITY,UNIT_OF_MEASURE,UNIT_OF_MEAS_DESC,UNIT_PRICE,ITM_TOT_AM,MASTER_AGREEMENT,CONTRACT_NAME,...,AWARD_DATE,VENDOR_CODE,LGL_NM,AD_LN_1,AD_LN_2,CITY,ST,ZIP,CTRY,DATA_BUILD_DATE
0,21045,Meter Boxes and Concrete Pull Boxes,"MANHOLE, 36"", REINFORCED TONGUE, AND GROOVE CO...",4.0,EA,Each,253.0,1012.0,MA1100GA120000004,Generated by reorder 6/11/15 2:06 PM.,...,06/12/2015,HAN7137140,HANSON PIPE & PRECAST LLC,801 AIRPORT BLVD,,AUSTIN,TX,78702,US,09/12/2016


In [10]:
df.to_csv('gs://msca-bdp-data-shared/temp_gcs/PO_Commodity_Goods.csv')

In [11]:
list_blobs(bucket_name='msca-bdp-data-shared', 
           folder_name='temp_gcs/')

temp_gcs/PO_Commodity_Goods.csv	70169254


In [12]:
!gsutil ls -l 'gs://msca-bdp-data-shared/temp_gcs/PO_Commodity_Goods.csv'

  70169254  2023-04-13T15:32:37Z  gs://msca-bdp-data-shared/temp_gcs/PO_Commodity_Goods.csv
TOTAL: 1 objects, 70169254 bytes (66.92 MiB)


In [13]:
delete_folder(bucket_name='msca-bdp-data-shared', 
           folder_name='temp_gcs/')

In [14]:
!gsutil ls -l 'gs://msca-bdp-data-shared/temp_gcs'

CommandException: One or more URLs matched no objects.


In [15]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Thu, 13 April 2023 10:32:41'