
- This notebook was created within Kaggle. It is used to move the HappyHappy whale project data (over 64 GB) from Kaggle to a Google Cloud Storage (GCS) bucket. The intent is to use GCP storage and Google Colab for this project.

In [None]:
#PYTHON PACKAGE IMPORTS
import os 
import time
from google.cloud import storage
from kaggle_secrets import UserSecretsClient


#CONSTANTS
# Name of the destination bucket in Google Cloud Storage.
bucket_name = "happywhales"

# Root of the dataset inside the Kaggle input directory, and its top-level entries.
PATH = "/kaggle/input/happy-whale-and-dolphin"
folders = os.listdir(PATH)

# The GCP project id is read from the Kaggle secret named "project" rather than
# being hard-coded; the storage client shared by the functions below is bound to it.
project = UserSecretsClient().get_secret("project")
storage_client = storage.Client(project=project)

#FUNCTIONS
# function for creating the bucket at the google cloud services 
def create_bucket(bucket_name):
    """Create a bucket through the module-level ``storage_client`` and return it.

    Args:
        bucket_name: Name to give the new bucket.

    Returns:
        Whatever ``storage_client.create_bucket`` returns for the new bucket,
        so the caller can keep working with it instead of looking it up again.
    """
    return storage_client.create_bucket(bucket_name)
    
    
#function for uploading the data from kaggle to google cloud services 
def upload_files(bucket_name, path, jpg_list, skip_folders=("train_images",)):
    """Upload the files found one level below ``path`` to a GCS bucket.

    Every sub-directory of ``path`` is listed and each file in it is uploaded
    as the blob ``<folder>/<file>``, unless the file name appears in
    ``jpg_list`` or the folder is excluded.

    Args:
        bucket_name: Name of an existing bucket; it is fetched through the
            module-level ``storage_client``.
        path: Local directory whose sub-directories hold the files to upload.
        jpg_list: File names (without folder) that must not be uploaded again.
            ``None`` is treated as empty. Names are compared without their
            folder, so a listed name is skipped in every folder.
        skip_folders: Folder names to leave out entirely. The default keeps
            the "train_images" exclusion that was added after an interrupted
            run; pass ``()`` to upload every folder.

    Returns:
        None. The uploads are the only effect.
    """
    #get the bucket based on its name
    bucket = storage_client.get_bucket(bucket_name)

    # A set gives constant-time membership tests; checking against the list
    # rescanned it once per file.
    already_uploaded = set(jpg_list or ())

    # Accept a single folder name as well as an iterable of names.
    if isinstance(skip_folders, str):
        skip_folders = (skip_folders,)
    excluded = set(skip_folders)

    #loop through each entry of the dataset directory
    for folder in os.listdir(path):

        #skip excluded folders and CSV files
        if folder in excluded or folder.endswith("csv"):
            continue

        folder_path = f"{path}/{folder}"

        # Any other plain file would make os.listdir fail, so only
        # directories are descended into.
        if not os.path.isdir(folder_path):
            continue

        #loop through images in folder
        for jpg in os.listdir(folder_path):

            #skip jpgs already in jpg_list
            if jpg in already_uploaded:
                continue

            #create a bucket blob named <folder>/<file> and upload to GCP
            blob = bucket.blob(f"{folder}/{jpg}")
            blob.upload_from_filename(f"{folder_path}/{jpg}")
                


def list_blobs(bucket_name, prefix, delimiter):
    """Return the file names of the blobs listed under ``prefix`` in a bucket.

    Args:
        bucket_name: Bucket to list, passed to the module-level
            ``storage_client``.
        prefix: Only blobs whose name starts with this prefix are listed.
        delimiter: Delimiter forwarded to the listing call. Any falsy value
            (``None``, ``""``, ``False``) is forwarded as ``None``.
            NOTE(review): the library documents this parameter as an optional
            string, which is why a boolean is not passed through; confirm
            against the installed google-cloud-storage version.

    Returns:
        A list with the last "/"-separated component of every blob name, in
        listing order. A name ending in "/" yields an empty string.
    """
    blobs = storage_client.list_blobs(
        bucket_name, prefix=prefix, delimiter=delimiter or None
    )

    # Take the last path component rather than index 1: that is the file name
    # at any nesting depth, and it cannot fail for a name without a "/".
    return [blob.name.rsplit("/", 1)[-1] for blob in blobs]

In [2]:
#CREATE BUCKET - only needs to be done once
#create_bucket(bucket_name)  

In [3]:
# The data had to be moved piecemeal. To track what has already been moved, collect the
# file names currently stored under test_images/ in the bucket.
# Whatever is in that list should not be recopied.
# delimiter=None asks for the listing without a delimiter; the library documents the
# parameter as an optional string, so a boolean is not passed.
jpg_list = list_blobs(bucket_name, prefix="test_images/", delimiter=None)
len(jpg_list)

In [8]:
#UPLOAD DATA TO GOOGLE CLOUD STORAGE
# Run the transfer from the Kaggle dataset directory; file names in jpg_list are skipped.
upload_files(bucket_name=bucket_name, path=PATH, jpg_list=jpg_list)