# GCP Access Script

A collection of utilities for reading, (over)writing, and moving files/directories on GCP. Download this notebook as a .py file to use it alongside the other scripts in the repository.

In [None]:
#!pip install gcsfs
from google.cloud import storage
import os
import tempfile
import pandas as pd
import re

In [None]:
class gcp:
    """Convenience wrapper around a single Google Cloud Storage bucket.

    Provides helpers to list, read, upload and delete blobs, plus helpers
    for the bucket's ``master.csv`` file, whose ``filepath`` column is used
    to look up stored html files.
    """

    def __init__(self, bucket_name, credentials):
        """
        Input:
        - bucket_name: name of the target GCP bucket
        - credentials: path to the GCP credentials json file
        """
        self.bucket_name = bucket_name
        # make sure the json file is in the current directory
        # storage.Client() is created without explicit credentials, so it
        # relies on this environment variable to locate the key file.
        # NOTE(review): the gs:// reader used in read_master_csv presumably
        # picks up the same variable -- confirm before changing this.
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials
        self.storage_client = storage.Client()
        self.bucket = self.storage_client.bucket(self.bucket_name)

    def _download_text(self, filepath):
        """Download one blob and return its content decoded as utf-8 text."""
        blob = self.bucket.blob(filepath)
        # 'backslashreplace' keeps undecodable bytes as \xNN escapes instead
        # of raising, so a badly encoded page does not abort a batch read.
        return blob.download_as_bytes().decode('utf-8', 'backslashreplace')

    def list_blob(self):
        """List all the blobs in the bucket (prints one blob name per line)."""
        blobs = self.storage_client.list_blobs(self.bucket_name)
        for blob in blobs:
            print(blob.name)

    def delete_blob(self, filepath):
        """Deletes a blob from the bucket given specified filepath."""
        blob = self.bucket.blob(filepath)
        blob.delete()

    def list_html(self, imdb_id='', source=''):
        """Returns a list of html filepaths from GCP as a verification measure.

        Input:
        - imdb_id: leading path component of the blob name
        - source: path component that follows imdb_id

        Returns the names of all blobs in the bucket that start with
        ``imdb_id + '/' + source``.
        """
        # Literal prefix comparison: the previous regex match treated
        # characters such as '.' in the arguments as wildcards.
        prefix = imdb_id + '/' + source
        blobs = self.storage_client.list_blobs(self.bucket_name)
        return [blob.name for blob in blobs if blob.name.startswith(prefix)]

    def list_filepath(self, imdb_id='', source=''):
        """Lists all the filepaths of a given imdb_id and/or web source.

        Looks up the ``filepath`` column of the master csv and returns, as a
        list, every entry that contains ``imdb_id + '/' + source``.
        """
        filepaths = self.read_master_csv()['filepath']
        # regex=False: match the text literally (no regex metacharacters).
        # na=False: rows with a missing filepath are excluded instead of
        # producing an NA mask, which would make the indexing below fail.
        mask = filepaths.str.contains(imdb_id + '/' + source, regex=False, na=False)
        return filepaths[mask].tolist()

    def read_html(self, imdb_id='', source=''):
        """Returns a list of html files given imdb_id and web source.

        The files are selected through list_filepath (i.e. the master csv)
        and returned as decoded text, in the same order.
        """
        return self.read_html_by_filepath(self.list_filepath(imdb_id, source))

    def read_html_by_filepath(self, filepath):
        """Takes in a list of filepaths and returns a list of raw html files.

        A single filepath given as a plain string is treated as a
        one-element list. The result is always a list of decoded texts.
        """
        if isinstance(filepath, str):
            # Iterating a bare string would request one blob per character.
            filepath = [filepath]
        return [self._download_text(path) for path in filepath]

    def upload_html(self, html_text, gcp_filepath):
        """Uploads raw html files in .txt format to the designated directory.

        Input:
        - html_text: content to store
        - gcp_filepath: blob name (path inside the bucket) to write to
        """
        # The text is uploaded straight from memory; no local file is written.
        blob = self.bucket.blob(gcp_filepath)
        blob.upload_from_string(html_text)

    def read_master_csv(self):
        """Reads the master csv (csv that contains all metadata information about each file in GCP).

        Returns a DataFrame without the first csv column, which is the
        index column written by upload_master_csv.
        """
        return pd.read_csv('gs://' + self.bucket_name + '/master.csv').iloc[:, 1:]

    def upload_master_csv(self, master_csv, local_path):
        """Save a local copy of the new master csv and upload to GCP.

        Input:
        - master_csv: DataFrame to store
        - local_path: where the local copy is written before the upload
        """
        # to_csv writes the index as the first column; read_master_csv
        # drops that column again when the file is read back.
        master_csv.to_csv(local_path)
        blob = self.bucket.blob('master.csv')
        blob.upload_from_filename(local_path)

In [None]:
# test code
if __name__ == '__main__':
    # gcp() needs a bucket name and a credentials path; the previous call
    # passed neither and always raised TypeError. Both values are read from
    # the environment so this check can run without editing the file.
    bucket_name = os.environ.get('GCP_BUCKET_NAME')
    credentials = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    if not bucket_name or not credentials:
        raise SystemExit(
            'Set GCP_BUCKET_NAME and GOOGLE_APPLICATION_CREDENTIALS to run the test code.'
        )
    gcp_access = gcp(bucket_name, credentials)
    master_csv = gcp_access.read_master_csv()
    print(master_csv.head())