## Toczala - CP4D Python Helper Routines

In [None]:
# Import needed Python packages

import os, uuid, sys
import pandas as pd
import requests
import json
import shutil
import warnings
import urllib3
warnings.simplefilter(action='ignore', category=urllib3.exceptions.InsecureRequestWarning)

from project_lib import Project
from ibm_watson_studio_lib import access_project_or_space

#
# YOU NEED TO SET THIS FOR EACH SPECIFIC PROJECT
#
# Set your Cloud Pak base access point (scheme + host, no trailing slash;
# the REST routines below append paths that start with "/")
#
CP4D_BASE_URL = "https://cpd-cpd-instance.apps.poc.watson.techops.aa.com"
#
# Grab a Watson Studio lib object (used by all project file routines below)
#
wslib = access_project_or_space()
#
# Set/get platform and project info and token
#
# Both values are read from the notebook's environment variables;
# os.environ.get returns None when a variable is not set.
#
token = os.environ.get('USER_ACCESS_TOKEN')
project_id = os.environ.get('PROJECT_ID')
host = CP4D_BASE_URL
#
# Set your REST message header (with authorization token)
#
# NOTE: if USER_ACCESS_TOKEN is not set, token is None and the string
# concatenation below raises a TypeError.
#
headers = {
     'Authorization': 'Bearer ' + token,
     'accept': 'application/json',
     'Content-Type': 'application/json'
}

# FILE MANIPULATION ROUTINES for STORAGE VOLUMES

In [None]:
#
# PROJECT FILE MANIPULATION
#
# Project file manipulation is done through use of the ibm-watson-studio-lib package
# for Python.  You can read more about this package at:
#  https://www.ibm.com/docs/en/cloud-paks/cp-data/4.0?topic=lib-watson-studio-python
#
# Some notes:
#  - you cannot delete files.  You can delete them from the filesystem, but then the
#    file handle is still registered with the project area, and no file exists.  If you
#    see corrupted files in project spaces, this is probably the cause.
#  - you can do a raw os copy of a project file, but unless you then register the file,
#    you will not see it in any other view of project files.
#
# Get data connections for this project
#
def get_project_connections():
    """Return a list with the name of every connection in this project.

    The names are taken from the "name" entry of each item returned by
    wslib.list_connections(), in the order that call returns them.
    """
    return [conn["name"] for conn in wslib.list_connections()]
#
# Get path to the files in the project
#
def get_data_asset_path():
    """Return the base directory reported by the project's mounted storage."""
    base_dir = wslib.mount.get_base_dir()
    return base_dir
#
# Get a json object of dir of the files on the project space
#
def get_proj_dir_json():
    """Return the project's data listing: connected data followed by stored data.

    The result is the concatenation of wslib.list_connected_data() and
    wslib.list_stored_data(), in that order.
    """
    connected = wslib.list_connected_data()
    stored = wslib.list_stored_data()
    return connected + stored
#
# Get a list of names of dir of the files on the project space
#
def get_proj_dir_list():
    """Return just the "name" of every entry reported by get_proj_dir_json()."""
    return [entry["name"] for entry in get_proj_dir_json()]
#
# Pretty print a dir of the files on the project space
#
def print_proj_dir_list():
    """Display the project's connected and stored data via wslib.show().

    Always returns True.
    """
    wslib.show(wslib.list_connected_data() + wslib.list_stored_data())
    return True
#
# Copy a file in the project data area (and register the copy as a data asset)
#
def copy_project_file(old_name, new_name):
    """Copy a project data file and register the copy as a data asset.

    Both names are relative to the data asset folder returned by
    get_data_asset_path().

    Parameters:
        old_name: name of the existing file to copy.
        new_name: name of the copy; also used as the registered asset name.

    Returns:
        [0] when the copy and the registration both succeed,
        [1] when either step fails (the error is printed).
    """
    try:
        #
        # Copy the file inside the data assets folder in the
        # mounted project storage
        #
        data_asset_folder = get_data_asset_path()
        source_path = os.path.join(data_asset_folder, old_name)
        target_path = os.path.join(data_asset_folder, new_name)
        shutil.copy(source_path, target_path)
    except Exception:
        print("**ERROR**", " in copy_project_file - move ")
        print("**ERROR**", sys.exc_info())
        #
        # Stop here: registering an asset whose file was never written
        # would leave a handle in the project that points at nothing.
        #
        return [1]
    try:
        #
        # Register the file as data asset
        #
        wslib.mount.register_asset(target_path, asset_name=new_name)
    except Exception:
        print("**ERROR**", " in copy_project_file - register_asset")
        print("**ERROR**", sys.exc_info())
        return [1]
    return [0]
#
# Create a CSV file from a panda dataframe in a project data file
# name = name of the resulting csv file (with no source directory)
#
def write_pandas_df_to_proj_csv_file(name,dataframe):
    """Save a pandas dataframe as a CSV project file through wslib.save_data().

    Parameters:
        name: name of the resulting csv file (with no source directory).
        dataframe: dataframe to serialize; the index is not written.

    Returns:
        Whatever wslib.save_data() returns on success, or [] when anything
        fails (the error is printed).
    """
    try:
        csv_bytes = dataframe.to_csv(index=False).encode()
        stat = wslib.save_data(name, csv_bytes)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("**ERROR**", " in write_pandas_df_to_proj_csv_file ")
        print("**ERROR**", sys.exc_info())
        stat = []
    return stat

In [None]:
#
# STORAGE VOLUME FILE MANIPULATION
#
#
# Storage volume file manipulation is done through use of the Volumes API.
# You can read more about this API at:
#  https://www.ibm.com/docs/en/cloud-paks/cp-data/4.0?topic=resources-volumes-api
#
#
# Get a list of the available storage volume names
#
def get_storage_volume_list():
    """Return the display names of the available storage volumes.

    Issues a GET to <host>/zen-data/v3/service_instances?addon_type=volumes
    using the module-level headers and collects the "display_name" of every
    entry under "service_instances" in the JSON reply.

    Returns:
        A list of volume names, or [] when the request fails or the reply
        does not have the expected shape (the error is printed).
    """
    this_url = host + "/zen-data/v3/service_instances?addon_type=volumes"
    try:
        # verify=False: TLS certificate verification is disabled for this call
        r = requests.get(this_url,
                         headers=headers,
                         verify=False)
        instance_list = r.json()["service_instances"]
        return [volume["display_name"] for volume in instance_list]
    except Exception:
        print("**ERROR**", " in get_storage_volume_list ")
        print("**ERROR**", sys.exc_info())
        # Return here; falling through would touch an unbound response object
        return []
#
# Create a Volume API specific HTTP header with access token
#
def get_volume_header(inst_id=None):
    """Build a Volumes API HTTP header carrying a storage volume access token.

    Parameters:
        inst_id: service instance id of the storage volume. When omitted,
            a module-level variable named inst_id is used if one exists
            (this mirrors what the routine read before the parameter
            was added).

    Returns:
        A dict with 'Authorization' and 'Content-Type' entries on success,
        or a dict with a single "error" entry when no instance id is
        available, the request fails, or the reply carries no "AccessToken"
        (the error is printed).
    """
    if inst_id is None:
        inst_id = globals().get("inst_id")
    if inst_id is None:
        print("**ERROR**", " in get_volume_header ")
        print("**ERROR**", "no service instance id supplied")
        return {"error": "no service instance id supplied"}
    #
    # Get a storage volume token
    #
    this_url = host + "/zen-data/v2/serviceInstance/token"
    mydata = json.dumps({"serviceInstanceID": inst_id})
    try:
        # verify=False: TLS certificate verification is disabled for this call
        r = requests.get(this_url,
                         headers=headers,
                         data=mydata,
                         verify=False)
        access_token = r.json()["AccessToken"]
    except Exception:
        print("**ERROR**", " in get_volume_header ")
        print("**ERROR**", sys.exc_info())
        # Return here; falling through would touch an unbound response object
        return {"error": sys.exc_info()}
    return {
         'Authorization': 'Bearer ' + access_token,
         'Content-Type': 'application/json'
        }
#
# Input the name of the storage volume, and discover the mount point
#
def get_sv_mount_point(sv_name):
    """Return the mount path of the storage volume whose display name is sv_name.

    Issues a GET to <host>/zen-data/v3/service_instances?addon_type=volumes
    and scans the "service_instances" entries of the JSON reply.

    Parameters:
        sv_name: display name of the storage volume.

    Returns:
        The volume's parameters.mount_path (the last match wins if several
        volumes share the name), "" when no volume matches, or "error" when
        the request fails or the reply cannot be read (the error is printed).
    """
    this_url = host + "/zen-data/v3/service_instances?addon_type=volumes"
    try:
        # verify=False: TLS certificate verification is disabled for this call
        r = requests.get(this_url,
                         headers=headers,
                         verify=False)
        instance_list = r.json()["service_instances"]
    except Exception:
        print("**ERROR**", " in get_sv_mount_point ")
        print("**ERROR**", sys.exc_info())
        # Return here; falling through would touch an unbound response object
        return "error"
    #
    # One variable for the result, initialised before the scan, so a
    # non-matching name yields "" instead of an unbound local.
    #
    ret_mtpt = ""
    for volume in instance_list:
        if volume["display_name"] == sv_name:
            ret_mtpt = volume["parameters"]["mount_path"]
    return ret_mtpt
#
# Get a list of files located at a specific directory path, for the 
# storage volume
#
def get_sv_dir(sv_name,dir_path=""):
    """List the files at a directory path of a storage volume.

    Parameters:
        sv_name: storage volume name (string).
        dir_path: target directory path to read (string). It is appended
            directly to the volume's mount point, with no separator added.

    Returns:
        The list produced by get_temp_dir_list() for the resulting path
        (which is [] when the path cannot be listed), or
        ["error", <exc_info>] when the preliminary REST call fails.
    """
    #
    # Make sure that storage volume is mounted
    #
    # NOTE(review): there is no "/" between "volume_services" and the volume
    # name in this URL - confirm against the Volumes API documentation.
    #
    this_url = host + "/zen-data/v1/volumes/directories/volume_services" + sv_name
    mydata = json.dumps({})
    try:
        # The reply body is not used, so it is deliberately not parsed.
        # verify=False: TLS certificate verification is disabled for this call
        requests.get(this_url,
                     headers=headers,
                     data=mydata,
                     verify=False)
    except Exception:
        print("**ERROR**", " in get_sv_dir ")
        print("**ERROR**", sys.exc_info())
        return ["error", sys.exc_info()]
    #
    file_path = get_sv_mount_point(sv_name) + dir_path
    return get_temp_dir_list(file_path)
#
# Split_filename returns the base filename (no directory path, no extension) from a full filename
#
def split_filename(filename):
    """Return the base name of filename: no directory part, nothing after the first dot.

    Only "/" is treated as a directory separator.
    """
    last_segment = filename.rpartition('/')[2]
    stem = last_segment.partition('.')[0]
    return stem
#
# Split_fileext returns the extension from a full filename
#
def split_fileext(filename):
    """Return the full (possibly compound) extension of filename.

    Extensions are peeled off one at a time with os.path.splitext and
    joined back together, so "archive.tar.gz" yields ".tar.gz".

    Parameters:
        filename: a file name, with or without a directory part.

    Returns:
        The extension including its leading dot, or "" when there is none
        (for example "noext" or a dot-file such as ".bashrc").
    """
    root, ext = os.path.splitext(filename)
    file_extension = ext
    #
    # Stop when splitext finds nothing more to peel off. Testing for a dot
    # anywhere in the root would never terminate when the remaining dot is
    # in a directory name ("/a.b/file") or leads a dot-file (".bashrc").
    #
    while ext:
        root, ext = os.path.splitext(root)
        file_extension = ext + file_extension
    return file_extension
#
# On a particular storage volume, at a particular path, back up a file
# by copying it to a new name with a string inserted before the file extension
# (e.g. with the string "_20220310", toxfile.txt becomes toxfile_20220310.txt)
#
def backup_sv_file(sv_name,filename,dir_path="",extension=""):
    """Back up a storage volume file by copying it to a suffixed name.

    The copy is named <base><extension><file extension>, where <base> and
    <file extension> come from split_filename() and split_fileext().

    Parameters:
        sv_name: storage volume name; its mount point is looked up with
            get_sv_mount_point().
        filename: name of the file to back up.
        dir_path: directory below the mount point. It is concatenated as-is,
            so a non-empty value must end with "/".
        extension: text inserted between the base name and the file
            extension; "BACKUP" is used when it is empty.

    Returns:
        Whatever copy_temp_file() returns, or [] if the copy raises
        (the error is printed).
    """
    #
    # if extension is defaulted, just add BACKUP
    #
    if not extension:
        extension = "BACKUP"
    #
    # Find your mount point, directory path, and filename
    #
    mount_pt = get_sv_mount_point(sv_name) + "/"
    file_base = split_filename(filename)
    file_ext = split_fileext(filename)
    new_name = mount_pt + dir_path + file_base + extension + file_ext
    old_name = mount_pt + dir_path + filename
    #
    print("Old file - " + old_name + "    - New Name - " + new_name)
    #
    try:
        stat = copy_temp_file(old_name, new_name)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("**ERROR**", " in backup_sv_file ")
        print("**ERROR**", sys.exc_info())
        stat = []
    return stat


In [None]:
#
# TEMP DATA AREA FILE MANIPULATION
#
# Every CloudPak for Data Python notebook is run in a virtual environment,
# and has its own limited virtual file space.  This space can be used to
# store temporary files and scratch files.
#
# Note that all of these routines are using standard Python utilities to
# do file manipulation
#
# Get a dir of the files on the temp space
#
def get_temp_dir_list(path="."):
    """Return the entries of a directory in the temp space.

    Parameters:
        path: directory to list; defaults to the current directory.

    Returns:
        The list from os.listdir(path), or [] when the directory cannot be
        listed (the error is printed).
    """
    try:
        return os.listdir(path)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("**ERROR**", " in get_temp_dir_list ")
        print("**ERROR**", sys.exc_info())
        return []
#
# Rename a file in the temp data area
#
def rename_temp_file(old_name, new_name):
    """Rename a file in the temp data area with os.rename.

    Parameters:
        old_name: current path of the file.
        new_name: new path for the file.

    Returns:
        None on success (the return value of os.rename), or [] when the
        rename fails (the error is printed).
    """
    try:
        stat = os.rename(old_name, new_name)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("**ERROR**", " in rename_temp_file ")
        print("**ERROR**", sys.exc_info())
        stat = []
    return stat
#
# Copy a file in the temp data area
#
def copy_temp_file(old_name, new_name):
    """Copy a file in the temp data area with shutil.copy.

    Parameters:
        old_name: path of the file to copy.
        new_name: path of the copy.

    Returns:
        The value returned by shutil.copy (the destination path) on
        success, or [] when the copy fails (the error is printed).
    """
    try:
        stat = shutil.copy(old_name, new_name)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("**ERROR**", " in copy_temp_file ")
        print("**ERROR**", sys.exc_info())
        stat = []
    return stat
#
# Create a CSV file from a panda dataframe in a temp data file
# name = name of the resulting csv file (with no source directory)
#
def write_pandas_df_to_temp_csv_file(name,dataframe):
    """Write a pandas dataframe to a CSV file in the temp data area.

    Parameters:
        name: path of the resulting csv file.
        dataframe: dataframe to write; the index is not written.

    Returns:
        The value returned by dataframe.to_csv(name, index=False) on
        success, or [] when the write fails (the error is printed).
    """
    try:
        stat = dataframe.to_csv(name, index=False)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate
        print("**ERROR**", " in write_pandas_df_to_temp_csv_file ")
        print("**ERROR**", sys.exc_info())
        stat = []
    return stat

## Test and show sample code for the File manipulation routines

In [None]:
#
#  PROJECT AREA EXAMPLES
#
# Walks through the project file routines: list connections, list the
# project files, write a dataframe out as a CSV data asset, then copy it.
#
print ("\nPROJECT AREA EXAMPLES \n")
#
# List your connections
#
my_connections = get_project_connections()
print("\nMy connections:\n", my_connections)

#
# Find out where your data asset folder is located
#
data_asset_path = get_data_asset_path()
print("\nPath to data asset folder is:\n", data_asset_path)

#
# PROJECT DATA FILE EXAMPLES
#
# Get a directory of the files out on the project
# (my_dir is the full listing, reused below as sample data;
#  list_dir holds only the names)
#
my_dir = get_proj_dir_json()
#
list_dir = get_proj_dir_list()
print("\nOriginal List of Files is:\n")
print(list_dir)

#
# Write out a sample data file to the project
#
# These two names are also used by the temp area examples
#
SAMPLE_DATAFILE     = "DUMMY_FILE.csv"
NEW_SAMPLE_DATAFILE = "RENAMED_DUMMY_FILE.csv"
#
# Dump data from dataframe to csv file
# (the dataframe is built from the project file listing itself)
#
my_dir_df = pd.DataFrame(data=my_dir)
data_file_info = write_pandas_df_to_proj_csv_file(SAMPLE_DATAFILE,my_dir_df)
print ("\nCreate new DataFile Info:\n")
print (data_file_info)
#
list_dir = get_proj_dir_list()
print("\nNew List of Files is:  (should include " + SAMPLE_DATAFILE + ") \n")
print(list_dir)

#
# Copy the data file that you just created to a new name
# (copy_project_file copies and registers; the original file is kept)
#
print("\nCopying " + SAMPLE_DATAFILE + " to " + NEW_SAMPLE_DATAFILE + " \n")
stat = copy_project_file(SAMPLE_DATAFILE,NEW_SAMPLE_DATAFILE)
#
list_dir = get_proj_dir_list()
print("\nNew List of Files is:\n")
print(list_dir)


In [None]:
#
#  TEMP AREA EXAMPLES
#
# Uses SAMPLE_DATAFILE, NEW_SAMPLE_DATAFILE and my_dir_df, which are
# defined by the project area examples - run that cell first.
#
print ("\nTEMP AREA EXAMPLES \n\n")
#
# List the temp space (current directory) before writing anything
#
temp_dir = get_temp_dir_list()
print ("\nTemp dir is:\n")
print (temp_dir)
#
# Write out a sample data file to the temp space
#
data_file_info = write_pandas_df_to_temp_csv_file(SAMPLE_DATAFILE,my_dir_df)
print ("\n\nDataFile Info:\n")
print (data_file_info)
#
# List again - the new file should now appear
#
temp_dir = get_temp_dir_list()
print ("\nTemp dir is:\n")
print (temp_dir)
#
# Rename the data file that you just created
#
# Note: when doing these manipulations, you WILL OVERWRITE existing files
#
stat = rename_temp_file(SAMPLE_DATAFILE,NEW_SAMPLE_DATAFILE)
#
# List again - the file should now appear under its new name
#
temp_dir = get_temp_dir_list()
print ("\nTemp dir is:\n")
print (temp_dir)



#
# Delete the file that you just created
#

#
# List all of the connected data
#


In [None]:
#
# If you need to do RAW CURL calls then uncomment the entire block of code below
#
#import subprocess
#SPACE = " "
###
### =============================================
###
###
### Build your curl call
###
#curl_head = "curl -k -X GET"
#url = "https://cpd-cpd-instance.apps.poc.watson.techops.aa.com/zen-data/v3/service_instances?addon_type=volumes"
#header_line = "-H 'Authorization: Bearer " + token + "' -H 'Content-Type: application/json'"
#data_line = ""
#curl_cmd = curl_head + SPACE + url + SPACE + header_line + SPACE + data_line
#print ("===\n",curl_cmd,"\n===\n")
#result = subprocess.run(curl_cmd, shell=True, capture_output=True)
#result2 = result.stdout.decode()
#print (result2)


In [None]:
# ==========================================
#
#  STORAGE VOLUME EXAMPLES
#
print ("\nSTORAGE VOLUME EXAMPLES \n\n")
#
# NOTE: the from-import on the last line rebinds the names `datetime` and
# `time`. After it runs they refer to the datetime.datetime and
# datetime.time classes, not to the modules imported on the two lines
# before it.
#
import time
import datetime
from datetime import datetime, date, time, timezone
###
### =============================================
###
def today_str():
    """Return the current local date as a "-DD-MM-YYYY" suffix string."""
    return "-" + datetime.now().strftime("%d-%m-%Y")
### =============================================
###
#
# Get the list of available storage volumes
#
print ("\nStorage volume list is: ")
stor_vol_list = get_storage_volume_list()
print (stor_vol_list)
#
# Work with the first volume in the list
# (raises IndexError if no storage volume was returned)
#
inst_name = stor_vol_list[0]
print("Focused on instance "+inst_name)
#
# Get a directory - default dir_path (the volume's mount point)
#
print ("\nDirectory list is: ")
dir_list = get_sv_dir(inst_name)
print (dir_list)
#
# Get a directory - dir_path "/."
#
print ("\nDirectory list is: ")
dir_list = get_sv_dir(inst_name,"/.")
print (dir_list)
#
# Get a directory - deliberately bad dir_path, to show the error handling
#
print ("\nDirectory list (for bad directory path) is: ")
dir_list = get_sv_dir(inst_name,"some_wrong_dir")
print (dir_list)
#
# Get a directory - explicit empty dir_path (same as the default)
#
print ("\nDirectory list is: ")
dir_list = get_sv_dir(inst_name,"")
print (dir_list)
#
# Backup/Copy the first file
# (raises IndexError if the directory listing is empty)
#
target_file = dir_list[0]
#
# Backup a file
#
print ("\n\nBacking up file " + target_file)
#
# Calculate your suffix for backup file extension
#
datestamp = today_str()
#
# Backup file
#
stat = backup_sv_file(inst_name,target_file,"",datestamp)
#
# Get a directory - the backup copy should now be listed
#
print ("\nDirectory list is: ")
dir_list = get_sv_dir(inst_name,"")
print (dir_list)
