In [1]:
# default_exp repo_management.DLMusic_Data

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# export
from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
import os

from ml.local_repo_path import local_repo_path
import ml.data_manage.gdrive_interact as gdrive

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=884310440114-oqhbrdkc3vikjmr3nvnrkb0ptr7lvp8r.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [21]:
local_repo_path

'/home/sfronczak/seanczak/ml/'

# Repo Management

While I don't want to track large data files with git (also some I'd like to keep private), I still want to make use of the cloud to store my files in the case that something happens to my local machine. Thus, here I outline the ability to shuttle files between my google drive and this repo (first build solution, we'll see if it lasts).

# Accessing Google drive
Using pydrive https://pythonhosted.org/PyDrive/quickstart.html, I came up with the following code.

# General utils and conventions
Need to go to googles API Console (see link above) and download the `client_secrets.json` and put it in this directory (perhaps also in the ml module directory). I think this only needs to be done once

## Prepping connection

In [22]:
#export
gauth = GoogleAuth()
drive = GoogleDrive(gauth)
gauth.LocalWebserverAuth() # Creates local webserver and auto handles authentication.

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=884310440114-oqhbrdkc3vikjmr3nvnrkb0ptr7lvp8r.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


## Encoding google file types
These are super long and not always intuitive so I'll store them in a dict that will make them more readable

In [23]:
# export
gtypes = {
    'folder' : 'application/vnd.google-apps.folder'
}

In [24]:
gtypes['folder']

'application/vnd.google-apps.folder'

## Grabbing root id

In [25]:
# export
def get_root_remote_id(folderName = 'ml_repo_data', gtypes=gtypes):
    # query google drive
    folders = drive.ListFile(
        {'q': f"title='{folderName}' and mimeType='{gtypes['folder']}' and trashed=false"}).GetList()
    folder = folders[0] # the above returns a list
    return folder['id']

In [26]:
root_id = get_root_remote_id()
root_id[:5] # not going to print all 33 chars

'1zAxv'

## Grabbing folder id
Argument is for the id of that above it in the tree (the `parent` id)

In [27]:
# export
def get_folder_id(parent_id, foldername):
    # grab the folder
    ftype = gtypes['folder'] # unfortunately if I don't do this Jupyter freaks out with indentations/coloration
    folders = drive.ListFile(
        {'q': f"title='{foldername}' and mimeType='{ftype}' and '{parent_id}' in parents and trashed=false"}).GetList()
    folder = folders[0] # the above returns a list
    return folder['id']

In [28]:
DLM_id = get_folder_id(parent_id = root_id, foldername = 'DL_music')
DLM_id[:5] # not going to print all 33 chars

'1QbKZ'

## Grabbing folder contents

In [29]:
# export
def grab_folder_contents(parent_id):
    '''Return a list of all the items in a folder based on its parent id'''
    file_list = drive.ListFile({'q': f"'{parent_id}' in parents and trashed=false"}).GetList()
    return file_list

In [30]:
file_list = grab_folder_contents(DLM_id) 
# it returns a list
file = file_list[1]
# each file is a dictionary of information
file.keys()

dict_keys(['kind', 'id', 'etag', 'selfLink', 'webContentLink', 'alternateLink', 'embedLink', 'iconLink', 'title', 'mimeType', 'labels', 'copyRequiresWriterPermission', 'createdDate', 'modifiedDate', 'modifiedByMeDate', 'lastViewedByMeDate', 'markedViewedByMeDate', 'version', 'parents', 'downloadUrl', 'userPermission', 'originalFilename', 'fileExtension', 'md5Checksum', 'fileSize', 'quotaBytesUsed', 'ownerNames', 'owners', 'lastModifyingUserName', 'lastModifyingUser', 'capabilities', 'editable', 'copyable', 'writersCanShare', 'shared', 'explicitlyTrashed', 'appDataContents', 'headRevisionId', 'spaces'])

## check if file exists remote by name and parent

In [74]:
# export
def check_file_exists_remote(parent_id, fname):
    file_list = grab_folder_contents(parent_id)
    for file in file_list:
        if file['title'] == fname : return True
        continue
    return False

In [75]:
parent_id = file['parents'][0]['id']
fname = file['title']
check_file_exists_remote(parent_id, fname)

True

## Grabbing file id

In [69]:
# export
def get_file_id(parent_id, fname):
    # grab the folder
    ftype = gtypes['folder'] # unfortunately if I don't do this Jupyter freaks out with indentations/coloration
    file_list = drive.ListFile(
        {'q': f"title='{fname}' and '{parent_id}' in parents and trashed=false"}).GetList()
    file = file_list[0] # the above returns a list
    return file['id']

In [73]:
file_id = get_file_id(parent_id, fname)
file_id[:5]

'1Qm_e'

## downloading files
Everything draws from the pydrives "file" object which can be initiated with the file's remote id. Downloading it from there is simple

In [53]:
# export
def download_file(file_id, local_dpath = None):
    # Create GoogleDriveFile instance with file id of file1.
    file = drive.CreateFile({'id': item['id']})
    local_dpath = './' if local_dpath is None else local_repo_path + local_dpath
    local_fpath = local_dpath + file['title']
    file.GetContentFile(local_fpath)
    return local_fpath

In [54]:
local_dpath = 'data/DeepLearn_Music/'
file_id = item['id']

local_fpath = download_file(file_id, local_dpath)
local_fpath

'/home/sfronczak/seanczak/ml/data/DeepLearn_Music/sd.mp3'

## uploading new file

In [70]:
# export
def upload_new_file(local_fpath, fname, parent_id):
    file = drive.CreateFile({'parents': [{'id': f'{parent_id}'}]})
    file['title'] = fname
    file.SetContentFile(local_fpath)
    file.Upload()
    return

In [64]:
upload_new_file(local_fpath, item['title'], item['parents'][0]['id'])

GoogleDriveFile({'parents': [{'id': '1QbKZKPxfPPnLzxL6NcLmZO0mOmzm_wCL'}]})


## updating existing file

In [65]:
# export
def update_existing_file(local_fpath, file_id):
    file = drive.CreateFile({'id': item['id']})
    file.SetContentFile(local_fpath)
    file.Upload()
    return

In [66]:
update_existing_file(local_fpath, item['id'])

## Sync a file to remote
Regardless of it exists or not (it will check)

In [71]:
# export
def sync_file_to_remote(local_fpath, fname, parent_id):
    '''will check if file exists remote first then will upload/update
    accordingly'''
    file_exists_remote = check_file_exists_remote(parent_id, fname)
    # update if its already there
    if file_exists_remote:
        file_id = get_file_id(parent_id, fname)
        update_existing_file(local_fpath, file_id)
    # upload a new one else
    else:
        upload_new_file(local_fpath, fname, parent_id)
    return

In [72]:
sync_file_to_remote(local_fpath, item['title'], item['parents'][0]['id'])

# DeepLearn_music Utils
Using some of the utils above but defining the tree structure here

In [32]:
class DeepLearnMusic_Syncer():
    def __init__(self):
        self.DLM_folderName = 'DL_music'
        self.DLM_id = self.get_DLM_remote_id()
        return
    
    def get_DLM_remote_id(self):
        # get the root_id in the drive
        root_id = get_root_remote_id()
        # grab the folder
        ftype = gtypes['folder'] # unfortunately if I don't do this Jupyter freaks out with indentations
        folders = drive.ListFile(
            {'q': f"title='{self.DLM_folderName}' and mimeType='{ftype}' and '{root_id}' in parents and trashed=false"}).GetList()
        folder = folders[0] # the above returns a list
        return folder['id']

a = DeepLearnMusic_Syncer()
a.DLM_id

'1QbKZKPxfPPnLzxL6NcLmZO0mOmzm_wCL'

In [None]:
file_list = drive.ListFile({'q': "'1zAxvLd2KQiOcZXma3nMaeUehQ_JiSJFy' in parents and trashed=false"}).GetList()
for file in file_list:
    print('Title: %s, ID: %s' % (file['title'], file['id']))
    # Get the folder ID that you want
    if(file['title'] == "To Share"):
        fileID = file['id']

In [33]:
def grab_file_ids(parent_id):
    file_list = drive.ListFile({'q': f"'{parent_id}' in parents and trashed=false"}).GetList()
    ids = {}
    for file in file_list:
        ids[file['title']] = file['id']
    return ids

In [34]:
grab_file_ids(a.DLM_id)

{'sd.wav': '1fwWtr9TwPIz-yJNhtdY13pxlaZHGsjUW',
 'sd.mp3': '1Qm_e7UXzdLv1ipqwvcFvcyIiLURZj2o8'}

# Tutorial

In [22]:
# this will prompt you to log in to google
import ml.repo_management.drive as drive

drive.get_root_remote_id()

'1zAxvLd2KQiOcZXma3nMaeUehQ_JiSJFy'

In [20]:
fileList = drive.ListFile({'q': "'DS_resources' in parents and trashed=false"}).GetList()
for file in fileList:
    print('Title: %s, ID: %s' % (file['title'], file['id']))
    # Get the folder ID that you want
    if(file['title'] == "To Share"):
        fileID = file['id']

HttpError: <HttpError 404 when requesting https://www.googleapis.com/drive/v2/files?q=%27DS_resources%27+in+parents+and+trashed%3Dfalse&maxResults=1000&alt=json returned "File not found:". Details: "File not found: ">

In [25]:
file_list = drive.ListFile({'q': "'1zAxvLd2KQiOcZXma3nMaeUehQ_JiSJFy' in parents and trashed=false"}).GetList()
for file in file_list:
    print('Title: %s, ID: %s' % (file['title'], file['id']))
    # Get the folder ID that you want
    if(file['title'] == "To Share"):
        fileID = file['id']

Title: a, ID: 1KBw2Pl0KUNr-mSIpAbgA269GJzfI_8-Wf1gZgtIb0rM


In [26]:
fileID

NameError: name 'fileID' is not defined