# General imports

In [1]:
import os
import numpy as np
import gc
import requests
from datetime import datetime

In [2]:
# Look up the geolocation of the runtime's public IP so that each profiling
# run can be tagged with the region it was executed from.
# HTTPS protects the lookup in transit; the timeout keeps the cell from
# hanging indefinitely if the service does not answer.
ipinfo = requests.get('https://ipinfo.io/json', timeout=10)
ipinfo.raise_for_status()  # fail loudly instead of parsing an error page
ipinfo_fields = ipinfo.json()  # decode the body once rather than once per field
region = ipinfo_fields['country'] + ': ' + ipinfo_fields['region']

In [3]:
print(region)

US: South Carolina


In [4]:
timestr = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

In [5]:
!pip install line_profiler

Collecting line_profiler
[?25l  Downloading https://files.pythonhosted.org/packages/d8/cc/4237472dd5c9a1a4079a89df7ba3d2924eed2696d68b91886743c728a9df/line_profiler-3.0.2-cp36-cp36m-manylinux2010_x86_64.whl (68kB)
[K     |████▊                           | 10kB 14.8MB/s eta 0:00:01[K     |█████████▌                      | 20kB 1.8MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 2.3MB/s eta 0:00:01[K     |███████████████████             | 40kB 2.6MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 2.1MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 2.3MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 1.9MB/s 
Installing collected packages: line-profiler
Successfully installed line-profiler-3.0.2


In [6]:
%load_ext line_profiler

# Profiling

## Google Drive

In [7]:
from google.colab import drive

In [8]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Create a directory to store the profiling output.

In [11]:
# Directory on the mounted Drive that receives the profiler printouts.
profiling_output_dir_path = '/content/gdrive/My Drive/ML_data/IO_profiling_output'
# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(profiling_output_dir_path, exist_ok=True)

In [12]:
# Work from inside the output directory so that relative file names used
# from here on are created there.
os.chdir(profiling_output_dir_path)
# Store where and when this run happened next to the measurements.
for metadata_name, metadata_text in (('region.txt', region), ('time.txt', timestr)):
  with open(metadata_name, 'w') as metadata_file:
    metadata_file.write(metadata_text)

Start profiling

In [13]:
%%timeit
retry = True
while retry:
  retry=False
  try:
    next(os.walk('/content/gdrive/My Drive/ML_data/snakes/valid/venomous'))
  except StopIteration:
    print('Exception Raised. Retry')
    retry = True

The slowest run took 1989.06 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 9.95 ms per loop


In [14]:
def test_from_gdrive(N, offset=0):
  """Read N files from the mounted Google Drive folder, starting at index `offset`.

  The directory is listed first; entries offset .. offset+N-1 of that listing
  are then opened in binary mode and read completely. Nothing is returned:
  the function exists only to be timed line by line.

  NOTE(review): the analysis cells search the saved profiler printouts for
  the literal text of some statements below, so keep the statement text
  unchanged unless those cells are updated too.
  """
  images = os.listdir('/content/gdrive/My Drive/ML_data/snakes/valid/venomous')
  for i in range(offset,offset+N):
    filename = images[i]  # IndexError if the listing has fewer than offset+N entries
    path = '/content/gdrive/My Drive/ML_data/snakes/valid/venomous/' + filename
    with open(path, 'rb') as f:
      # Determine the size by seeking to the end, then rewind and read that many bytes.
      f.seek(0,2)
      length_of_file = f.tell()
      f.seek(0,0)
      content = f.read(length_of_file)

In [15]:
# Profile twelve consecutive batches of 50 files (offsets 0, 50, ..., 550) and
# save each line_profiler printout to its own text file.
# run_line_magic('lprun', ...) is the programmatic form of the %lprun line
# magic; it lets one loop replace twelve hand-copied lines.
for batch_offset in range(0, 551, 50):
  get_ipython().run_line_magic(
      'lprun',
      '-T test_from_gdrive_{0}_t0.csv -f test_from_gdrive test_from_gdrive(50,offset={0})'.format(batch_offset)
  )


*** Profile printout saved to text file 'test_from_gdrive_0_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_50_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_100_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_150_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_200_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_250_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_300_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_350_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_400_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_450_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_500_t0.csv'. 

*** Profile printout saved to text file 'test_from_gdrive_550_t0.csv'. 


## Google Cloud Storage API

In [10]:
from google.colab import auth
auth.authenticate_user()

In [16]:
from google.cloud import storage

In [17]:
client = storage.Client(project='ml-self-learning-project')

In [18]:
bucket = client.get_bucket('ml_datasets_checco_1')

In [19]:
# Local directories under /content/gcs-api for the two snake classes.
# exist_ok=True keeps the cell re-runnable without a separate existence check.
for class_dir_name in ('venomous', 'non_venomous'):
  os.makedirs('/content/gcs-api/snakes/valid/' + class_dir_name, exist_ok=True)

In [20]:
def test_from_storage_api(N, offset=0):
  """Download N blobs from the bucket to local disk, skipping the first `offset`.

  The bucket listing is iterated from its start on every call, using the
  module-level `client`. Blobs at 1-based positions offset+1 .. offset+N are
  saved under /content/gcs-api/. Nothing is returned: the function exists
  only to be timed line by line.

  NOTE(review): the analysis cells search the saved profiler printouts for
  the literal text of some statements below, so keep the statement text
  unchanged unless those cells are updated too.
  """
  i=0
  for b in client.list_blobs('ml_datasets_checco_1'):
    i += 1  # i is the 1-based position of blob b in the listing
    if i <= offset:
      continue
    if i >=N+offset+1:
      # N blobs past the offset have been handled; stop iterating.
      break
    # The local path keeps at most the last four '/'-separated parts of the blob name.
    filepath = '/content/gcs-api/'+ '/'.join(b.name.split('/')[-4:])
    b.download_to_filename(filepath)

In [21]:
# Profile twelve consecutive batches of 50 blobs (offsets 0, 50, ..., 550) and
# save each line_profiler printout to its own text file.
# run_line_magic('lprun', ...) is the programmatic form of the %lprun line
# magic; it lets one loop replace twelve hand-copied lines.
for batch_offset in range(0, 551, 50):
  get_ipython().run_line_magic(
      'lprun',
      '-T test_from_storage_api_{0}_t0.csv -f test_from_storage_api test_from_storage_api(50,offset={0})'.format(batch_offset)
  )


*** Profile printout saved to text file 'test_from_storage_api_0_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_50_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_100_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_150_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_200_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_250_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_300_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_350_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_400_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_450_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_500_t0.csv'. 

*** Profile printout saved to text file 'test_from_storage_api_550_t0.csv'. 


## GCSFUSE

In [None]:
from google.colab import auth
auth.authenticate_user()

In [22]:
# Register the gcsfuse apt source and the apt signing key fetched below,
# refresh the package index, then install gcsfuse.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100   653  100   653    0     0  14511      0 --:--:-- --:--:-- --:--:-- 14840
OK
36 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 36 not upgraded.
Need to get 4,278 kB of archives.
After this operation, 12.8 MB of additional disk space will be used.
Selecting previously unselected package gcsfuse.
(Reading database ... 144676 files and directories currently installed.)
Preparing to unpack .../gcsfuse_0.30.0_amd64.deb ...
Unpacking gcsfuse (0.30.0) ...
Setting up gcsfuse (0.30.0) ...


In [23]:
# exist_ok=True: re-running the cell must not fail with FileExistsError once
# the directory is already there.
os.makedirs('/content/bucket-data', exist_ok=True)

In [24]:
os.chdir('/content')

In [25]:
# Mount the bucket on the relative directory bucket-data (resolved against the
# current working directory).
!gcsfuse --implicit-dirs ml_datasets_checco_1 bucket-data
# Alternative invocation with explicit stat/type cache options, kept for reference:
#!gcsfuse --implicit-dirs --stat-cache-ttl 5h --type-cache-ttl 5h --stat-cache-capacity 65536 ml_datasets_checco_1 bucket-data

Using mount point: /content/bucket-data
Opening GCS connection...
Opening bucket...
Mounting file system...
File system has been successfully mounted.


In [26]:
os.chdir(profiling_output_dir_path)

In [27]:
%%timeit
retry = True
while retry:
  retry=False
  try:
    next(os.walk('/content/bucket-data/aicrowd-blitz-challenge/snakes/valid/venomous'))
  except StopIteration:
    print('Exception Raised. Retry')
    retry = True

1 loop, best of 3: 3.56 s per loop


In [28]:
def test_from_gcsfuse(N, offset=0):
  """Read N files from the /content/bucket-data tree, starting at index `offset`.

  The directory is listed first; entries offset .. offset+N-1 of that listing
  are then opened in binary mode and read completely. Nothing is returned:
  the function exists only to be timed line by line.

  NOTE(review): the analysis cells search the saved profiler printouts for
  the literal text of some statements below, so keep the statement text
  unchanged unless those cells are updated too.
  """
  images = os.listdir('/content/bucket-data/aicrowd-blitz-challenge/snakes/valid/venomous')
  for i in range(offset,offset+N):
    filename = images[i]  # IndexError if the listing has fewer than offset+N entries
    path = '/content/bucket-data/aicrowd-blitz-challenge/snakes/valid/venomous/' + filename
    with open(path, 'rb') as f:
      # Determine the size by seeking to the end, then rewind and read that many bytes.
      f.seek(0,2)
      length_of_file = f.tell()
      f.seek(0,0)
      content = f.read(length_of_file)

In [None]:
# Profile twelve consecutive batches of 50 files (offsets 0, 50, ..., 550) and
# save each line_profiler printout to its own text file.
# run_line_magic('lprun', ...) is the programmatic form of the %lprun line
# magic; it lets one loop replace twelve hand-copied lines.
for batch_offset in range(0, 551, 50):
  get_ipython().run_line_magic(
      'lprun',
      '-T test_from_gcsfuse_{0}_t0.csv -f test_from_gcsfuse test_from_gcsfuse(50,offset={0})'.format(batch_offset)
  )


*** Profile printout saved to text file 'test_from_gcsfuse_0_t0.csv'. 


# Test reading a second time

In [None]:
gc.collect()

In [None]:
# Allocates a 40000 x 35000 float64 array (40000 * 35000 * 8 bytes = 11.2 GB).
# NOTE(review): presumably meant to push previously read file data out of
# memory caches before the second read pass — confirm.
zz = np.ones(shape=(40000,35000), dtype=np.float64)

In [None]:
del zz
gc.collect()

In [None]:
# Second pass over the same twelve batches of 50 files (offsets 0, 50, ..., 550);
# printouts go to the *_t1.csv files.
# run_line_magic('lprun', ...) is the programmatic form of the %lprun line
# magic; it lets one loop replace twelve hand-copied lines.
for batch_offset in range(0, 551, 50):
  get_ipython().run_line_magic(
      'lprun',
      '-T test_from_gdrive_{0}_t1.csv -f test_from_gdrive test_from_gdrive(50,offset={0})'.format(batch_offset)
  )

In [None]:
def read_from_storage_api(N, offset=0):
  """Read N local files under /content/gcs-api/, skipping the first `offset` blobs.

  The bucket listing is iterated from its start on every call, using the
  module-level `client`. For blobs at 1-based positions offset+1 .. offset+N
  the local file whose path is derived from the blob name (the same
  derivation test_from_storage_api uses for its output path) is opened in
  binary mode and read completely. Nothing is returned: the function exists
  only to be timed line by line.

  NOTE(review): the analysis cells search the saved profiler printouts for
  the literal text of some statements below, so keep the statement text
  unchanged unless those cells are updated too.
  """
  i=0
  for b in client.list_blobs('ml_datasets_checco_1'):
    i += 1  # i is the 1-based position of blob b in the listing
    if i <= offset:
      continue
    if i >=N+offset+1:
      # N blobs past the offset have been handled; stop iterating.
      break
    # The local path keeps at most the last four '/'-separated parts of the blob name.
    filepath = '/content/gcs-api/'+ '/'.join(b.name.split('/')[-4:])
    with open(filepath, 'rb') as f:
      # Determine the size by seeking to the end, then rewind and read that many bytes.
      f.seek(0,2)
      length_of_file = f.tell()
      f.seek(0,0)
      content = f.read(length_of_file)

In [None]:
# Second pass for the storage-API strategy: profile reading twelve batches of
# 50 files (offsets 0, 50, ..., 550); printouts go to the *_t1.csv files.
# run_line_magic('lprun', ...) is the programmatic form of the %lprun line
# magic; it lets one loop replace twelve hand-copied lines.
for batch_offset in range(0, 551, 50):
  get_ipython().run_line_magic(
      'lprun',
      '-T test_from_storage_api_{0}_t1.csv -f read_from_storage_api read_from_storage_api(50,offset={0})'.format(batch_offset)
  )

In [None]:
# Second pass over the same twelve batches of 50 files (offsets 0, 50, ..., 550);
# printouts go to the *_t1.csv files.
# run_line_magic('lprun', ...) is the programmatic form of the %lprun line
# magic; it lets one loop replace twelve hand-copied lines.
for batch_offset in range(0, 551, 50):
  get_ipython().run_line_magic(
      'lprun',
      '-T test_from_gcsfuse_{0}_t1.csv -f test_from_gcsfuse test_from_gcsfuse(50,offset={0})'.format(batch_offset)
  )

# Analysis

In [None]:
!ls "/content/gdrive/My Drive/ML_data/IO_profiling_output"

In [None]:
!pip install -U seaborn

In [None]:
import seaborn as sns

In [None]:
import pandas as pd

In [None]:
def parse(strategy='gdrive',
          base_dir='/content/gdrive/My Drive/ML_data/',
          folder_names=(
              'IO_profiling_output',
              'IO_profiling_output_fifth',
              'IO_profiling_output_fourth',
              'IO_profiling_output_first',
              'IO_profiling_output_second_gpu',
              'IO_profiling_output_third_gpu',
          ),
          offsets=range(0, 551, 50)):
  """Collect first-read timings from saved line_profiler printouts.

  For every folder in `folder_names` and every offset in `offsets`, the file
  ``<base_dir>/<folder>/test_from_<strategy>_<offset>_t0.csv`` is read and
  three numbers are extracted from it.

  Args:
    strategy: middle part of the file name, e.g. 'gdrive' or 'gcsfuse'.
    base_dir: directory that contains the run folders.
    folder_names: run folders to scan; a folder whose name contains 'gpu'
      is labelled 'gpu', any other 'cpu'.
    offsets: batch offsets that appear in the file names.

  Returns:
    Four parallel lists ``(file_opening, file_reading, total, runtime_type)``
    with one entry per parsed file, in folder-then-offset order.

  Raises:
    IndexError: if a printout has no line containing an expected marker, or
      the matching line has too few whitespace-separated columns.
    ValueError: if the selected column is not a number.
    OSError: if a printout file cannot be opened.
  """
  def field(lines, marker, column, filename):
    # Value of the given whitespace-separated column on the first line that
    # contains `marker`; stops scanning at the first match.
    for line in lines:
      if marker in line:
        return float(line.split()[column])
    raise IndexError('no line containing %r in %s' % (marker, filename))

  file_opening = []
  file_reading = []
  total = []
  runtime_type = []
  for folder_name in folder_names:
    rt_type = 'gpu' if 'gpu' in folder_name else 'cpu'
    for offset in offsets:
      filename = os.path.join(
          base_dir, folder_name,
          'test_from_{}_{}_t0.csv'.format(strategy, offset))
      with open(filename) as f:
        lines = f.readlines()
      # Column 3 of a profiled source line; in line_profiler's standard table
      # layout that is the "Per Hit" column (NOTE(review): confirm for the
      # line_profiler version that produced the files).
      file_opening.append(field(lines, 'with open', 3, filename))
      file_reading.append(field(lines, 'content =', 3, filename))
      # "Total time: <seconds> s" -> third whitespace-separated token.
      total.append(field(lines, 'Total time', 2, filename))
      runtime_type.append(rt_type)
  return file_opening, file_reading, total, runtime_type

In [None]:
def parse_storage_api(base_dir='/content/gdrive/My Drive/ML_data/',
                      folder_names=(
                          'IO_profiling_output',
                          'IO_profiling_output_fifth',
                          'IO_profiling_output_fourth',
                          'IO_profiling_output_first',
                          'IO_profiling_output_second_gpu',
                          'IO_profiling_output_third_gpu',
                      ),
                      offsets=range(0, 551, 50)):
  """Collect first-read timings of the storage-API strategy from saved printouts.

  For every folder in `folder_names` and every offset in `offsets`, the file
  ``<base_dir>/<folder>/test_from_storage_api_<offset>_t0.csv`` is read. The
  line mentioning ``list_blobs`` supplies the "opening" value and the line
  mentioning ``download_to_filename`` supplies the "reading" value.

  Args:
    base_dir: directory that contains the run folders.
    folder_names: run folders to scan; a folder whose name contains 'gpu'
      is labelled 'gpu', any other 'cpu'.
    offsets: batch offsets that appear in the file names.

  Returns:
    Four parallel lists ``(file_opening, file_reading, total, runtime_type)``
    with one entry per parsed file, in folder-then-offset order.

  Raises:
    IndexError: if a printout has no line containing an expected marker, or
      the matching line has too few whitespace-separated columns.
    ValueError: if the selected column is not a number.
    OSError: if a printout file cannot be opened.
  """
  def field(lines, marker, column, filename):
    # Value of the given whitespace-separated column on the first line that
    # contains `marker`; stops scanning at the first match.
    for line in lines:
      if marker in line:
        return float(line.split()[column])
    raise IndexError('no line containing %r in %s' % (marker, filename))

  file_opening = []
  file_reading = []
  total = []
  runtime_type = []
  for folder_name in folder_names:
    rt_type = 'gpu' if 'gpu' in folder_name else 'cpu'
    for offset in offsets:
      filename = os.path.join(
          base_dir, folder_name,
          'test_from_storage_api_{}_t0.csv'.format(offset))
      with open(filename) as f:
        lines = f.readlines()
      # Column 3 of a profiled source line; in line_profiler's standard table
      # layout that is the "Per Hit" column (NOTE(review): confirm for the
      # line_profiler version that produced the files).
      file_opening.append(field(lines, 'list_blobs', 3, filename))
      file_reading.append(field(lines, 'download_to_filename', 3, filename))
      # "Total time: <seconds> s" -> third whitespace-separated token.
      total.append(field(lines, 'Total time', 2, filename))
      runtime_type.append(rt_type)
  return file_opening, file_reading, total, runtime_type

In [None]:
# Parse the first-read profiler printouts, one storage strategy at a time.
gdrive_opening, gdrive_reading, gdrive_total, gdrive_rt = parse('gdrive')
fuse_opening, fuse_reading, fuse_total, fuse_rt = parse('gcsfuse')
api_opening, api_reading, api_total, api_rt = parse_storage_api()

# One strategy label per measurement, in the same order in which the
# measurement lists are concatenated below.
strategy = (
    ['gdrive'] * len(gdrive_opening)
    + ['gcsfuse'] * len(fuse_opening)
    + ['api'] * len(api_opening)
)

# Long-format table: one row per profiled batch.
data = pd.DataFrame({
    'strategy': strategy,
    'opening': gdrive_opening + fuse_opening + api_opening,
    'reading': gdrive_reading + fuse_reading + api_reading,
    'total': gdrive_total + fuse_total + api_total,
    'runtime_type': gdrive_rt + fuse_rt + api_rt,
})

In [None]:
sns.histplot(data, x='opening', bins=50, hue='strategy')

In [None]:
sns.histplot(data, x='reading', bins=50, hue='strategy')

In [None]:
sns.histplot(data, x='total', bins=50, hue='strategy')

In [None]:
sns.boxplot(data=data, x='strategy', y='total')

In [None]:
# Combined label (strategy name immediately followed by the runtime type) so
# the box plot is split by both at once.
data['strategy+rt'] = data['strategy'] + data['runtime_type']
sns.boxplot(x='strategy+rt', y='total', data=data)

In [None]:
def parse_second_time(strategy='gdrive',
                      base_dir='/content/gdrive/My Drive/ML_data/',
                      folder_names=(
                          'IO_profiling_output_fourth',
                          'IO_profiling_output_first',
                          'IO_profiling_output_second_gpu',
                          'IO_profiling_output_third_gpu',
                      ),
                      offsets=range(0, 551, 50)):
  """Collect second-read timings from saved line_profiler printouts.

  For every folder in `folder_names` and every offset in `offsets`, the file
  ``<base_dir>/<folder>/test_from_<strategy>_<offset>_t1.csv`` is read and
  three numbers are extracted from it.

  Args:
    strategy: middle part of the file name, e.g. 'gdrive', 'gcsfuse' or
      'storage_api'.
    base_dir: directory that contains the run folders.
    folder_names: run folders to scan.
    offsets: batch offsets that appear in the file names.

  Returns:
    Three parallel lists ``(file_opening, file_reading, total)`` with one
    entry per parsed file, in folder-then-offset order.

  Raises:
    IndexError: if a printout has no line containing an expected marker, or
      the matching line has too few whitespace-separated columns.
    ValueError: if the selected column is not a number.
    OSError: if a printout file cannot be opened.
  """
  def field(lines, marker, column, filename):
    # Value of the given whitespace-separated column on the first line that
    # contains `marker`; stops scanning at the first match.
    for line in lines:
      if marker in line:
        return float(line.split()[column])
    raise IndexError('no line containing %r in %s' % (marker, filename))

  file_opening = []
  file_reading = []
  total = []
  for folder_name in folder_names:
    for offset in offsets:
      filename = os.path.join(
          base_dir, folder_name,
          'test_from_{}_{}_t1.csv'.format(strategy, offset))
      with open(filename) as f:
        lines = f.readlines()
      # Column 3 of a profiled source line; in line_profiler's standard table
      # layout that is the "Per Hit" column (NOTE(review): confirm for the
      # line_profiler version that produced the files).
      file_opening.append(field(lines, 'with open', 3, filename))
      file_reading.append(field(lines, 'content =', 3, filename))
      # "Total time: <seconds> s" -> third whitespace-separated token.
      total.append(field(lines, 'Total time', 2, filename))
  return file_opening, file_reading, total

In [None]:
# Parse the second-read profiler printouts, one storage strategy at a time.
gdrive_opening, gdrive_reading, gdrive_total = parse_second_time('gdrive')
fuse_opening, fuse_reading, fuse_total = parse_second_time('gcsfuse')
api_opening, api_reading, api_total = parse_second_time('storage_api')

# One strategy label per measurement, in the same order in which the
# measurement lists are concatenated below.
strategy = (
    ['gdrive'] * len(gdrive_opening)
    + ['gcsfuse'] * len(fuse_opening)
    + ['api'] * len(api_opening)
)

# Long-format table: one row per profiled batch.
data_second_time = pd.DataFrame({
    'strategy': strategy,
    'opening': gdrive_opening + fuse_opening + api_opening,
    'reading': gdrive_reading + fuse_reading + api_reading,
    'total': gdrive_total + fuse_total + api_total,
})

In [None]:
sns.histplot(data_second_time, x='opening', bins=50, hue='strategy')

In [None]:
sns.histplot(data_second_time, x='reading', bins=50, hue='strategy')

In [None]:
sns.histplot(data_second_time, x='total', bins=50, hue='strategy')

In [None]:
!ls "gdrive/My Drive/ML_data"