In [48]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install -U scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-0.19.1-cp35-cp35m-manylinux1_x86_64.whl
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.19.1
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


In [1]:
import boto3
import botocore
# S3 bucket that is listed for source images and receives the processed uploads.
BUCKET_NAME = "nanonets-platform-task"


In [2]:
def get_files():
    """List the source images stored in ``BUCKET_NAME``.

    Walks every page of the bucket listing and builds one record per object
    whose key has the form ``<category>/<name>``. Objects under the
    ``processed/`` prefix are skipped because that is where this notebook
    writes its own output. Keys without a ``/`` are skipped as well, since
    they belong to no category.

    Returns:
        list[dict]: one dict per object with the keys ``category`` (text
        before the first ``/``), ``name`` (text after it), ``s3key`` (the
        full key) and ``format`` (``'jpg'``, ``'png'`` or ``'unknown'``,
        decided from the key's suffix only).
    """
    s3 = boto3.client('s3', 'us-west-2')
    paginator = s3.get_paginator('list_objects')
    images = []
    for page in paginator.paginate(Bucket=BUCKET_NAME):
        # A listing page has no 'Contents' entry when it holds no objects
        # (e.g. an empty bucket), so fall back to an empty list.
        for obj in page.get('Contents', []):
            key = obj['Key']
            category, sep, filename = key.partition('/')
            if not sep:
                # Key sits at the bucket root: no category to assign.
                continue
            if category == 'processed':  # this is where we store processed images.
                continue
            if key.endswith('jpg'):
                image_format = 'jpg'
            elif key.endswith('png'):
                image_format = 'png'
            else:
                image_format = 'unknown'
            images.append({'category': category, 'name': filename,
                           's3key': key, 'format': image_format})
    return images
    

In [3]:
# Fetch the bucket listing once; the following cells work from this list.
images = get_files()

In [4]:
# Distinct category names present in the listing.
categories = set(entry['category'] for entry in images)
categories

{'daisy', 'dandelion', 'roses', 'sunflowers'}

In [101]:
# Number of records returned by get_files().
len(images)

2598

In [102]:
# Records whose key suffix was neither 'jpg' nor 'png'.
[record for record in images if record['format'] == 'unknown']

[{'category': 'daisy',
  'format': 'unknown',
  'name': 'LICENSE.txt',
  's3key': 'daisy/LICENSE.txt'}]

In [19]:
import pandas as pd

# One row per listed object; the columns are the dict keys built in get_files().
images_df = pd.DataFrame(images)
# NOTE(review): the preview is stored in `pp` rather than displayed, and `pp`
# is not referenced in the cells visible here.
pp = images_df.head()

In [103]:
# Drop unknown file types. `.copy()` makes the result an independent frame, so
# the later column assignments (`rawdata`, `resized`) write to it directly
# instead of to a slice of the original frame, which pandas flags with
# SettingWithCopyWarning.
images_df = images_df[images_df['format'] != 'unknown'].copy()

In [104]:
# Shape after dropping unknown file types.
# NOTE(review): the recorded 6 columns presumably include `rawdata` and
# `resized`, i.e. this cell was executed after the download/transform cells
# further down — confirm.
images_df.shape

(2597, 6)

In [80]:
from io import BytesIO
from PIL import Image
import imghdr

# Shared S3 client used by download_image() and upload_df_to_s3().
s3 = boto3.client('s3', 'us-west-2')
# Size passed to Image.resize() in transform_image().
TARGET_SIZE = (300, 300)
# NOTE(review): not referenced in the cells visible here.
TARGET_COLORS = (255,255,255)

def download_image(row):
    """Download one object from ``BUCKET_NAME`` into memory.

    Args:
        row: mapping with an ``'s3key'`` entry naming the object to fetch
            (a row of ``images_df`` when used with ``DataFrame.apply``).

    Returns:
        io.BytesIO: the object's bytes, rewound to position 0 so the caller
        can ``read()`` it immediately.
    """
    buffer = BytesIO()
    s3.download_fileobj(Bucket=BUCKET_NAME, Key=row['s3key'], Fileobj=buffer)
    # Writing the download leaves the cursor after the last byte; rewind so a
    # plain read() on the returned buffer does not come back empty.
    buffer.seek(0)
    return buffer

def get_image_format(dat):
    """Identify the image type of an in-memory buffer from its content.

    Args:
        dat: buffer exposing ``getvalue()`` (e.g. ``io.BytesIO``).

    Returns:
        The type name reported by ``imghdr.what`` (such as ``'jpeg'`` or
        ``'png'``), or ``None`` when the bytes are not recognised.
    """
    payload = dat.getvalue()
    return imghdr.what(None, h=payload)
        
def transform_image(row):
    """Re-encode one downloaded image as a ``TARGET_SIZE`` RGB JPEG.

    Args:
        row: mapping with a ``'rawdata'`` entry holding the image bytes in an
            ``io.BytesIO`` (as produced by ``download_image``).

    Returns:
        An ``io.BytesIO`` positioned at 0 that contains the JPEG-encoded
        result, or the string ``"Invalid"`` when ``get_image_format`` does
        not recognise the bytes as an image.
    """
    img_format = get_image_format(row['rawdata'])
    if img_format is None:
        return "Invalid"
    img_dat = Image.open(row['rawdata'])
    # Normalise on pixel mode rather than on container format: a JPEG can be
    # grayscale or CMYK as well, and every output should share one mode.
    if img_dat.mode != 'RGB':
        img_dat = img_dat.convert('RGB')
    resized = img_dat.resize(TARGET_SIZE)
    out_bytes = BytesIO()
    resized.save(out_bytes, format='jpeg')
    out_bytes.seek(0)  # rewind so the buffer can be read or uploaded directly
    return out_bytes


In [27]:
# Fetch every image into memory: one download_image() call per row.
images_df['rawdata'] = images_df.apply(download_image, axis=1) # this will take a while

In [87]:
# Resize and re-encode each downloaded image; rows that transform_image()
# cannot identify as an image hold the string "Invalid" instead of a buffer.
images_df['resized'] = images_df.apply(transform_image, axis=1)

In [82]:
# NOTE(review): only the last expression of a cell is displayed, so this first
# line produces no visible output.
images_df.shape
# Count the rows whose transform returned the "Invalid" sentinel.
images_df[images_df['resized'] == "Invalid"].shape # no invalid images

(0, 6)

In [51]:
import numpy as np

In [88]:
# Shuffle once with a fixed seed so that train/validate/test membership is the
# same on every run, then cut at the 70% and 90% marks (a 70/20/10 split).
# Positional slicing keeps each part a DataFrame without routing the frame
# through np.split.
SPLIT_SEED = 42
shuffled_df = images_df.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(.7 * len(shuffled_df))
validate_end = int(.9 * len(shuffled_df))
train = shuffled_df.iloc[:train_end]
validate = shuffled_df.iloc[train_end:validate_end]
test = shuffled_df.iloc[validate_end:]

In [89]:
# Sizes of the three parts, shown in the order validate, test, train.
(validate.shape, test.shape, train.shape)

((520, 6), (260, 6), (1817, 6))

In [92]:
def upload_df_to_s3(df, split="", tracker=None):
    """Upload the resized images in ``df`` to ``BUCKET_NAME``.

    Each row is stored under ``processed/<category>/<split>/img<N>.jpg``,
    where ``N`` is a per-category running number taken from ``tracker``.

    Args:
        df: DataFrame with a ``'category'`` column and a ``'resized'`` column
            holding the file-like JPEG data to upload.
        split: name of the data split (e.g. ``"train"``), used as one segment
            of the object key.
        tracker: dict mapping category -> last number used; it is updated in
            place. Defaults to the module-level ``category_tracker`` so that
            numbering continues across successive calls.
    """
    if tracker is None:
        tracker = category_tracker
    print('Uploading to bucket:{0}'.format(BUCKET_NAME))
    for index, row in df.iterrows():
        payload = row['resized']
        if isinstance(payload, str):
            # transform_image() returns the string "Invalid" for bytes it
            # cannot identify; there is nothing to upload for such a row.
            print('Skipping row {0}: no resized image'.format(index))
            continue
        category = row['category']
        # .get() lets a category that was not pre-seeded start at 1 instead of
        # raising KeyError.
        tracker[category] = tracker.get(category, 0) + 1
        s3key = 'processed/{0}/{1}/img{2}.jpg'.format(category,
                                                      split,
                                                      tracker[category])
        print('Uploading file:{0}'.format(s3key))
        # Rewind first: a buffer that was already consumed (e.g. when this
        # cell is re-run) would otherwise be uploaded from its end.
        payload.seek(0)
        s3.upload_fileobj(Bucket=BUCKET_NAME, Fileobj=payload, Key=s3key, Callback=print)

In [93]:
# Start every category's running image number at zero, then upload the
# training split.
category_tracker = {category: 0 for category in categories}
upload_df_to_s3(train, "train")

Uploading to bucket:nanonets-platform-task
Uploading file:processed/dandelion/train/img1.jpg
22790
Uploading file:processed/roses/train/img1.jpg
22316
Uploading file:processed/dandelion/train/img2.jpg
16454
Uploading file:processed/roses/train/img2.jpg
13574
Uploading file:processed/dandelion/train/img3.jpg
10552
Uploading file:processed/dandelion/train/img4.jpg
12050
Uploading file:processed/dandelion/train/img5.jpg
12594
Uploading file:processed/sunflowers/train/img1.jpg
25863
Uploading file:processed/roses/train/img3.jpg
17270
Uploading file:processed/dandelion/train/img6.jpg
15092
Uploading file:processed/dandelion/train/img7.jpg
27793
Uploading file:processed/dandelion/train/img8.jpg
17113
Uploading file:processed/dandelion/train/img9.jpg
17734
Uploading file:processed/daisy/train/img1.jpg
8197
Uploading file:processed/dandelion/train/img10.jpg
17412
Uploading file:processed/dandelion/train/img11.jpg
12099
Uploading file:processed/roses/train/img4.jpg
26155
Uploading file:processe

In [94]:
# Upload the test split; image numbers continue from the shared
# category_tracker rather than restarting at 1.
upload_df_to_s3(test, "test")

Uploading to bucket:nanonets-platform-task
Uploading file:processed/dandelion/test/img627.jpg
17473
Uploading file:processed/sunflowers/test/img293.jpg
28448
Uploading file:processed/daisy/test/img439.jpg
17000
Uploading file:processed/daisy/test/img440.jpg
19325
Uploading file:processed/roses/test/img462.jpg
10862
Uploading file:processed/sunflowers/test/img294.jpg
36659
Uploading file:processed/sunflowers/test/img295.jpg
9072
Uploading file:processed/roses/test/img463.jpg
15799
Uploading file:processed/dandelion/test/img628.jpg
19035
Uploading file:processed/dandelion/test/img629.jpg
20038
Uploading file:processed/sunflowers/test/img296.jpg
40453
Uploading file:processed/roses/test/img464.jpg
11422
Uploading file:processed/roses/test/img465.jpg
14726
Uploading file:processed/roses/test/img466.jpg
12012
Uploading file:processed/dandelion/test/img630.jpg
15852
Uploading file:processed/dandelion/test/img631.jpg
14229
Uploading file:processed/daisy/test/img441.jpg
24493
Uploading file:pr

In [95]:
# Upload the validation split; image numbers continue from the shared
# category_tracker rather than restarting at 1.
upload_df_to_s3(validate, "validate")

Uploading to bucket:nanonets-platform-task
Uploading file:processed/dandelion/validate/img725.jpg
17840
Uploading file:processed/daisy/validate/img500.jpg
16168
Uploading file:processed/roses/validate/img519.jpg
17403
Uploading file:processed/roses/validate/img520.jpg
12147
Uploading file:processed/sunflowers/validate/img337.jpg
20354
Uploading file:processed/roses/validate/img521.jpg
10938
Uploading file:processed/daisy/validate/img501.jpg
18120
Uploading file:processed/dandelion/validate/img726.jpg
14135
Uploading file:processed/dandelion/validate/img727.jpg
18113
Uploading file:processed/dandelion/validate/img728.jpg
21389
Uploading file:processed/sunflowers/validate/img338.jpg
22971
Uploading file:processed/daisy/validate/img502.jpg
26114
Uploading file:processed/daisy/validate/img503.jpg
16239
Uploading file:processed/dandelion/validate/img729.jpg
14605
Uploading file:processed/sunflowers/validate/img339.jpg
28768
Uploading file:processed/roses/validate/img522.jpg
16592
Uploading 