# Introduction to Data Formats and S3

In [1]:
import pandas as pd
import numpy as np

import boto3
import sagemaker.amazon.common as smac

In [2]:
# Fix the global NumPy seed so the randomly generated sample data below is reproducible
np.random.seed(5)

In [2]:
# NOTE: Replace this with the name of your own S3 bucket;
# the upload and download calls later in this notebook use it.
s3_bucket_name = 'bucket-jd-1'

## Sample DataSet
### Three features x1,x2,x3 and a target variable y

In [3]:
n = 10  # number of sample rows

# NOTE: keep the order of these draws unchanged. Each call advances the global
# NumPy random state, so reordering them would change the generated values.
x1 = np.random.random_sample(size=n)               # n floats in [0, 1)
x2 = np.random.randint(low=100, high=200, size=n)  # n integers in [100, 200)
x3 = 10 * np.random.random_sample(size=n)          # n floats in [0, 10)
y = np.random.randint(low=0, high=2, size=n)       # response variable: 0 or 1

In [4]:
y

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1])

In [5]:
# Assemble the three features and the target into a single table;
# the columns appear in the order the keys are listed.
df = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, y=y))

In [6]:
df

Unnamed: 0,x1,x2,x3,y
0,0.378391,115,8.645204,0
1,0.40419,150,5.832953,1
2,0.723977,131,5.789761,1
3,0.596353,189,3.331292,1
4,0.60825,197,7.402876,1
5,0.166842,171,6.859591,0
6,0.130583,110,8.667402,1
7,0.057972,158,4.844501,0
8,0.554657,136,4.248542,1
9,0.935385,192,5.565186,1


In [7]:
# Write the DataFrame as a CSV file to the local disk of the SageMaker notebook instance.
# index=False leaves the row index out of the file.
df.to_csv('demo_file.csv',index=False)

In [8]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3.
# A file's name is referred to as its key (key name) in S3.
# Objects stored in S3 (Standard storage class) are automatically stored redundantly
# across at least 3 Availability Zones in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to S3.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Key (object name) to store the file under in the bucket.

    Returns:
        Whatever ``upload_fileobj`` returns for the upload.
    """
    # Binary mode: the bytes are streamed to S3 unchanged.
    with open(filename, 'rb') as local_file:
        s3 = boto3.Session().resource('s3')
        target_object = s3.Bucket(bucket).Object(key)
        return target_object.upload_fileobj(local_file)

In [9]:
# http://boto3.readthedocs.io/en/latest/guide/s3.html
def download_from_s3(filename, bucket, key):
    """Download an S3 object into a local file.

    Args:
        filename: Path of the local file to write; it is opened in ``'wb'``
            mode, so an existing file at that path is overwritten.
        bucket: Name of the S3 bucket that holds the object.
        key: Key (object name) of the object to download.

    Returns:
        Whatever ``download_fileobj`` returns for the download.
    """
    # The local file is opened before the download starts.
    with open(filename, 'wb') as local_file:
        s3 = boto3.Session().resource('s3')
        source_object = s3.Bucket(bucket).Object(key)
        return source_object.download_fileobj(local_file)

In [10]:
# Upload the local CSV to the bucket under the key 'data_format/demo_file.csv'
write_to_s3('demo_file.csv', s3_bucket_name, 'data_format/demo_file.csv')

In [11]:
# Download the same object back into a differently named local file
download_from_s3('demo_file_from_s3.csv',s3_bucket_name,'data_format/demo_file.csv')

## RecordIO Format

We will use the SageMaker SDK's `write_numpy_to_dense_tensor()` function to create RecordIO files.


Data Types: Int32, Float32, Float64  

Reference:
https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/amazon/common.py

In [16]:
df.head()

Unnamed: 0,x1,x2,x3,y
0,0.378391,115,8.645204,0
1,0.40419,150,5.832953,1
2,0.723977,131,5.789761,1
3,0.596353,189,3.331292,1
4,0.60825,197,7.402876,1


In [17]:
# X must be a NumPy array: select the three feature columns and convert them
X = df[['x1','x2','x3']].to_numpy()

In [18]:
X

array([[3.78390927e-01, 1.15000000e+02, 8.64520393e+00],
       [4.04189633e-01, 1.50000000e+02, 5.83295327e+00],
       [7.23976987e-01, 1.31000000e+02, 5.78976088e+00],
       [5.96353404e-01, 1.89000000e+02, 3.33129154e+00],
       [6.08250343e-01, 1.97000000e+02, 7.40287576e+00],
       [1.66841606e-01, 1.71000000e+02, 6.85959120e+00],
       [1.30582833e-01, 1.10000000e+02, 8.66740173e+00],
       [5.79715060e-02, 1.58000000e+02, 4.84450071e+00],
       [5.54657051e-01, 1.36000000e+02, 4.24854157e+00],
       [9.35385477e-01, 1.92000000e+02, 5.56518611e+00]])

In [19]:
type(X)

numpy.ndarray

In [20]:
# The response/target variable needs to be a vector (1-D array).
# Note: df[['y']] selects a one-column DataFrame, so to_numpy() returns a 2-D array here.
y = df[['y']].to_numpy()

In [21]:
# Right now it is a 2-D array with dimensions 10x1
y.shape

(10, 1)

In [22]:
y

array([[0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1]])

In [23]:
# Flatten the 10x1 array into a one-dimensional array of 10 elements
y = y.ravel()

In [24]:
y

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1])

In [25]:
def write_recordio_file(filename, x, y=None):
    """Write a feature array, and optionally labels, to a RecordIO file.

    Args:
        filename: Path of the local file to create; it is opened in ``'wb'`` mode.
        x: Feature array passed to ``smac.write_numpy_to_dense_tensor``.
        y: Optional labels passed alongside ``x``; defaults to ``None``.
    """
    with open(filename, 'wb') as recordio_file:
        smac.write_numpy_to_dense_tensor(recordio_file, x, y)

In [26]:
def read_recordio_file(filename, recordsToPrint=10):
    """Print the first records stored in a RecordIO file.

    Args:
        filename: Path of the local RecordIO file to read.
        recordsToPrint: Maximum number of records to print (default 10).
    """
    with open(filename, 'rb') as recordio_file:
        records = smac.read_records(recordio_file)
        for index, current in enumerate(records):
            # Stop once the requested number of records has been printed.
            if index >= recordsToPrint:
                break
            print("record: {}".format(index))
            print(current)

In [27]:
# Write the features X and the labels y to a local RecordIO file
write_recordio_file('demo_file.recordio',X,y)

In [28]:
df.head(3)

Unnamed: 0,x1,x2,x3,y
0,0.378391,115,8.645204,0
1,0.40419,150,5.832953,1
2,0.723977,131,5.789761,1


In [29]:
# Print the first 3 records so they can be compared with df.head(3) above
read_recordio_file('demo_file.recordio',3)

record: 0
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.3783909271564435
      values: 115.0
      values: 8.64520392578028
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 0
    }
  }
}

record: 1
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.4041896328543546
      values: 150.0
      values: 5.832953265698598
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}

record: 2
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.723976987153143
      values: 131.0
      values: 5.789760876800564
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}



In [30]:
# Upload the local RecordIO file to the bucket under the key 'data_format/demo_file.recordio'
write_to_s3('demo_file.recordio', s3_bucket_name, 'data_format/demo_file.recordio')

In [31]:
# Download the same object back into a differently named local file
download_from_s3('demo_file_from_s3.recordio',s3_bucket_name,'data_format/demo_file.recordio')