In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import boto3
import botocore
import json

# Dataset : downloading - preprocessing - uploading

First, download the [dataset](http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip), unzip it, and save the full CSV file in the `data` folder under the name `bankadditionalfull.csv`.

In [None]:
# Load the semicolon-separated dataset, using the file's first column as the row index.
# NOTE(review): if the CSV has no dedicated index column, index_col=0 turns the first
# feature into the index, so it is missing from raw_data.columns — TODO confirm.
raw_data = pd.read_csv(
    'data/bankadditionalfull.csv',
    sep=';',
    index_col=0,
)
raw_data.head(2)

In [None]:
def identify_categorical(dataframe):
    """Split the columns of a dataframe into numeric and categorical names.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be classified. The index is not inspected.

    Returns
    -------
    dict
        ``{'CATEGORICAL': [...], 'NUMERIC': [...]}``. Both lists keep the
        column order of ``dataframe``, so the result is the same on every run.
    """
    # Public replacement for the private DataFrame._get_numeric_data();
    # boolean columns are counted as numeric as well.
    numerical = list(dataframe.select_dtypes(include=['number', 'bool']).columns)
    numeric_lookup = set(numerical)
    # Walk the columns in order instead of taking a set difference, whose
    # iteration order for string names is not stable between interpreter runs.
    categorical = [column for column in dataframe.columns if column not in numeric_lookup]
    return {'CATEGORICAL': categorical, 'NUMERIC': numerical}

# Dictionary with the 'CATEGORICAL' and 'NUMERIC' column names of the raw dataset
features = identify_categorical(raw_data)

In [None]:
# Do not execute this code block! It is shown only for illustration:
# Amazon ML also accepts object-typed columns as long as the data types are specified correctly.

# Replace every categorical column with its integer label encoding (mutates raw_data in place).
for column in features['CATEGORICAL']:
    raw_data[column] = LabelEncoder().fit_transform(raw_data[column])

# Write the encoded frame to disk without the index column.
# This file needs to be saved in an S3 bucket to use Amazon ML.
raw_data.to_csv('data/bankadditionalfull_.csv', index=False)

raw_data.head(2)

Before using any Amazon services, complete the [boto3 quickstart](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html).

The data must be stored in either Amazon S3 or Amazon Redshift; otherwise, you cannot use it with Amazon ML.

## If data already exists in S3 Bucket

In [None]:
# List the buckets of this account and download the dataset from the first one.
s3 = boto3.client('s3')
response = s3.list_buckets()

# response is a dictionary, which gives
# metadata, bucket name & owner Id
bucket = [buckets['Name'] for buckets in response['Buckets']]
if not bucket:
    # Fail with an actionable message instead of an IndexError on bucket[0]
    raise RuntimeError("No S3 bucket found for this account; create one and upload the dataset first.")
BUCKET_NAME = bucket[0]  # NOTE(review): assumes the dataset lives in the first listed bucket — confirm
KEY = 'bankadditionalfull_.csv' # File name (object key) in the bucket, not the Owner ID.

s3 = boto3.resource('s3')

try:
    # Reuse KEY so the object name is defined in exactly one place
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'data/' + KEY)
except botocore.exceptions.ClientError as e:
    # A 404 means the object is missing from the bucket; anything else is unexpected
    if e.response['Error']['Code'] == "404":
        print("The file does not exist.")
    else:
        raise

## If data is on your local machine

In [None]:
# For users who do not have an S3 bucket yet: create one and upload the local dataset.
# Replace '<your_bucket_name>' with your own, globally unique bucket name.
filename = 'bankadditionalfull_.csv'  # object key the file gets inside the bucket

s3 = boto3.client('s3')
s3.create_bucket(Bucket='<your_bucket_name>')
# upload_file(local_path, bucket, key): the local copy is stored in the data folder
s3.upload_file('data/' + filename, '<your_bucket_name>', filename)

- To learn more about using boto3 to access S3 buckets, see [this guide](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-example-creating-buckets.html).

- Since we have saved our dataset in the S3 bucket, we can now move on to creating the ML model.

- First, we need to create a datasource. A datasource holds the information about our dataset, such as:
    * Where the data is stored
    * The type of each feature (categorical, numerical, text, or binary)

![DataSource](images/createdatasource.png)

In [None]:
# Amazon ML client pinned to us-east-1. Passing region_name explicitly keeps the
# signing region in line with the endpoint, regardless of the default AWS region
# configured on this machine (or when no default region is configured at all).
client = boto3.client(
    'machinelearning',
    region_name='us-east-1',
    endpoint_url='https://machinelearning.us-east-1.amazonaws.com/',
)

#### Creating JSON file for DataSchema

In [None]:
# The schema layout follows the DataSchema example in the boto3 documentation;
# you can copy it as it is.
DataSchema = {
    "version": "1.0",
    "targetFieldName": "y",
    "dataFormat": "CSV",
    # Real boolean (serialized as JSON true, as in the documented example):
    # the CSV's first row contains the feature names.
    "dataFileContainsHeader": True,
}

# Fill "attributes" with one {fieldName, fieldType} entry per column listed in features
attributes = [
    {'fieldName': featureName, 'fieldType': featureType}
    for featureType, featureNames in features.items()
    for featureName in featureNames
]
DataSchema['attributes'] = attributes

# Saving DataSchema in a JSON file
with open('data/dataschema.json', 'w') as outfile:
    json.dump(DataSchema, outfile)

In [None]:
# Make sure the Amazon ML client uses region 'us-east-1' or 'eu-west-1':
# at the time of writing, AML works only in US East (Virginia) and EU (Ireland).

response = client.create_data_source_from_s3(
    DataSourceId='string',  # user-supplied identifier; replace the placeholder with a unique id
    DataSourceName='Example Predictions',  # Any name will do
    DataSpec={
        'DataLocationS3': 's3://bankclassification/bankadditionalfull_.csv', # s3://bucket_name/file_name
        # An S3 URI for the schema goes under 'DataSchemaLocationS3';
        # 'DataSchema' expects the schema's JSON text itself, not a location.
        # The locally written dataschema.json has to be uploaded to this location first.
        'DataSchemaLocationS3': 's3://bankclassification/dataschema.json'
    },
    ComputeStatistics=True
)