In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import boto3
import json

# Dataset : downloading - preprocessing - uploading

First, download the [dataset](http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip), extract it, and save the CSV file in the `data` folder under the name 'bankadditionalfull.csv'.

In [None]:
# Load the semicolon-separated CSV; index_col=0 turns the file's first column into the index.
# NOTE(review): if that first column is a real feature rather than a row id, it will be
# missing from raw_data.columns (and from anything derived from the columns) — confirm.
raw_data = pd.read_csv('data/bankadditionalfull.csv', sep=';', index_col=0)
# Preview the first two rows.
raw_data.head(2)

In [None]:
# Finds categorical data from the dataframe
# Needed for creating the Data Schema, we'll see afterwards

def identify_categorical(dataframe):
    """Split the columns of ``dataframe`` into categorical and numeric names.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    dict
        ``{'CATEGORICAL': [...], 'NUMERIC': [...]}``. Each list holds column
        names in the same order as they appear in ``dataframe``, so the
        result is reproducible from run to run.
    """
    categorical = []
    numerical = []
    # Classify through the public dtype check instead of the private
    # DataFrame._get_numeric_data(), and append in column order rather than
    # going through a set (whose iteration order is not stable for strings).
    for column, dtype in dataframe.dtypes.items():
        if pd.api.types.is_numeric_dtype(dtype):
            numerical.append(column)
        else:
            categorical.append(column)
    return {'CATEGORICAL': categorical, 'NUMERIC': numerical}

# Column names of raw_data grouped under 'CATEGORICAL' / 'NUMERIC'; used later to fill the DataSchema attributes.
features = identify_categorical(raw_data)

Before using any Amazon services, work through the [boto3 quickstart](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html).

The data must be stored in either S3 or Redshift; otherwise you cannot use Amazon ML.

## If data already exists in S3 Bucket

In [None]:
# Ask S3 for the buckets that already exist
s3 = boto3.client('s3')
response = s3.list_buckets()

# Keep only the name of every bucket entry in the response
bucket = [entry['Name'] for entry in response['Buckets']]

# If the data file was already uploaded to an S3 bucket, that bucket's name shows up in this list
print(bucket)

## If data is in your local machine and not yet uploaded

In [None]:
# For users who do not have a S3 Bucket created yet
# Specify any name you like, as long as it is unique among buckets.
bucket_name = 'thinkdifferentnow'

s3 = boto3.client('s3')
s3.create_bucket(Bucket=bucket_name)
# Upload the local CSV into the new bucket under the key 'bankadditionalfull.csv'
s3.upload_file(
    Filename='data/bankadditionalfull.csv',
    Bucket=bucket_name,
    Key='bankadditionalfull.csv',
)

- To know more about using boto3 to access S3 buckets click [here](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-example-creating-buckets.html)

- Since we have saved our dataset in the S3 Bucket, we can now move forward to creating ML model.

- First we need to create a datasource. A datasource holds the metadata that describes our dataset, such as:
    * Where is it stored
    * Info of the data features (aka categorical/numerical/text/binary)

![DataSource](images/createdatasource.png)

In [None]:
# Amazon Machine Learning client; used below to create the datasource, the ML model and the batch prediction.
client = boto3.client('machinelearning')

#### Creating JSON file for DataSchema

In [None]:
# Skeleton of the Amazon ML DataSchema, copied from the boto3 documentation.
# You can copy it as it is.
DataSchema = {
    "version": "1.0",
    "targetFieldName": "y",
    "dataFormat": "CSV",
    "dataFileContainsHeader": 'true',  # Set it to true because the CSV contains the feature names.
}

# Now we fill the "attributes": one {'fieldName', 'fieldType'} entry per column.
# The target column is always declared BINARY, whatever type the dataframe
# inspection gave it: the model created later uses MLModelType='BINARY', and a
# yes/no text target would otherwise be written out as CATEGORICAL and need a
# manual fix before the schema can be used.
attributes = []
for featureType, featureNames in features.items():
    for featureName in featureNames:
        if featureName == DataSchema['targetFieldName']:
            fieldType = 'BINARY'
        else:
            fieldType = featureType
        attributes.append({'fieldName': featureName, 'fieldType': fieldType})

DataSchema['attributes'] = attributes

# Saving DataSchema in a JSON file
with open('data/dataschema.json', 'w') as outfile:
    json.dump(DataSchema, outfile)

In [None]:
# Make sure for Amazon ML you set your region name to 'us-east-1' or 'eu-west-1'
# As AML works only for US East(Virginia) and EU (Ireland) as of now.


# Make sure you wait for 4-5 minutes once you execute this code cell.
# The response is assigned to `_` only so that the cell does not display it.
_ = client.create_data_source_from_s3(
    DataSourceId='ds-sYkrd9KZMme', # Any ID will do; the same ID is passed later as TrainingDataSourceId
    DataSourceName='tryingboto',  # Any name will do
    DataSpec={
        # Format: s3://bucket_name/file_name
        # NOTE(review): this bucket/key differs from the upload cell, which writes
        # 'bankadditionalfull.csv' to the bucket 'thinkdifferentnow' — confirm which object is intended.
        'DataLocationS3': 's3://bankclassification/bankadditionalfull_.csv', # s3://bucket_name/file_name
        # DataSchema is the JSON string form of the DataSchema dictionary that we created before.
        # You can copy-paste it from the dataschema.json file that we created.
        # NOTE(review): this literal declares the target "y" as BINARY — confirm dataschema.json agrees before pasting.
        'DataSchema': '{"version": "1.0", "targetFieldName": "y", "dataFormat": "CSV", "dataFileContainsHeader": "true", "attributes": [{"fieldName": "day_of_week", "fieldType": "CATEGORICAL"}, {"fieldName": "y", "fieldType": "BINARY"}, {"fieldName": "contact", "fieldType": "CATEGORICAL"}, {"fieldName": "education", "fieldType": "CATEGORICAL"}, {"fieldName": "loan", "fieldType": "CATEGORICAL"}, {"fieldName": "poutcome", "fieldType": "CATEGORICAL"}, {"fieldName": "default", "fieldType": "CATEGORICAL"}, {"fieldName": "marital", "fieldType": "CATEGORICAL"}, {"fieldName": "job", "fieldType": "CATEGORICAL"}, {"fieldName": "month", "fieldType": "CATEGORICAL"}, {"fieldName": "housing", "fieldType": "CATEGORICAL"}, {"fieldName": "duration", "fieldType": "NUMERIC"}, {"fieldName": "campaign", "fieldType": "NUMERIC"}, {"fieldName": "pdays", "fieldType": "NUMERIC"}, {"fieldName": "previous", "fieldType": "NUMERIC"}, {"fieldName": "emp.var.rate", "fieldType": "NUMERIC"}, {"fieldName": "cons.price.idx", "fieldType": "NUMERIC"}, {"fieldName": "cons.conf.idx", "fieldType": "NUMERIC"}, {"fieldName": "euribor3m", "fieldType": "NUMERIC"}, {"fieldName": "nr.employed", "fieldType": "NUMERIC"}]}'
    },
    ComputeStatistics=True
)

# Surprisingly, creating the datasource took 16 minutes of compute time :(

Once Data Source is created, you'd get this:

![Data Source](images/dataSource.png)

In [None]:
# Parameters for the model to train on the datasource created above
ml_model_args = dict(
    MLModelId='mlmodelid_',
    MLModelName='marketingbank',
    # Amazon ML has 3 types of model types: BINARY | MULTICLASS | REGRESSION
    MLModelType='BINARY',
    TrainingDataSourceId='ds-sYkrd9KZMme',
)

# The response is stored in `__` only to keep the cell output empty
__ = client.create_ml_model(**ml_model_args)

Once the model is trained, you would get this in your Dashboard

![ML model trained](images/mlmodel.png)

In [None]:
# Batch predictions on the test data:
# the test dataset needs a datasource of its own, created the same way as before,
# and that datasource's id goes into BatchPredictionDataSourceId below.
batch_prediction_args = dict(
    BatchPredictionId='batchpredictionid_',
    BatchPredictionName='predictresults',
    MLModelId='mlmodelid_',
    BatchPredictionDataSourceId='ds-CmsaR7xPeTU',
    # OutputUri specifies in which S3 bucket directory the prediction folder shall be placed.
    OutputUri='s3://bankclassification/',
)

# The response is stored in `___` only to keep the cell output empty
___ = client.create_batch_prediction(**batch_prediction_args)

After executing above code cell, if you go and check your Dashboard you would see something like this:

![Batch Predictions](images/batchPredictions.png)



![Completed](images/batchPredictionsCompleted.png)


You can then find your predictions in a folder named batch-prediction in the given S3 bucket.