In [1]:
import sagemaker
import boto3
import pandas as pd
from sagemaker import get_execution_role

In [2]:
# Look up the AWS region configured for the default boto3 session and echo it.
boto_session = boto3.Session()
region = boto_session.region_name
print('region -> {}'.format(region))

region -> eu-west-1


In [3]:
# S3 bucket and key prefix that the later upload cells write under
bucket = 'snowflake-getting-started'
prefix = 'bank-marketing'

# SageMaker session object (constructed with default arguments)
session = sagemaker.Session()

# execution role as reported by get_execution_role()
role = get_execution_role()

In [4]:
#download the dataset - this could be coming from snowflake as well
!wget -N https://datahub.io/machine-learning/bank-marketing/r/bank-marketing.csv
local_data_path = 'bank-marketing.csv'

--2020-07-30 19:11:55--  https://datahub.io/machine-learning/bank-marketing/r/bank-marketing.csv
Resolving datahub.io (datahub.io)... 172.67.157.38, 104.18.49.253, 104.18.48.253, ...
Connecting to datahub.io (datahub.io)|172.67.157.38|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://pkgstore.datahub.io/machine-learning/bank-marketing/bank-marketing_csv/data/000466faca88d1fc9961b0fe85ff995f/bank-marketing_csv.csv [following]
--2020-07-30 19:11:56--  https://pkgstore.datahub.io/machine-learning/bank-marketing/bank-marketing_csv/data/000466faca88d1fc9961b0fe85ff995f/bank-marketing_csv.csv
Resolving pkgstore.datahub.io (pkgstore.datahub.io)... 104.18.48.253, 104.18.49.253, 172.67.157.38, ...
Connecting to pkgstore.datahub.io (pkgstore.datahub.io)|104.18.48.253|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘bank-marketing.csv’ not modified on server. Omitting download.



In [5]:
# Seed shared by the sampling step so the split is repeatable.
RANDOM_STATE = 99

# Read the downloaded CSV into a DataFrame.
data = pd.read_csv('bank-marketing.csv')

# SageMaker XGBoost CSV conventions
# (https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html):
#   - training: the target variable is expected in the first column, with no header record
#   - inference: the CSV input is expected without the label column

# Move the target column 'Class' to the front; the other columns keep their order.
cols = data.columns.tolist()
cols.remove('Class')  # raises ValueError if the column is missing
cols = ['Class'] + cols
data = data.loc[:, cols]

# Convert the labels in 'Class' from 1/2 to 0/1.
# Both masks are taken before any assignment, so a freshly written 0 or 1 is never re-matched.
was_one = data['Class'] == 1
was_two = data['Class'] == 2
data.loc[was_one, 'Class'] = 0
data.loc[was_two, 'Class'] = 1

data.head()

Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
0,0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,0,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,0,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,0,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,0,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [6]:
# Randomly draw 80% of the rows (seeded) as the training split.
train_data = data.sample(frac=0.8, random_state=RANDOM_STATE)

# Every row whose index label was not drawn above forms the test split.
test_data = data.drop(index=train_data.index)

# Note: the 'Class' label column is kept in test_data at this point.
test_data.head()

Unnamed: 0,Class,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16
13,0,58,technician,married,unknown,no,71,yes,no,unknown,5,may,71,1,-1,0,unknown
14,0,57,services,married,secondary,no,162,yes,no,unknown,5,may,174,1,-1,0,unknown
18,0,60,retired,married,primary,no,60,yes,no,unknown,5,may,219,1,-1,0,unknown
33,0,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown
36,0,25,blue-collar,married,secondary,no,-7,yes,no,unknown,5,may,365,1,-1,0,unknown


In [7]:
# Write the training split to a local CSV without the index or a header row.
train_csv_path = 'train_data.csv'
train_data.to_csv(train_csv_path, header=False, index=False)

# Push the file to S3 under <prefix>/train and report the resulting location.
train_key_prefix = prefix + '/train'
train_data_s3_path = session.upload_data(
    path=train_csv_path,
    bucket=bucket,
    key_prefix=train_key_prefix,
)
print('train data uploaded to {}'.format(train_data_s3_path))

train data uploaded to s3://snowflake-getting-started/bank-marketing/train/train_data.csv


In [8]:
# Write the test split to a local CSV without the index or a header row.
test_data.to_csv('test_data.csv', index=False, header=False)

# Upload the test CSV to S3 under <prefix>/test.
test_data_s3_path = session.upload_data(path='test_data.csv',
                                        bucket=bucket,
                                        key_prefix=prefix+'/test')

# Report the S3 location; the message names the test split, which is what was uploaded.
print('test data uploaded to {}'.format(test_data_s3_path))

train data uploaded to s3://snowflake-getting-started/bank-marketing/test/test_data.csv
