In [None]:
# Importing required libraries.
# Data handling
import pandas as pd
import numpy as np
# Plotting
import seaborn as sns  # statistical visualisation / plot styling
import matplotlib.pyplot as plt  # base plotting API
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Apply seaborn's styling to the plots drawn later in this notebook.
sns.set(color_codes=True)

# AWS SDK and SageMaker Python SDK, used below for the session, bucket and S3 upload.
import boto3
import sagemaker

In [None]:
# Update SageMaker SDK if necessary 
if int(sagemaker.__version__.split('.')[0]) != 2:
    !pip install sagemaker==2.24.1
    print("Updating SageMakerVersion. Please restart the kernel")
else:
    print("SageMaker SDK version is good")

You may want to re-use the resources you already created with AWS. Run the cell below to load any previously created variables. You should see a print-out of the existing variables. If you don't see anything printed, then it's probably the first time you are running the notebook!

In [None]:
# Reload every variable previously saved with `%store` into this kernel's namespace.
%store -r
# With no arguments, list the variables currently held in the store
# (nothing is listed on a first run).
%store

In [None]:
boto_session = boto3.Session()
region = boto_session.region_name
print("Region = {}".format(region))

sagemaker_boto_client = boto_session.client('sagemaker')

sagemaker_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client)


default_bucket = sagemaker_session.default_bucket()  # Alterantively you can use our custom bucket here.
prefix = 'sagemaker-tutorial'  # use this prefix to store all files pertaining to this workshop.
data_prefix = prefix + '/data'

%store default_bucket
%store prefix
%store data_prefix

Use the following code snippet to download the dataset into the `../data/` folder.

In [None]:
local_data_dir = '../data'
!mkdir $local_data_dir
!wget -O ../data/default_of_credit_card.xls  https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
%store local_data_dir

In [None]:
# Read the downloaded spreadsheet into a DataFrame.
# header=1 takes the column names from the second row of the sheet.
local_data_path = local_data_dir + '/default_of_credit_card.xls'

df = pd.read_excel(io=local_data_path, header=1)
df.head()

Check for null values in the data. If the result is not 0, we need to think about imputation strategies.


In [None]:
# Count missing cells across the whole frame: per-column counts first, then their sum.
total_missing = df.isna().sum().sum()
print(f'Total number of missing values in the data: {total_missing}')

In [None]:
# Plot the share of customers by gender.
# value_counts() orders the categories by frequency, so on its own the bar order depends
# on the data and the fixed tick labels below can end up on the wrong bars.
# sort_index() pins the bars to code order (1, 2); per the UCI data dictionary
# SEX is coded 1 = male, 2 = female, which matches the label order.
df['SEX'].value_counts(normalize=True).sort_index().plot.bar()
plt.xticks([0,1], ['Male', 'Female'])

As seen in the chart, there is an imbalance in the gender ratio

In [None]:
# Plot the share of clients by target class.
# sort_index() fixes the bar order to the class codes (0, 1) instead of relying on
# value_counts()' frequency ordering, so the tick labels stay correct even if the
# class balance changes. Per the UCI data dictionary: 0 = no default, 1 = default.
df['default payment next month'].value_counts(normalize=True).sort_index().plot.bar()
plt.xticks([0,1], ['Not Default', 'Default'])

The majority of clients did not default on their payment.

In [None]:
# Histogram of the AGE column, split into 30 bins, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(df['AGE'], bins=30)
ax.set_xlabel('Clients Age Distribution')

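The raw dataset is uploaded to S3 in the "Upload data to S3 for Data Wrangler" section below; its location is stored in `s3_raw_data` and is used with Data Wrangler in the next notebook.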

# Data Wrangler


Amazon SageMaker Data Wrangler reduces the time it takes to aggregate and prepare data for machine learning (ML). With SageMaker Data Wrangler, you can simplify the process of data preparation and feature engineering, and complete each step of the data preparation workflow, including data selection, cleansing, exploration, and visualization from a single visual interface.

 
<span style="color:red">**TODO:  ADD DATAWRANGLER SCREENSHOTS + Bias**</span>

 
<span style="color:red">**TODO:  IAM Policies required for the demo including [feature store] and lake formation access to feature store (https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-adding-policies.html)**</span>


### Upload data to S3 for Data Wrangler

In [None]:
local_raw_path = f'{local_data_dir}/dataset.csv'
df.to_csv(local_raw_path, index=False)

response = sagemaker_session.upload_data(local_raw_path,
                                         bucket=default_bucket, 
                                         key_prefix=data_prefix)
print(response)

s3_raw_data = response

%store s3_raw_data
%store local_raw_path