# Importing Important Libraries

### Steps to be followed

1) Importing necessary libraries 
2) Creating S3 Bucket
3) Mapping Train and Test data in S3
4) Mapping the path of model in S3


In [10]:
import sagemaker #we are using SageMaker because in this project we are going to use the built-in algorithms which are present in AWS SageMaker (e.g. XGBoost)
import boto3 #whenever working with SageMaker you have to import boto3; with boto3 we can also read an S3 bucket from a local Python environment if it is public
from sagemaker.amazon.amazon_estimator import get_image_uri #get_image_uri looks up the container image that packages the whole built-in XGBoost algorithm
from sagemaker.session import s3_input, Session #s3_input and Session are the classes we are importing to configure input data channels from Amazon S3 (Simple Storage Service) for training a machine learning model on SageMaker and to interact with various SageMaker resources, such as creating training jobs, deploying models, managing endpoints, etc.
import pandas

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


#### Creating an S3 Bucket

In [12]:
# Region that the default boto3 session resolves to for this instance.
my_region = boto3.session.Session().region_name
# Name of the S3 bucket used throughout the notebook.
bucket_name = 'projectbucket189'
print(my_region)

us-east-1


In [7]:
# Create the project's S3 bucket from code, in the region the session resolved to.

# Fall back to us-east-1 when the session has no region configured, which is
# the region this cell previously hard-coded for the client.
bucket_region = my_region or 'us-east-1'

# One region-aware S3 client. The earlier `boto3.resource("s3")` assignment was
# dead code (immediately overwritten), and the client region is now derived
# from the session so that it always agrees with the branch below.
s3 = boto3.client('s3', region_name=bucket_region)
try:
    if bucket_region == 'us-east-1':
        # us-east-1 is the default location: the request carries no
        # location constraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Any other region has to be named explicitly. Before this fix nothing
        # was created for these regions, yet the success message was printed.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': bucket_region},
        )
    # Only reached when a create_bucket call above did not raise.
    print('S3 Bucket created successfully')
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print('s3 error: ', e)
        

S3 Bucket created successfully


In [14]:
# Build the S3 location where the trained model will be saved.

# Key prefix for this project: XGBoost used as a SageMaker built-in algorithm.
prefix = 'xgboost-as-a-built-in-algo'
# The two {} placeholders are filled in order: bucket name first, then prefix.
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
# Display the resulting output path.
print(output_path)

s3://projectbucket189/xgboost-as-a-built-in-algo/output


### Downloading the dataset and storing it in our S3 bucket

In [1]:
import pandas as pd
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so `urllib.request.urlretrieve` below only worked when
# some other library happened to have imported it already.
import urllib.request

# Step 1: download the dataset next to the notebook as bank_clean.csv.
try:
    urllib.request.urlretrieve(
        "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv",
        "bank_clean.csv",
    )
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    # Report the failure and carry on; the read below may still find a
    # previously downloaded copy.
    print('Data load error: ' ,e)

# Step 2: load the CSV into `model_data`, using its first column as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv',index_col= 0)
    print('Success : Data loaded into dataframe. ')
except Exception as e:
    # NOTE: when this fails `model_data` is left undefined, so later cells
    # that use it will raise NameError.
    print('Data load error: ', e)
    

Success: downloaded bank_clean.csv.
Success : Data loaded into dataframe. 


### Dataset details: our dataset is a cleaned dataset that states whether the customer buys the product or not (it is already one-hot encoded)

### Train & test Split

In [4]:
import numpy as np
# The split is not the usual x_train / y_train / x_test / y_test: the whole
# frame is divided row-wise into train_data and test_data, so each part still
# holds both the dependent and the independent variables.

# Shuffle every row reproducibly (fixed random_state), then cut at 70%.
# Positional slicing replaces `np.split(DataFrame, ...)`, which goes through
# the deprecated `DataFrame.swapaxes`; the resulting rows are the same.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_index = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:split_index]
test_data = shuffled_data.iloc[split_index:]

# Sanity check: the two parts together cover every row.
assert len(train_data) + len(test_data) == len(model_data)

# Shapes of the train and test data.
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


#### Remember: whenever we are dealing with SageMaker's built-in algorithms, the dependent (target) feature should be the first column of the CSV, and the file is written without a header row

### Creating & Saving separate Train and Test data into the S3 Bucket

In [16]:
# Training Data
import os  # no longer needed by this cell; retained in case other cells rely on `os`

# Of the two one-hot target columns (y_no / y_yes) keep y_yes, place it first,
# and append every remaining feature column.
train_ordered = pd.concat(
    [train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)],
    axis=1,
)
# Write without index or header row, so the file holds data values only.
train_ordered.to_csv('train.csv', index=False, header=False)

# S3 object keys always use '/' as the separator, so build the key with plain
# string formatting. `os.path.join` would insert a backslash on Windows and
# the object would no longer sit under the `train` channel path used below.
train_key = '{}/train/train.csv'.format(prefix)
# Upload train.csv to the bucket under that key.
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training reads its data from S3, so describe the channel with
# sagemaker.TrainingInput: it points at the train folder (built the same way
# as the output path) and declares the content type as csv, because the
# uploaded file is a CSV.
s3_input_train = sagemaker.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket_name, prefix),
    content_type='csv',
)

In [17]:
# Test Data
# Same steps as for the training data: y_yes first, remaining features after
# it, written without index or header row.
test_ordered = pd.concat(
    [test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], axis=1)],
    axis=1,
)
test_ordered.to_csv('test.csv', index=False, header=False)

# S3 object keys always use '/' as the separator, so build the key with plain
# string formatting. `os.path.join` would insert a backslash on Windows and
# the object would no longer sit under the `test` channel path used below.
test_key = '{}/test/test.csv'.format(prefix)
# Upload test.csv to the bucket under that key.
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')

# Channel definition pointing at the test folder; content type is csv.
s3_input_test = sagemaker.TrainingInput(
    s3_data='s3://{}/{}/test'.format(bucket_name, prefix),
    content_type='csv',
)