# Libraries and Installations

In [1]:
!pip install sagemaker-experiments
!pip install s3fs
!pip install matplotlib
!pip install seaborn
!pip install shap
!pip install smdebug


Collecting s3fs
  Downloading s3fs-2021.4.0-py3-none-any.whl (23 kB)
Collecting fsspec==2021.04.0
  Downloading fsspec-2021.4.0-py3-none-any.whl (108 kB)
Collecting aiobotocore>=1.0.1
  Downloading aiobotocore-1.3.0.tar.gz (48 kB)
Collecting botocore<1.20.50,>=1.20.49

ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

boto3 1.17.57 requires botocore<1.21.0,>=1.20.57, but you'll have botocore 1.20.49 which is incompatible.



  Downloading botocore-1.20.49-py2.py3-none-any.whl (7.4 MB)
Collecting aiohttp>=3.3.1
  Downloading aiohttp-3.7.4.post0-cp38-cp38-win_amd64.whl (635 kB)
Collecting aioitertools>=0.5.1
  Downloading aioitertools-0.7.1-py3-none-any.whl (20 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.6.3-cp38-cp38-win_amd64.whl (125 kB)
Collecting async-timeout<4.0,>=3.0
  Downloading async_timeout-3.0.1-py3-none-any.whl (8.2 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.1.0-cp38-cp38-win_amd64.whl (48 kB)
Building wheels for collected packages: aiobotocore
  Building wheel for aiobotocore (setup.py): started
  Building wheel for aiobotocore (setup.py): finished with status 'done'
  Created wheel for aiobotocore: filename=aiobotocore-1.3.0-py3-none-any.whl size=45784 sha256=dbb65cfe3ee7cb0799d03ee3fedf8b9b0d082cbf37340987876ffc23b7621ea2
  Stored in directory: c:\users\sudharsan.munusa\appdata\local\pip\cache\wheels\f3\c8\af\bc69105edc81462cf4bca30d72a792a173f80522042defc6a1
Succes

ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

aiobotocore 1.3.0 requires botocore<1.20.50,>=1.20.49, but you'll have botocore 1.20.58 which is incompatible.



Collecting pyinstrument>=3.1.3
  Downloading pyinstrument-3.4.1-py2.py3-none-any.whl (81 kB)
Collecting botocore<1.21.0,>=1.20.57
  Downloading botocore-1.20.58-py2.py3-none-any.whl (7.4 MB)
Collecting pyinstrument-cext>=0.2.2
  Downloading pyinstrument_cext-0.2.4-cp38-cp38-win_amd64.whl (8.5 kB)
Installing collected packages: pyinstrument-cext, pyinstrument, smdebug, botocore
  Attempting uninstall: botocore
    Found existing installation: botocore 1.20.49
    Uninstalling botocore-1.20.49:
      Successfully uninstalled botocore-1.20.49
Successfully installed botocore-1.20.58 pyinstrument-3.4.1 pyinstrument-cext-0.2.4 smdebug-1.0.8


In [None]:
from io import StringIO
import numpy as np
import os
import pandas as pd
import boto3
import time
import s3fs
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import re
import shap
from scipy import stats
import copy

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.analytics import ExperimentAnalytics

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig,CollectionConfig
from sagemaker.estimator import Estimator
from sagemaker.session import s3_input
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import Session

from smdebug.trials  import create_trial

# Configs

In [None]:
# Timestamp that makes the job name (and everything derived from it) unique per run.
now = datetime.now()
current_time = now.strftime("%Y-%m-%d--%H-%M-%S")
print("current_time:", current_time)

# SageMaker session and the execution role handed to the estimator below.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# S3 bucket and the raw (header-less) input datasets.
bucket = "1905-assignment2-sm"
prefix = "Scikit-pre-model-Inference-Pipelines"
train_data = "s3://1905-assignment2-sm/housing/imput-datasets/train_data_without_header.csv"
test_data = "s3://1905-assignment2-sm/housing/imput-datasets/test_data_without_header.csv"

# Framework version and user scripts passed to the SKLearn estimator.
FRAMEWORK_VERSION = "0.23-1"
script_path = "sklearn_pipeline.py"
dependency_path = "dependencies.py"

# Job name shared by the training job and (with suffixes) the transform jobs.
base_job_name = "Builtin-XGB-algo-" + current_time

# Where the batch-transform output is written.
output_data_prefix = "housing/datasets/output/" + base_job_name
data_output_path = "s3://" + bucket + "/" + output_data_prefix

# Location reserved for debugger output of this run.
debug_prefix = "housing/jobs/debug/" + base_job_name
debug_path = "s3://" + bucket + "/" + debug_prefix

experiment_name_prefix = "builtin-xgboost-track13"

In [None]:
# Display the training-data S3 URI as a quick check of the configuration.
train_data

# Batch transform

## Fit the preprocessor on the training data

In [None]:
# scikit-learn estimator that runs `script_path` (with `dependency_path` shipped
# alongside it) as a SageMaker training job to fit the preprocessing pipeline.
sklearn_preprocessor = SKLearn(
    entry_point = script_path,
    role = role,
    framework_version = FRAMEWORK_VERSION,
    train_instance_type = "ml.m5.xlarge",  # "local"
    # Managed spot training. The keyword is plural; the previous singular
    # spelling (`train_use_spot_instance`) is not a recognised argument, so the
    # spot request was never actually applied.
    train_use_spot_instances = True,
    train_max_run = 600,
    # Spot training needs an upper bound on total wait time (run time plus time
    # waiting for capacity); it must be at least `train_max_run`.
    train_max_wait = 1200,
    dependencies = [dependency_path],
    sagemaker_session = sagemaker_session)
# NOTE(review): the `train_*` keyword names follow the style already used in this
# notebook; newer SDK releases spell them `instance_type`, `use_spot_instances`,
# `max_run` and `max_wait` — confirm against the installed sagemaker version.

In [None]:
# Display the execution role obtained from get_execution_role() in the config cell.
# NOTE(review): clear this output before sharing if the role identifier is sensitive.
role

In [None]:
# Launch the training job that fits the preprocessor; the single "train"
# channel points at the raw training CSV.
training_channels = {'train': train_data}
sklearn_preprocessor.fit(inputs=training_channels, job_name=base_job_name)

## Transform the training data

In [None]:
# Build a batch-transform handle from the fitted preprocessor. Results are
# requested as CSV, assembled line by line, and written under `data_output_path`.
transformer = sklearn_preprocessor.transformer(
    instance_type='ml.m5.xlarge',
    instance_count=1,
    accept='text/csv',
    assemble_with='Line',
    output_path=data_output_path,
)

In [None]:
# Run the fitted preprocessor over the training set as a batch-transform job
# and block until it has finished.
train_transform_job_name = base_job_name + '-train'
transformer.transform(
    data=train_data,
    content_type="text/csv",
    job_name=train_transform_job_name,
)

print(f"Waiting for transform job:{transformer.latest_transform_job.job_name}")
transformer.wait()

In [None]:
# Record where the transform job wrote its results.
# NOTE(review): `transformer.output_path` is the output location given when the
# transformer was created (`data_output_path`), not the URI of an individual
# file, and the same transformer is reused for the test set — confirm that
# downstream code expects a prefix here.
preprocessed_train_data = transformer.output_path

In [None]:
# Display the S3 location recorded for the preprocessed training data.
preprocessed_train_data

## Transform the test data

In [None]:
# Run the fitted preprocessor over the test set as a second batch-transform job
# and block until it has finished.
test_transform_job_name = base_job_name + "-test"
transformer.transform(
    data=test_data,
    content_type="text/csv",
    job_name=test_transform_job_name,
)

print(f"Waiting for transform job:{transformer.latest_transform_job.job_name}")
transformer.wait()

In [None]:
# Record where the transform job wrote its results.
# NOTE(review): `transformer.output_path` is the output location given when the
# transformer was created (`data_output_path`), so this is the same value that
# was recorded for the training set rather than a test-specific file URI —
# confirm that downstream code expects a prefix here.
preprocessed_test_data = transformer.output_path

In [None]:
# Display the S3 key prefix under which the transform outputs were written.
output_data_prefix

## Upload processed data to S3

In [None]:
# Download the batch-transform output for the training set from S3 and load it
# into a DataFrame.
client = boto3.client('s3')
obj = client.get_object(Bucket=bucket, Key=f'{output_data_prefix}/train_data_without_header.csv.out')
body = obj['Body']
csv_string = body.read().decode('utf-8')
# header=None: the transform output of the header-less input has no header row.
# With the default (header=0) the first data row becomes the column names and
# is then lost when the frame is written back out with header=False.
processed_train_data = pd.read_csv(StringIO(csv_string), header=None)

In [None]:
# Write the processed training data to a local header-less CSV and upload it to
# S3 under the run's output prefix.
train_file = 'processed_train_data.csv'
processed_train_data.to_csv(train_file, index=False, header=False)
with open(train_file, 'rb') as data:
    # S3 keys always use '/' as the separator; os.path.join would insert a
    # backslash on Windows and produce a different key than intended.
    boto3.Session().resource('s3').Bucket(bucket).upload_fileobj(
        data, f'{output_data_prefix}/processed-train-data.csv')

In [None]:
# Download the batch-transform output for the test set from S3, load it into a
# DataFrame, write it to a local header-less CSV and upload that file back to S3.
obj = client.get_object(Bucket=bucket, Key=f'{output_data_prefix}/test_data_without_header.csv.out')
body = obj['Body']
csv_string = body.read().decode('utf-8')
# header=None: the transform output of the header-less input has no header row.
# With the default (header=0) the first data row becomes the column names and
# is then lost when the frame is written back out with header=False.
processed_test_data = pd.read_csv(StringIO(csv_string), header=None)

test_file = 'processed_test_data.csv'
processed_test_data.to_csv(test_file, index=False, header=False)
with open(test_file, 'rb') as data:
    # S3 keys always use '/' as the separator; os.path.join would insert a
    # backslash on Windows and produce a different key than intended.
    boto3.Session().resource('s3').Bucket(bucket).upload_fileobj(
        data, f'{output_data_prefix}/processed-test-data.csv')