# Libraries and Installations

In [1]:
!pip install sagemaker-experiments
!pip install s3fs
!pip install matplotlib
!pip install seaborn
!pip install shap
!pip install smdebug


Collecting sagemaker-experiments
  Using cached sagemaker_experiments-0.1.30-py3-none-any.whl (42 kB)
Installing collected packages: sagemaker-experiments
Successfully installed sagemaker-experiments-0.1.30
Collecting shap
  Downloading shap-0.39.0.tar.gz (356 kB)
[K     |████████████████████████████████| 356 kB 14.4 MB/s eta 0:00:01
Collecting tqdm>4.25.0
  Downloading tqdm-4.60.0-py2.py3-none-any.whl (75 kB)
[K     |████████████████████████████████| 75 kB 8.1 MB/s  eta 0:00:01
[?25hCollecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Building wheels for collected packages: shap
  Building wheel for shap (setup.py) ... [?25ldone
[?25h  Created wheel for shap: filename=shap-0.39.0-cp36-cp36m-linux_x86_64.whl size=419939 sha256=24e478d4895b5312cfa8b105e723ff0c887064d6041697003b0acd3745cd25e0
  Stored in directory: /home/ec2-user/.cache/pip/wheels/6f/08/25/2992725334291786ea084e06cac493d93049b80e3470318a1b
Successfully built shap
Installing collected packages

In [2]:
from io import StringIO
import numpy as np
import os
import pandas as pd
import boto3
import time
import s3fs
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
import re
import shap
from scipy import stats
import copy




In [3]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.analytics import ExperimentAnalytics

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig,CollectionConfig
from sagemaker.estimator import Estimator
from sagemaker.session import s3_input
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import Session

from smdebug.trials  import create_trial













[2021-04-29 07:05:13.594 ip-172-16-36-224:5457 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None


# Configs

In [4]:
# Run-scoped timestamp; it is embedded in the job names and S3 prefixes built below.
now = datetime.now()
current_time = now.strftime("%Y-%m-%d--%H-%M-%S")
print(f"current_time: {current_time}")

# SageMaker session and the IAM role handed to the jobs created in this notebook.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# S3 bucket / prefix and the raw (header-less) input datasets.
bucket = "1905-assignment2-sm"
prefix = "Scikit-pre-model-Inference-Pipelines"
train_data = "s3://1905-assignment2-sm/housing/imput-datasets/train_data_without_header.csv"
test_data = "s3://1905-assignment2-sm/housing/imput-datasets/test_data_without_header.csv"

# scikit-learn container version plus the entry-point script and its dependency module.
FRAMEWORK_VERSION = "0.23-1"
script_path = "sklearn_pipeline.py"
dependency_path = "dependencies.py"

# Every job of this run shares this timestamped base name.
base_job_name = "Builtin-XGB-algo-" + current_time

# Where batch-transform output is written.
output_data_prefix = "housing/datasets/output/" + base_job_name
data_output_path = "s3://" + bucket + "/" + output_data_prefix

# Where Debugger tensors are written.
debug_prefix = "housing/jobs/debug/" + base_job_name
debug_path = "s3://" + bucket + "/" + debug_prefix

experiment_name_prefix = "builtin-xgboost-track13"

current_time: 2021-04-29--07-05-13


In [5]:
# Display the S3 URI of the raw training CSV.
train_data

's3://1905-assignment2-sm/housing/imput-datasets/train_data_without_header.csv'

In [6]:
# Display the execution role ARN.
# NOTE(review): the rendered output exposes the AWS account id; clear it before sharing.
role

'arn:aws:iam::752400441523:role/Sagemaker_Access'

# Batch transform

## Fit the train data

In [7]:
# Estimator that runs `script_path` in the scikit-learn container to fit the preprocessor.
#
# Uses the SageMaker SDK v2 argument names (instance_type / use_spot_instances /
# max_run / max_wait). The previous `train_use_spot_instance` keyword was misspelled
# (missing the trailing "s"), so managed spot training was never actually requested.
sklearn_preprocessor = SKLearn(
    entry_point=script_path,
    role=role,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.m5.xlarge",  # use "local" to debug on the notebook instance
    use_spot_instances=True,
    max_run=600,    # seconds the training itself may run
    max_wait=1200,  # seconds incl. waiting for spot capacity; must be >= max_run
    dependencies=[dependency_path],
    sagemaker_session=sagemaker_session,
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.


In [8]:
# Display the execution role ARN again.
# NOTE(review): duplicate of an earlier cell; the output exposes the AWS account id.
role

'arn:aws:iam::752400441523:role/Sagemaker_Access'

In [9]:
# Launch the preprocessing training job on the raw training CSV ("train" channel).
sklearn_preprocessor.fit(inputs={"train": train_data}, job_name=base_job_name)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: Builtin-XGB-algo-2021-04-29--07-05-13


2021-04-29 07:05:14 Starting - Starting the training job...
2021-04-29 07:05:25 Starting - Launching requested ML instancesProfilerReport-1619679914: InProgress
.........
2021-04-29 07:07:11 Starting - Preparing the instances for training......
2021-04-29 07:08:11 Downloading - Downloading input data...
2021-04-29 07:08:41 Training - Training image download completed. Training in progress.
2021-04-29 07:08:41 Uploading - Uploading generated training model[34m2021-04-29 07:08:36,932 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-04-29 07:08:36,935 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 07:08:36,943 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-04-29 07:08:37,232 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 07:08:37,243 sagemaker-training-toolkit INFO     No GPUs detecte

## Transform the training data

In [10]:
# Batch transformer backed by the fitted preprocessor; results are written as CSV
# under data_output_path.
transformer = sklearn_preprocessor.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    assemble_with="Line",
    accept="text/csv",
    output_path=data_output_path,
)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2021-04-29-07-09-26-986


In [11]:
# Run the preprocessor over the training set as a batch-transform job and block until done.
transformer.transform(data=train_data, content_type="text/csv", job_name=f"{base_job_name}-train")

print(f"Waiting for transform job:{transformer.latest_transform_job.job_name}")
transformer.wait()

INFO:sagemaker:Creating transform job with name: Builtin-XGB-algo-2021-04-29--07-05-13-train


...........................
.[34m2021-04-29 07:13:52,028 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 07:13:52,030 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 07:13:52,031 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[35m2021-04-29 07:13:52,028 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2021-04-29 07:13:52,030 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[35m2021-04-29 07:13:52,031 INFO - sagemaker-containers - nginx config: [0m
[35mworker_processes auto;[0m
[35mdaemon off;[0m
[35mpid /tmp/nginx.pid;[0m
[35merror_log  /dev/stderr;
[0m
[35mworker_rlimit_nofile 4096;
[0m
[35mevents 

In [12]:
# Remember the transformer's S3 output location after the training-set transform.
preprocessed_train_data = transformer.output_path

In [13]:
# Display the S3 output location captured above.
preprocessed_train_data

's3://1905-assignment2-sm/housing/datasets/output/Builtin-XGB-algo-2021-04-29--07-05-13'

## Transform the test data

In [14]:
# Run the same preprocessor over the test set as a second batch-transform job and wait for it.
transformer.transform(data=test_data, content_type="text/csv", job_name=f"{base_job_name}-test")

print(f"Waiting for transform job:{transformer.latest_transform_job.job_name}")
transformer.wait()

INFO:sagemaker:Creating transform job with name: Builtin-XGB-algo-2021-04-29--07-05-13-test


............................[34m2021-04-29 07:18:39,155 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 07:18:39,158 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2021-04-29 07:18:39,158 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect 

In [15]:
# Remember the transformer's S3 output location after the test-set transform.
# NOTE(review): this reads the same attribute as preprocessed_train_data, so both names
# presumably hold the same prefix — confirm that is intended.
preprocessed_test_data = transformer.output_path

In [16]:
# Display the S3 key prefix that holds the transform outputs.
output_data_prefix

'housing/datasets/output/Builtin-XGB-algo-2021-04-29--07-05-13'

## Upload processed data to S3

In [17]:
# Download the batch-transform output for the training set and parse it into a DataFrame.
client = boto3.client('s3')
obj = client.get_object(Bucket=bucket, Key=f'{output_data_prefix}/train_data_without_header.csv.out')
body = obj['Body']
csv_string = body.read().decode('utf-8')
# header=None: the data is written back with header=False and re-read elsewhere with
# header=None, so the first line is a record. Without it pandas would consume that
# record as column names and it would be silently dropped.
processed_train_data = pd.read_csv(StringIO(csv_string), header=None)

In [18]:
# Write the processed training set to a local header-less CSV and upload it to S3.
train_file = 'processed_train_data.csv'
processed_train_data.to_csv(train_file, index=False, header=False)
# The key matches the location the training-input and explainability cells read from
# (".../processed-train/processed_train_data.csv"). S3 keys always use "/", so the key
# is built with an f-string rather than os.path.join.
with open(train_file, 'rb') as data:
    boto3.Session().resource('s3').Bucket(bucket).upload_fileobj(
        data, f'{output_data_prefix}/processed-train/processed_train_data.csv')

In [19]:
# Download the batch-transform output for the test set and parse it into a DataFrame.
obj = client.get_object(Bucket=bucket, Key=f'{output_data_prefix}/test_data_without_header.csv.out')
body = obj['Body']
csv_string = body.read().decode('utf-8')
# header=None: the data is written back with header=False, so the first line is a
# record; without it pandas would consume that record as column names and drop it.
processed_test_data = pd.read_csv(StringIO(csv_string), header=None)

# Write a local header-less CSV and upload it under the key the validation input reads
# from (".../processed-test/processed_test_data.csv").
test_file = 'processed_test_data.csv'
processed_test_data.to_csv(test_file, index=False, header=False)
with open(test_file, 'rb') as data:
    boto3.Session().resource('s3').Bucket(bucket).upload_fileobj(
        data, f'{output_data_prefix}/processed-test/processed_test_data.csv')

## Real-time prediction using an endpoint

In [20]:
# Cell-local imports, grouped at the top of the cell.
import boto3
from time import gmtime, strftime

from sagemaker import PipelineModel
from sagemaker.deserializers import JSONDeserializer
from sagemaker.estimator import Estimator
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

timestamp_prefix = current_time

# Single-container pipeline: just the fitted scikit-learn preprocessor, asked to answer
# with text/csv by default.
scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
scikit_learn_inferencee_model.env = {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"}
model_containers = [scikit_learn_inferencee_model]

model_name = f"inference-pipeline-{current_time}"
endpoint_name = f"inference-pipeline-ep-{current_time}"

sm_model = PipelineModel(name=model_name, role=role, models=model_containers)

# Create the real-time endpoint.
predictor = sm_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
    endpoint_name=endpoint_name,
    # data_capture_config=data_capture_config
)

# One raw, un-preprocessed record in the same column order as the input CSVs.
payload = "-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY"

# Client for the endpoint: sends CSV, parses the response as JSON.
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)


In [38]:
# Send the sample record to the endpoint and print the deserialized response.
print(predictor.predict(data=payload))

{'instances': [{'features': [-1.3358644914093034, 1.0625114171921215, 0.9843617822955569, -0.7963234446647113, -0.9664904061190807, -0.9606541571538452, -0.9683302007899453, 2.3372729061643134, 0.6152810290729377, -0.07388883558315307, -1.073845989834207, 0.0, 0.0, 0.0, 1.0, 0.0]}]}


In [39]:
# Delete the endpoint by name through the low-level SageMaker client.
# NOTE(review): only delete_endpoint is called here; the endpoint configuration and model
# are presumably left in place — confirm whether they should be removed as well.
sm_client = sagemaker_session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '215f8680-8a28-4feb-9008-2fead898ae7a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '215f8680-8a28-4feb-9008-2fead898ae7a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 29 Apr 2021 07:52:29 GMT'},
  'RetryAttempts': 0}}

# Train, Track and Debug

In [None]:
# boto3 session and low-level SageMaker client used by the Experiments / Tracker calls.
sess = boto3.Session()
sm = sess.client('sagemaker')
role = get_execution_role()
# Take the region from the session so the algorithm image is resolved in the region the
# jobs actually run in; fall back to the previously hard-coded value if none is configured.
region = sess.region_name or "us-east-1"

In [None]:
# Resolve the built-in XGBoost 0.90-2 image URI for this region.
# sagemaker.image_uris.retrieve is the SDK v2 replacement for the deprecated get_image_uri.
container = sagemaker.image_uris.retrieve(framework="xgboost", region=region, version="0.90-2")

In [None]:
# Debugger save interval: passed to every CollectionConfig as "save_interval" and used
# (doubled) as the loss_not_decreasing rule's "num_steps".
save_interval = 2

In [None]:
# Display the S3 prefix defined in the config cell.
prefix

In [None]:
content_type = "text/csv"
# Training / validation channels for the XGBoost jobs, pointing at the processed CSVs.
# sagemaker.inputs.TrainingInput is the SDK v2 replacement for the deprecated s3_input.
train_input = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket}/{output_data_prefix}/processed-train/processed_train_data.csv",
    content_type='csv',
)
validation_input = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket}/{output_data_prefix}/processed-test/processed_test_data.csv",
    content_type='csv',
)

In [None]:
# Load the processed training set (used below only for its row count).
# Reads the same key as the training input channel; header=None because the file is
# header-less, so the first record is not consumed as column names.
train_df = pd.read_csv(
    f"s3://{bucket}/{output_data_prefix}/processed-train/processed_train_data.csv",
    header=None,
)

# Pre-processing Tracker

In [None]:
# Record the preprocessing step as its own trial component: the transformers used,
# the number of training rows, and the processed training file as input artifact.
with Tracker.create(display_name="Pre-Processing", sagemaker_boto_client=sm) as tracker:
    tracker.log_parameters({
        "Num_Imputer": "SimpleImputer",
        "Num_Norm": "StandardScaler",
        "Cat_Norm": "SimpleImputer",
        "Cat_Convert": "OneHotEncoder",
        "No_of_rows": str(len(train_df)),
    })
    # Single "/" after the prefix: a doubled slash would name a different S3 key than
    # the one the training channel reads.
    tracker.log_input(
        name="xgboost-track",
        media_type="s3/uri",
        value=f"s3://{bucket}/{output_data_prefix}/processed-train/processed_train_data.csv",
    )

## Create Experiment

In [None]:
# Create the experiment that groups all XGBoost trials; the epoch-seconds suffix keeps
# the name distinct between runs.
xgboost_experiment = Experiment.create(
    experiment_name="{}--{}".format(experiment_name_prefix, int(time.time())),
    description="xgboost-track",
    sagemaker_boto_client=sm,
)
print(xgboost_experiment)

In [None]:
# Maps (eta, max_depth) -> trial name; filled by the training loop and read by the
# lineage query. Must be initialised as an empty dict before the loop runs.
trial_name_map = {}

In [None]:
# Keep a handle on the tracker's trial component so the training loop can attach it
# to each trial via add_trial_component.
trial_component = tracker.trial_component

In [None]:
# Display the S3 location passed to DebuggerHookConfig as s3_output_path.
debug_path

## Train

In [None]:
# Grid search over eta x max_depth: one Experiments trial and one (asynchronous)
# XGBoost training job per combination.
for eta in [0.3, 0.4, 0.5]:
    for max_depth in [2, 4, 6]:
        base_name = f"xbg-eta-{str(eta).replace('.','-')}-max-depth-{str(max_depth).replace('.','-')}"
        time_val = f"time-{int(time.time())}"
        trial_name = base_name + "-" + time_val
        xgb_trial = Trial.create(
            trial_name=trial_name,
            experiment_name=xgboost_experiment.experiment_name,
            sagemaker_boto_client=sm,
        )
        trial_name_map[(eta, max_depth)] = trial_name

        # Associate the preprocessing trial component with the current trial.
        xgb_trial.add_trial_component(trial_component)

        # All keyword arguments (metrics, limits, Debugger hook, rules) belong inside the
        # Estimator call; SDK v2 argument names are used.
        xgboost_estimator = Estimator(
            role=role,
            base_job_name=trial_name,
            instance_count=1,
            instance_type='ml.m5.xlarge',
            image_uri=container,
            hyperparameters={
                "max_depth": str(max_depth),
                "eta": str(eta),
                "gamma": "4",
                "min_child_weight": "6",
                "subsample": "0.7",
                "silent": "0",
                "eval_metric": "rmse",
                "objective": "reg:linear",
                "num_round": "51",
            },
            enable_sagemaker_metrics=True,
            max_run=600,
            # Managed spot training is left disabled. max_wait is the spot wait timeout,
            # so enable the two lines below together — TODO confirm before turning on.
            # use_spot_instances=True,
            # max_wait=1200,
            debugger_hook_config=DebuggerHookConfig(
                s3_output_path=debug_path,
                # Same save interval for every collection that the later cells read.
                collection_configs=[
                    CollectionConfig(
                        name=collection,
                        parameters={"save_interval": str(save_interval)},
                    )
                    for collection in (
                        "metrics",
                        "feature_importance",
                        "full_shap",
                        "average_shap",
                    )
                ],
            ),
            rules=[
                Rule.sagemaker(
                    rule_configs.loss_not_decreasing(),
                    rule_parameters={
                        "collection_names": "metrics",
                        "num_steps": str(save_interval * 2),
                    },
                ),
            ],
        )

        # Start the job for THIS combination (inside the inner loop) without blocking,
        # and file it under the trial created above.
        xgboost_estimator.fit(
            {"train": train_input, "validation": validation_input},
            experiment_config={
                "TrialName": xgb_trial.trial_name,
                "TrialComponentDisplayName": "Training",
            },
            wait=False,
        )

        # Brief pause between submissions; presumably to space out API calls and keep the
        # second-resolution trial names distinct — TODO confirm.
        time.sleep(4)

## List Train Jobs

In [None]:
# Search filter: only trial components whose display name is exactly "Training".
search_expression = dict(
    Filters=[
        dict(Name="DisplayName", Operator="Equals", Value="Training"),
    ],
)

In [None]:
# Analytics view over this experiment's "Training" components, with the validation RMSE
# metric and a few hyperparameters as columns.
trial_component_analytics = ExperimentAnalytics(
    experiment_name=xgboost_experiment.experiment_name,
    sagemaker_session=Session(sess, sm),
    search_expression=search_expression,
    metric_names=["validation:rmse"],
    parameter_names=["max_depth", "eta", "silent", "gamma"],
    sort_by="metrics.validation:rmse.max",
    sort_order="Descending",
)

In [None]:
# Display the analytics table for the training trial components.
trial_component_analytics.dataframe()

## Best Training Job

In [None]:
# Rank the training jobs by their final validation RMSE, lowest first, so that row 0 is
# the best-performing job. (Sorting by the metric's StdDev would pick the most stable
# curve, not the most accurate model.)
result_df = (
    trial_component_analytics
    .dataframe(force_refresh=True)
    .sort_values("validation:rmse - Last", ascending=True)
)

In [None]:
# Display the ranked results table.
result_df

In [None]:
# The first row of the ranked table is treated as the best job; load its trial component.
best_trial_component_name = result_df["TrialComponentName"].iloc[0]
best_trial_component = TrialComponent.load(best_trial_component_name)

In [None]:
# Print the selected job's key hyperparameters, one per line.
print(
    *(best_trial_component.parameters[key] for key in ("max_depth", "eta", "min_child_weight")),
    sep="\n",
)

In [None]:
# Display the last "/"-separated segment of the best component's source ARN
# (used later as the training job name passed to Estimator.attach).
best_trial_component.source.source_arn.split("/")[-1]

## Lineage

In [None]:
# Display the (eta, max_depth) -> trial name mapping filled by the training loop.
trial_name_map

In [None]:
# List, oldest first, every trial component attached to the (eta=0.5, max_depth=6) trial.
lineage_filter = {
    "Name": "Parents.TrialName",
    "Operator": "Equals",
    "Value": trial_name_map[(0.5, 6)],
}
lineage_table = ExperimentAnalytics(
    sagemaker_session=Session(sess, sm),
    search_expression={"Filters": [lineage_filter]},
    sort_by="CreationTime",
    sort_order="Ascending",
)

lineage_table.dataframe()

# Get the Best Model

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
import boto3
from time import gmtime,strftime
from sagemaker.estimator import Estimator
from sagemaker import PipelineModel

In [None]:
# UTC timestamp string for this stage.
# NOTE(review): not referenced by any later cell visible here — confirm it is still needed.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S",gmtime())

In [None]:
# Preprocessing model, configured to answer with text/csv by default.
scikit_learn_inferencee_model = sklearn_preprocessor.create_model()
scikit_learn_inferencee_model.env = {"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"}

# Re-attach to the best training job (name = last segment of the component's source ARN)
# and build a model from it with the same default accept type.
algo_estimator = Estimator.attach(best_trial_component.source.source_arn.rsplit("/", 1)[-1])
best_algo_model = algo_estimator.create_model(env={"SAGEMAKER_DEFAULT_INVOCATIONS_ACCEPT": "text/csv"})

In [None]:
# Ordered container list: preprocessing model first, then the best XGBoost model.
model_containers = [scikit_learn_inferencee_model,best_algo_model]

# Explain Best Model

## Load Best Model Tensors

In [None]:
# Display the Debugger rule job summary for the attached (best) training job.
algo_estimator.latest_training_job.rule_job_summary()

In [None]:
# Display where the estimator reports its Debugger artifacts for the latest job.
algo_estimator.latest_job_debugger_artifacts_path()

In [None]:
# Build the Debugger output location of the best job by hand:
# <debug_path>/<training job name>/debug-output, then display it.
# NOTE(review): os.path.join is used on an s3:// URI; this yields "/" separators on
# POSIX only — confirm the notebook never runs on Windows.
best_model_track_path = os.path.join(
    debug_path,
    best_trial_component.source.source_arn.split("/")[-1],
    "debug-output")
best_model_track_path

In [None]:
# Open the saved Debugger tensors of the best job as an smdebug trial.
trial = create_trial(best_model_track_path)

In [None]:
# List every tensor name available in the trial.
trial.tensor_names()

In [None]:
# Steps at which the "train-rmse" tensor was saved.
trial.tensor("train-rmse").steps()

In [None]:
# Saved values of the "validation-rmse" tensor.
trial.tensor("validation-rmse").values()

In [None]:
# Steps at which the "validation-rmse" tensor was saved.
trial.tensor("validation-rmse").steps()

In [None]:
# Set of tensor names that belong to the average-SHAP collection.
{name for name in trial.tensor_names() if "average_shap" in name}

In [None]:
# Column names of the 16 preprocessed features, in model-input order; the position in
# this list is what the "f<index>" tensor suffixes are mapped to in the plots.
# NOTE(review): "housing_mdeian_age" looks like a typo for "housing_median_age", but other
# cells select DataFrame columns by this exact spelling, so it has to change everywhere at once.
feature_names = [
    # geographic / raw numeric columns
    "longitude", "latitude", "housing_mdeian_age",
    "total_rooms", "total_bedrooms", "population", "households", "median_income",
    # derived ratios
    "rooms_per_household", "population_per_household", "bedrooms_per_room",
    # one-hot encoded ocean-proximity categories
    "<1H OCEAN", "INLAND", "ISLAND", "NEAR BAY", "NEAR OCEAN",
]

In [None]:
def get_data(trial, tname):
    """Fetch the full history of one saved tensor.

    Args:
        trial: object exposing ``tensor(name)``; the returned tensor must offer
            ``steps()`` and ``value(step)``.
        tname: name of the tensor to read.

    Returns:
        Tuple ``(steps, values)`` where ``values[i]`` is the tensor value at ``steps[i]``.
    """
    saved = trial.tensor(tname)
    step_ids = saved.steps()
    return step_ids, list(map(saved.value, step_ids))

def plot_collection(trial, collection_name, regex='.*', figsize=(8,6)):
    """Plot every tensor of a Debugger collection against its saved steps.

    Args:
        trial: smdebug trial to read tensors from.
        collection_name: "metrics" is looked up through ``trial.collection``; any other
            value selects all tensor names that contain it as a substring.
        regex: only tensor names matching this pattern (``re.match``) are drawn.
        figsize: size of the matplotlib figure.

    Tensors whose last path segment is ``f<index>`` are labelled with the matching entry
    of ``feature_names``; every other tensor is labelled with its own name, so names
    without a feature index no longer raise an IndexError.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.despine()

    print(collection_name)
    if collection_name == "metrics":
        tensors = trial.collection(collection_name).tensor_names
    else:
        tensors = {name for name in trial.tensor_names() if collection_name in name}

    for tensor_name in sorted(tensors):
        if not re.match(regex, tensor_name):
            continue
        steps, data = get_data(trial, tensor_name)

        # Default label is the tensor name; append the readable feature name only when
        # the suffix really is a feature index within range.
        label = tensor_name
        feature_match = re.fullmatch(r"f(\d+)", tensor_name.split("/")[-1])
        if feature_match is not None:
            feature_index = int(feature_match.group(1))
            if feature_index < len(feature_names):
                label = tensor_name + "-" + feature_names[feature_index]

        ax.plot(steps, data, label=label)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlabel('Iteration')

In [None]:
def plot_feature_importance(trial, importance_type="weight"):
    """Plot the saved feature-importance tensors of one importance type.

    Args:
        trial: smdebug trial to read tensors from.
        importance_type: one of "weight", "gain", "cover", "total_gain", "total_cover".

    Raises:
        ValueError: if ``importance_type`` is not one of the supported values.
    """
    # All lower-case, so the default "weight" is accepted.
    SUPPORTED_IMPORTANCE_TYPES = ["weight", "gain", "cover", "total_gain", "total_cover"]
    if importance_type not in SUPPORTED_IMPORTANCE_TYPES:
        raise ValueError(f"{importance_type} is not one of the supported importance types.")
    plot_collection(
        trial,
        "feature_importance",
        regex=f"feature_importance/{importance_type}/.*")

## Metrics - VIZ

In [None]:
# Plot the tensors of the "metrics" collection over the saved steps.
plot_collection(trial,"metrics")

## Cover - VIZ

In [None]:
# Plot feature importance of type "cover" for every feature.
plot_feature_importance(trial,importance_type="cover")

## Average SHAP - VIZ

In [None]:
# Plot every tensor whose name contains "average_shap" over the saved steps.
plot_collection(trial,"average_shap")

## Global Explanations

In [None]:
# SHAP matrix saved at the last complete step.
shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)
# The last column is split off and used as the base value (taken from the first row);
# the remaining columns are plotted as per-feature contributions.
shap_no_base, shap_base_value = shap_values[:, :-1], shap_values[0, -1]
shap.summary_plot(shap_no_base, plot_type="bar", feature_names=feature_names)

In [None]:
# Display the S3 location that holds the processed datasets.
data_output_path

## Local explanations

In [None]:
# Load the processed, header-less training set and name its columns:
# the first column is labelled as the target, followed by the feature columns.
data = pd.read_csv(
    data_output_path + "/processed-train/processed_train_data.csv",
    header=None,
)
data.columns = ["median_house_value", *feature_names]

In [None]:
# Display the column labels assigned above.
data.columns

In [None]:
# SHAP summary plot: contributions against the actual feature values.
# feature_names lists exactly the 16 feature columns, so it doubles as the column selector.
feature_frame = data[feature_names]
shap.summary_plot(shap_no_base, feature_frame, feature_names=feature_names)

## Local explanations, for a record

In [None]:
# Initialise SHAP's JavaScript support in the notebook (presumably required for the
# interactive, non-matplotlib force plots below — see the shap documentation).
shap.initjs()

In [None]:
# Force plot explaining a single record (row 100): its SHAP contributions next to its
# feature values. `figsize` must be passed as a keyword argument.
shap.force_plot(
    shap_base_value,
    shap_no_base[100, :],
    data[feature_names].iloc[100, :],
    link="identity",
    matplotlib=True,
    text_rotation=90,
    figsize=(20, 3),
)

## Stacked force plot

In [None]:
# Pick up to 100 distinct rows for the stacked force plot.
N_ROWS = shap_no_base.shape[0]
N_SAMPLES = min(100, N_ROWS)
# Fixed seed so the plot is reproducible; replace=False so no row is drawn twice.
SAMPLE_SEED = 42
sampled_indices = np.random.default_rng(SAMPLE_SEED).choice(N_ROWS, size=N_SAMPLES, replace=False)

In [None]:
# Stacked (interactive) force plot over the sampled rows.
# feature_names lists exactly the 16 feature columns, so it doubles as the column selector.
sampled_features = data[feature_names].iloc[sampled_indices, :]
shap.force_plot(
    shap_base_value,
    shap_no_base[sampled_indices, :],
    sampled_features,
    link="identity",
)

## Outlier

In [None]:
N_OUTLIERS = 3

# Z-score of each row's total SHAP contribution.
shap_sum = shap_no_base.sum(axis=1)
z_scores = stats.zscore(shap_sum)
# argpartition leaves the N largest scores in the last N slots and the N smallest in the
# first N slots (unordered within each group): take the highest first, then the lowest.
outlier_indices = [
    *np.argpartition(z_scores, -N_OUTLIERS)[-N_OUTLIERS:].tolist(),
    *np.argpartition(z_scores, N_OUTLIERS)[:N_OUTLIERS].tolist(),
]

In [None]:
# One matplotlib force plot per outlier row: its SHAP contributions next to its
# feature values. feature_names lists exactly the 16 feature columns, so it replaces the
# repeated hard-coded column list; the unused figure counter is gone.
for outlier_index in outlier_indices:
    shap.force_plot(
        shap_base_value,
        shap_no_base[outlier_index, :],
        data[feature_names].iloc[outlier_index, :],
        matplotlib=True,
        link="identity",
        text_rotation=90,
        figsize=(20, 3),
    )
    