In [2]:
import io

import boto3
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import sagemaker as sage
from IPython.display import display

In [3]:
S3_BUCKET_NAME = 'newfdsb'

In [4]:
boto_session = boto3.Session(
    aws_access_key_id='',
    aws_secret_access_key='',
    region_name='us-west-2'
)
boto_session

Session(region_name='us-west-2')

In [5]:
sess = sage.Session(boto_session)
s3 = sess.boto_session.resource('s3')

## Parquet files downloading:

In [6]:
def read_parquet_from_s3_as_df(s3_resource, bucket_name: str, file_key: str) -> pd.DataFrame:
    parquet_obj = s3_resource.Object(bucket_name, file_key)
    buffer = io.BytesIO()
    parquet_obj.download_fileobj(buffer)
    df = pd.read_parquet(buffer)
    buffer.close()
    return df

In [7]:
bcns_df = read_parquet_from_s3_as_df(s3, S3_BUCKET_NAME, 'parquet/bcns/year=2019/day=17914/data.parquet')
bcns_df.head()

Unnamed: 0,txid,type,win_price,appuid,campaign_item_id,creative_category,creative_id,domain
0,61da1f55-d040-47ec-adf7-a7d57c55e052,click,,,,,,
1,deaf9911-054e-4531-ba01-074bb4c171fb,bid,0.0,3gYaYlNEbN,1306029.0,8CgznkdgQk,qnROFVHsLj,JyyQQuKacg
2,9b9615f7-e86e-4719-8f8f-d06f5fd19b13,click,,,,,,
3,e949655e-f503-49b2-945d-4f141704cf6e,click,,,,,,
4,05a68561-e08f-460a-95c3-b912eb519975,bid,0.0,3gYaYlNEbN,1306029.0,8CgznkdgQk,qnROFVHsLj,JyyQQuKacg


In [8]:
impressions_df = read_parquet_from_s3_as_df(s3, S3_BUCKET_NAME, 'parquet/impressions/year=2019/day=17914/data.parquet')
impressions_df.head()

Unnamed: 0,txid,campaign_item_id,domain,creative_id,creative_category,appuid,win_price
0,e6b4c5cc-7fb3-4527-924d-3588e496d234,1306029,JyyQQuKacg,qnROFVHsLj,8CgznkdgQk,3gYaYlNEbN,10
1,8529bf8a-c127-4a20-8cbc-75019dd203ff,1306029,JyyQQuKacg,qnROFVHsLj,8CgznkdgQk,3gYaYlNEbN,10
2,a826d470-53e1-4ed7-9494-b61d51e6cd65,1306029,JyyQQuKacg,qnROFVHsLj,8CgznkdgQk,3gYaYlNEbN,10
3,817e4447-ea40-49b1-b55a-5cf2456ea0b3,1306029,JyyQQuKacg,qnROFVHsLj,8CgznkdgQk,3gYaYlNEbN,10
4,698aa9b6-c449-4f35-9514-2faea4432851,1306029,JyyQQuKacg,qnROFVHsLj,8CgznkdgQk,3gYaYlNEbN,10


## Prepairing dumb training data:

Categorical to numeric mapping:

In [9]:
categorical_mappings = {}
for column_name in ['campaign_item_id', 'domain', 'creative_id', 'creative_category']:
    unique_categories = set(bcns_df[column_name].dropna().unique()) \
        | set(impressions_df[column_name].dropna().unique())
    categorical_mappings[column_name] = dict(zip(unique_categories, range(len(unique_categories))))
    
del column_name, unique_categories

In [10]:
categorical_mappings

{'campaign_item_id': {1690211.0: 0,
  1216772.0: 1,
  1306029.0: 2,
  1042798.0: 3,
  1708722.0: 4,
  1843637.0: 5},
 'domain': {'JyyQQuKacg': 0,
  '6UHb0iRpzo': 1,
  'YihcuvbEcN': 2,
  'kPBDkx64Sz': 3,
  'eCUGJHWViJ': 4,
  '0N04TrwhYj': 5},
 'creative_id': {'qnROFVHsLj': 0,
  'Eds4RcZYi8': 1,
  'BktGCSF3p1': 2,
  'cyStkGtLf0': 3,
  'mUIsTWTuvz': 4,
  'Kg2LsaXYlU': 5},
 'creative_category': {'DAqWva5Jz3': 0,
  'cZheiXv0Ec': 1,
  'TIkVjf8V1h': 2,
  'ohePbhPYsK': 3,
  'wJgxfkbP4x': 4,
  '8CgznkdgQk': 5}}

Negative data:

In [11]:
neg_df = bcns_df[bcns_df.type == 'bid'][['campaign_item_id', 'domain', 'creative_id', 'creative_category', 'win_price']]

for column_name, col_mapping in categorical_mappings.items():
    neg_df[column_name] = neg_df[column_name].apply(lambda x: col_mapping[x])

x_neg = neg_df.values.astype('float32')
y_neg = np.zeros(len(x_neg), dtype='float32')

Positive data:

In [12]:
pos_df = impressions_df[['campaign_item_id', 'domain', 'creative_id', 'creative_category', 'win_price']]

for column_name, col_mapping in categorical_mappings.items():
    pos_df[column_name] = pos_df[column_name].apply(lambda x: col_mapping[x])

x_pos = pos_df.values.astype('float32')
y_pos = np.ones(len(x_pos), dtype='float32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Merge, shuffle and split:

In [13]:
def joint_shuffle(*arrays: np.ndarray) -> tuple:
    permuted_indexes = np.random.permutation(arrays[0].shape[0])
    return tuple(arr[permuted_indexes] for arr in arrays)

In [14]:
X, y = joint_shuffle(np.concatenate((x_pos, x_neg)), np.concatenate((y_pos, y_neg)))

train_size = int(0.9 * X.shape[0])

X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

## Data conversion and uploading:

In [15]:
def convert_and_upload(features: np.ndarray, labels: np.ndarray, s3_resource, bucket_name: str, file_key: str):
    with io.BytesIO() as buffer:
        sage.amazon.common.write_numpy_to_dense_tensor(buffer, features, labels)
        buffer.seek(0)
        s3_resource.Object(bucket_name, file_key).upload_fileobj(buffer)

In [16]:
%%time

train_data_key = 'ml/data/train/recordio-data'
convert_and_upload(X_train, y_train, s3, S3_BUCKET_NAME, train_data_key)

CPU times: user 643 ms, sys: 0 ns, total: 643 ms
Wall time: 747 ms


In [17]:
%%time

test_data_key = 'ml/data/test/recordio-data'
convert_and_upload(X_test, y_test, s3, S3_BUCKET_NAME, test_data_key)

CPU times: user 78.2 ms, sys: 0 ns, total: 78.2 ms
Wall time: 261 ms


## Training model:

In [18]:
container = sage.amazon.amazon_estimator.get_image_uri(boto_session.region_name, 'linear-learner')

In [19]:
%%time

linear = sage.estimator.Estimator(container,
                                  sage.get_execution_role(), 
                                  train_instance_count=1, 
                                  train_instance_type='ml.m5.large',
                                  output_path=f's3://{S3_BUCKET_NAME}/ml/model',
                                  sagemaker_session=sess)

linear.set_hyperparameters(feature_dim=X_train.shape[1],
                           predictor_type='binary_classifier',
                           mini_batch_size=128, 
                           epochs=1)



CPU times: user 85.8 ms, sys: 746 µs, total: 86.5 ms
Wall time: 934 ms


In [20]:
%%time

linear.fit({
    'train': f's3://{S3_BUCKET_NAME}/{train_data_key}', 
    'validation': f's3://{S3_BUCKET_NAME}/{test_data_key}', 
})

INFO:sagemaker:Creating training-job with name: linear-learner-2019-01-21-11-57-23-259


2019-01-21 11:57:23 Starting - Starting the training job...
2019-01-21 11:57:25 Starting - Launching requested ML instances......
2019-01-21 11:58:24 Starting - Preparing the instances for training...
2019-01-21 11:59:19 Downloading - Downloading input data...
2019-01-21 11:59:48 Training - Downloading the training image..
[31mDocker entrypoint called with argument(s): train[0m
[31m[01/21/2019 12:00:05 INFO 139720803370816] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'0.01', u'lr_scheduler_minimum_lr': u'auto', u'target_recall

Billable seconds: 63
CPU times: user 359 ms, sys: 23.7 ms, total: 383 ms
Wall time: 3min 11s


## Inference:

In [21]:
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: linear-learner-2019-01-21-12-04-06-996
INFO:sagemaker:Creating endpoint with name linear-learner-2019-01-21-11-57-23-259


--------------------------------------------------------------------------!

In [22]:
linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = sage.predictor.csv_serializer
linear_predictor.deserializer = sage.predictor.json_deserializer

We predict probability of winning for the range of `win_price` with other features fixed:

In [23]:
win_price_range = np.arange(0, 101, 10)
win_price_range

array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

In [24]:
predict_features = np.hstack([
    np.tile(X_test[0, :4], (len(win_price_range), 1)), 
    win_price_range.reshape((-1, 1))
])
predict_features

array([[  0.,   2.,   1.,   1.,   0.],
       [  0.,   2.,   1.,   1.,  10.],
       [  0.,   2.,   1.,   1.,  20.],
       [  0.,   2.,   1.,   1.,  30.],
       [  0.,   2.,   1.,   1.,  40.],
       [  0.,   2.,   1.,   1.,  50.],
       [  0.,   2.,   1.,   1.,  60.],
       [  0.,   2.,   1.,   1.,  70.],
       [  0.,   2.,   1.,   1.,  80.],
       [  0.,   2.,   1.,   1.,  90.],
       [  0.,   2.,   1.,   1., 100.]])

Prediction on batch:

In [25]:
results = linear_predictor.predict(predict_features)

In [26]:
results

{'predictions': [{'score': 0.3523321747779846, 'predicted_label': 0.0},
  {'score': 0.6477640867233276, 'predicted_label': 1.0},
  {'score': 0.8614343404769897, 'predicted_label': 1.0},
  {'score': 0.9545783996582031, 'predicted_label': 1.0},
  {'score': 0.9861197471618652, 'predicted_label': 1.0},
  {'score': 0.9958535432815552, 'predicted_label': 1.0},
  {'score': 0.9987698197364807, 'predicted_label': 1.0},
  {'score': 0.9996358156204224, 'predicted_label': 1.0},
  {'score': 0.9998922348022461, 'predicted_label': 1.0},
  {'score': 0.9999681711196899, 'predicted_label': 1.0},
  {'score': 0.9999905824661255, 'predicted_label': 1.0}]}

In [36]:
def pretty_print_results(features_to_predict: np.ndarray, predictions_list: list):
    for pred_idx in np.flipud(np.argsort([item['score'] for item in predictions_list])):
        print(f"""Bid price: {features_to_predict[pred_idx][4]}, score: {predictions_list[pred_idx]['score']} - {'win' if predictions_list[pred_idx]['predicted_label'] == 1. else 'loss'}""")

In [38]:
pretty_print_results(predict_features, results['predictions'])

Bid price: 100.0, score: 0.9999905824661255 - win
Bid price: 90.0, score: 0.9999681711196899 - win
Bid price: 80.0, score: 0.9998922348022461 - win
Bid price: 70.0, score: 0.9996358156204224 - win
Bid price: 60.0, score: 0.9987698197364807 - win
Bid price: 50.0, score: 0.9958535432815552 - win
Bid price: 40.0, score: 0.9861197471618652 - win
Bid price: 30.0, score: 0.9545783996582031 - win
Bid price: 20.0, score: 0.8614343404769897 - win
Bid price: 10.0, score: 0.6477640867233276 - win
Bid price: 0.0, score: 0.3523321747779846 - loss


## Delete endpoint:

In [39]:
linear_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint with name: linear-learner-2019-01-21-11-57-23-259
