In [2]:
import boto3
import botocore
import os
import sagemaker


bucket = sagemaker.Session().default_bucket()
prefix = "sagemaker/ipinsight-uae"
execution_role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# check if the bucket exists
try:
    boto3.Session().client("s3").head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError as e:
    print(
        "Hey! You either forgot to specify your S3 bucket or you gave your bucket an invalid name!"
    )
except botocore.exceptions.ClientError as e:
    if e.response["Error"]["Code"] == "403":
        print(f"Hey! You don't have permission to access the bucket, {bucket}.")
    elif e.response["Error"]["Code"] == "404":
        print(f"Hey! Your bucket, {bucket}, doesn't exist!")
    else:
        raise
else:
    print(f"Training input/output will be stored in: s3://{bucket}/{prefix}")

Training input/output will be stored in: s3://sagemaker-us-east-1-349596211722/sagemaker/ipinsight-uae


In [3]:
from os import path

tools_bucket = f"jumpstart-cache-prod-{region}"  # Bucket containing the data generation module.
tools_prefix = "1p-algorithms-assets/ip-insights"  # Prefix for the data generation module
s3 = boto3.client("s3")

data_generation_file = "generate_data.py"  # Synthetic data generation module
script_parameters_file = "ip2asn-v4-u32.tsv.gz"

if not path.exists(data_generation_file):
    s3.download_file(tools_bucket, f"{tools_prefix}/{data_generation_file}", data_generation_file)

if not path.exists(script_parameters_file):
    s3.download_file(
        tools_bucket, f"{tools_prefix}/{script_parameters_file}", script_parameters_file
    )

In [4]:
from generate_data import generate_dataset

# We simulate traffic for 10,000 users. This should yield about 3 million log lines (~700 MB).
NUM_USERS = 10000
log_file = "ipinsights_web_traffic.log"
generate_dataset(NUM_USERS, log_file)

# Visualize a few log lines
!head $log_file

Loaded ASN List: 827696 ASNs.
Starting User Activity Simulation


100%|██████████| 10000/10000 [01:05<00:00, 152.95users/s]


Finished simulating web activity for 10000 users.
218.107.23.23 - user_126 [13/Nov/2018:04:26:21 +0000] "GET /login_success HTTP/1.1" 200 476 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/555.33 (KHTML, like Gecko) Chrome/1.1.1111.100 Safari/555.355"
5.162.246.67 - user_126 [12/Nov/2018:10:51:37 +0000] "GET /login_success HTTP/1.1" 200 476 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/555.33 (KHTML, like Gecko) Chrome/1.1.1111.100 Safari/555.355"
218.107.44.138 - user_126 [05/Nov/2018:05:04:03 +0000] "GET /login_success HTTP/1.1" 200 476 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/555.33 (KHTML, like Gecko) Chrome/1.1.1111.100 Safari/555.355"
218.107.18.13 - user_126 [10/Nov/2018:21:12:45 +0000] "GET /login_success HTTP/1.1" 200 476 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/555.33 (KHTML, like Gecko) Chrome/1.1.1111.100 Safari/555.355"
5.162.198.75 - user_126 [05/Nov/2018:04:04:12 +0000] "GET /login_s

In [5]:
import pandas as pd

df = pd.read_csv(
    log_file,
    sep=" ",
    na_values="-",
    header=None,
    names=[
        "ip_address",
        "rcf_id",
        "user",
        "timestamp",
        "time_zone",
        "request",
        "status",
        "size",
        "referer",
        "user_agent",
    ],
)
df.head()

Unnamed: 0,ip_address,rcf_id,user,timestamp,time_zone,request,status,size,referer,user_agent
0,218.107.23.23,,user_126,[13/Nov/2018:04:26:21,+0000],GET /login_success HTTP/1.1,200,476,,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...
1,5.162.246.67,,user_126,[12/Nov/2018:10:51:37,+0000],GET /login_success HTTP/1.1,200,476,,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...
2,218.107.44.138,,user_126,[05/Nov/2018:05:04:03,+0000],GET /login_success HTTP/1.1,200,476,,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...
3,218.107.18.13,,user_126,[10/Nov/2018:21:12:45,+0000],GET /login_success HTTP/1.1,200,476,,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...
4,5.162.198.75,,user_126,[05/Nov/2018:04:04:12,+0000],GET /login_success HTTP/1.1,200,476,,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...


In [6]:
# Convert time stamps to DateTime objects
df["timestamp"] = pd.to_datetime(df["timestamp"], format="[%d/%b/%Y:%H:%M:%S")

In [7]:
# Check if they are all in the same timezone
num_time_zones = len(df["time_zone"].unique())
num_time_zones

1

In [8]:
from datetime import datetime
import pytz


def apply_timezone(row):
    tz = row[1]
    tz_offset = int(tz[:3]) * 60  # Hour offset
    tz_offset += int(tz[3:5])  # Minutes offset
    return row[0].replace(tzinfo=pytz.FixedOffset(tz_offset))


if num_time_zones > 1:
    df["timestamp"] = df[["timestamp", "time_zone"]].apply(apply_timezone, axis=1)

In [9]:
df = df[(df["request"].str.startswith("GET /login_success")) & (df["status"] == 200)]


In [10]:
df = df[["user", "ip_address", "timestamp"]]


In [11]:
df["timestamp"].describe()


  if __name__ == '__main__':


count                 3300601
unique                 844979
top       2018-11-05 13:21:46
freq                       15
first     2018-11-04 00:00:01
last      2018-11-14 00:00:00
Name: timestamp, dtype: object

In [12]:
time_partition = (
    datetime(2018, 11, 11, tzinfo=pytz.FixedOffset(0))
    if num_time_zones > 1
    else datetime(2018, 11, 11)
)

train_df = df[df["timestamp"] <= time_partition]
test_df = df[df["timestamp"] > time_partition]

In [13]:
# Shuffle train data
train_df = train_df.sample(frac=1)
train_df.head()

Unnamed: 0,user,ip_address,timestamp
809068,user_2116,58.184.255.210,2018-11-09 13:36:22
2998037,user_9193,185.152.103.220,2018-11-10 17:02:53
2149373,user_6424,203.124.54.8,2018-11-04 16:40:51
1446823,user_4399,209.131.67.75,2018-11-09 20:33:26
1344817,user_3842,94.19.229.194,2018-11-07 08:22:10


In [14]:
# Output dataset as headerless CSV
train_data = train_df.to_csv(index=False, header=False, columns=["user", "ip_address"])

In [15]:
# Upload data to S3 key
train_data_file = "train.csv"
key = os.path.join(prefix, "train", train_data_file)
s3_train_data = f"s3://{bucket}/{key}"

print(f"Uploading data to: {s3_train_data}")
boto3.resource("s3").Bucket(bucket).Object(key).put(Body=train_data)

# Configure SageMaker IP Insights Input Channels
input_data = {
    "train": sagemaker.session.s3_input(
        s3_train_data, distribution="FullyReplicated", content_type="text/csv"
    )
}

Uploading data to: s3://sagemaker-us-east-1-349596211722/sagemaker/ipinsight-uae/train/train.csv


The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [16]:
from sagemaker.amazon.amazon_estimator import get_image_uri

image = get_image_uri(boto3.Session().region_name, "ipinsights")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


In [19]:
# Set up the estimator with training job configuration
ip_insights = sagemaker.estimator.Estimator(
    image,
    execution_role,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=sagemaker.Session(),
)

# Configure algorithm-specific hyperparameters
ip_insights.set_hyperparameters(
    num_entity_vectors="20000",
    random_negative_sampling_rate="5",
    vector_dim="128",
    mini_batch_size="1000",
    epochs="5",
    learning_rate="0.01",
)

# Start the training job (should take about ~1.5 minute / epoch to complete)
ip_insights.fit(input_data)

2021-09-26 09:49:15 Starting - Starting the training job...
2021-09-26 09:49:39 Starting - Launching requested ML instancesProfilerReport-1632649755: InProgress
...
2021-09-26 09:50:13 Starting - Preparing the instances for training.........
2021-09-26 09:51:40 Downloading - Downloading input data
2021-09-26 09:51:40 Training - Downloading the training image......
2021-09-26 09:52:40 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/26/2021 09:52:34 INFO 140486137542464] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'batch_metrics_publish_interval': '1000', 'epochs': '10', 'learning_rate': '0.001', 'mini_batch_size': '5000', 'num_entity_vectors': '100000', 'num_ip_encoder_layers': '1', 'random_negative_sampling_rate': '1', 'shuffled_negative_sampling_rate': '1', 'vector_dim': '

In [20]:
print(f"Training job name: {ip_insights.latest_training_job.job_name}")


Training job name: ipinsights-2021-09-26-09-49-15-303


In [21]:
predictor = ip_insights.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")


-------------!

In [22]:
print(f"Endpoint name: {predictor.endpoint}")


The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Endpoint name: ipinsights-2021-09-26-10-05-12-084


In [23]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.serializer = csv_serializer
predictor.deserializer = json_deserializer

In [24]:
inference_data = [(data[0], data[1]) for data in train_df[:5].values]
predictor.predict(
    inference_data, initial_args={"ContentType": "text/csv", "Accept": "application/json"}
)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


{'predictions': [{'dot_product': 0.2648221254348755},
  {'dot_product': 2.234119415283203},
  {'dot_product': 1.3333816528320312},
  {'dot_product': 2.1200246810913086},
  {'dot_product': 4.378648281097412}]}

In [25]:
predictor.predict(
    inference_data,
    initial_args={"ContentType": "text/csv", "Accept": "application/json; verbose=True"},
)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


{'predictions': [{'dot_product': 0.2648221254348755,
   'ip_embedding': [1.0634498596191406,
    0.9177546501159668,
    0.1376996487379074,
    -0.08418992161750793,
    -0.3245652914047241,
    0.6578858494758606,
    0.61216139793396,
    -0.3860875368118286,
    -0.04040323197841644,
    0.5282613039016724,
    0.11067644506692886,
    0.3585263788700104,
    -0.4301910996437073,
    0.18772876262664795,
    0.18719545006752014,
    -0.41265755891799927,
    -0.0993213951587677,
    0.3405100107192993,
    -0.24680255353450775,
    -0.5901927351951599,
    -0.25176435708999634,
    0.4046957492828369,
    -0.43084537982940674,
    -0.05121690779924393,
    -0.25743603706359863,
    -0.8286483287811279,
    -0.3503531217575073,
    0.8528644442558289,
    0.8747204542160034,
    0.28543195128440857,
    -0.4783822298049927,
    0.10015376657247543,
    0.6241172552108765,
    -0.011017639189958572,
    0.4101026654243469,
    -0.17794351279735565,
    -0.07609633356332779,
    0.145