In [22]:
%%writefile requirements.txt
joblib
scipy
numpy
scikit-learn

Overwriting requirements.txt


In [2]:
!mkdir -p opt/ml/model  
!cp model.pkl opt/ml/model/model.pkl
!cp tfidf_vectorizer.pkl opt/ml/model/tfidf_vectorizer.pkl

In [5]:
%%writefile inference.py
import os
import json
import joblib
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Deserialize fitted model
def model_fn(model_dir):
    """
    Load the trained RandomForestClassifier model from the model directory.

    Args:
    model_dir (str): Directory where model artifacts are stored.

    Returns:
    RandomForestClassifier: Loaded model object.
    """
    model_path = os.path.join(model_dir, "model.pkl")
    model = joblib.load(model_path)
    return model

# Deserialize input data
def input_fn(request_body, request_content_type):
    """
    Deserialize the input data from the request body.

    Args:
    request_body (str): Raw request body containing JSON-formatted data.
    request_content_type (str): Content type of the request body.

    Returns:
    dict: Deserialized input data.
    """
    if request_content_type == 'application/json':
        input_data = json.loads(request_body)
        return input_data
    else:
        raise ValueError("This model only supports application/json input")

# Preprocess input data
def process_input(input_data, model):
    """
    Preprocess the input data before passing it to the model for prediction.
    Here, we transform the input data using the TF-IDF vectorizer.

    Args:
    input_data (dict): Input data dictionary containing 'url' key with list of URLs.
    model (RandomForestClassifier): Trained model object.

    Returns:
    numpy.ndarray: Processed input data ready for prediction.
    """
    X = input_data['url']
    vectorizer_path = os.path.join("opt/ml/model", "tfidf_vectorizer.pkl")
    vectorizer = joblib.load(vectorizer_path)
    X_vect = vectorizer.transform(X)
    return X_vect

# Perform inference using the model
def predict_fn(input_data, model):
    """
    Perform inference using the trained model on the preprocessed input data.

    Args:
    input_data (dict): Preprocessed input data.
    model (RandomForestClassifier): Trained model object.

    Returns:
    numpy.ndarray: Model predictions.
    """
    # Process the input data if necessary
    processed_data = process_input(input_data, model)
    # Make predictions using the model
    predictions = model.predict(processed_data)
    return predictions

# Serialize the output predictions
def output_fn(prediction, content_type):
    """
    Serialize the model predictions to JSON format.

    Args:
    prediction (numpy.ndarray): Model predictions.
    content_type (str): Expected content type of the response.

    Returns:
    str: JSON-formatted string representing the predictions.
    """
    prediction_str = prediction.tolist()
    response = {"type": prediction_str}
    return json.dumps(response)


Overwriting inference.py


In [11]:
import joblib
import os
import json
import numpy as np
import boto3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

"""
Deserialize fitted model
"""
def model_fn(model_dir):
    model = joblib.load(os.path.join(model_dir, "model.pkl"))
    return model

"""
input_fn
    request_body: The body of the request sent to the model.
    request_content_type: (string) specifies the format/variable type of the request
"""
def input_fn(request_body, request_content_type):
    if request_content_type == 'application/json':
        input_data = json.loads(request_body)
        return input_data
    else:
        raise ValueError("This model only supports application/json input")

"""
predict_fn
    input_data: returned data from input_fn above
    model (sklearn model) returned model loaded from model_fn above
"""
def predict_fn(input_data, model):
    # Process the input data if necessary
    processed_data = process_input(input_data)
    # Make predictions using the model
    predictions = model.predict(processed_data)
    return predictions

def process_input(input_data):
    # Process input data as needed before passing to the model for prediction
    X = input_data['url']
    vectorizer = joblib.load(os.path.join("opt/ml/model", "tfidf_vectorizer.pkl"))
    X_vect = vectorizer.transform(X)
    return X_vect

"""
output_fn
    prediction: the returned value from predict_fn above
    content_type: the content type the endpoint expects to be returned. Ex: JSON, string
"""
def output_fn(prediction, content_type):
    prediction_str = prediction[0]
    response = {"type": prediction_str}
    return json.dumps(response)

if __name__ == '__main__':
    # Test the function
    model_dir = "opt/ml/model/"
    model = model_fn(model_dir)
    print(predict_fn({'url': ["http://malicious-site.com"]}, model))


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


['phishing']


In [12]:
import boto3
import json
import os
import joblib
import pickle
import tarfile
import sagemaker
from sagemaker.estimator import Estimator
import time
from time import gmtime, strftime
import subprocess

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [13]:
#Setup
client = boto3.client(service_name="sagemaker")
runtime = boto3.client(service_name="sagemaker-runtime")
boto_session = boto3.session.Session()
s3 = boto_session.resource('s3')
region = boto_session.region_name
print(region)
sagemaker_session = sagemaker.Session()
role = "arn:aws:iam::381492003207:role/LabRole"

us-east-1


In [14]:
# retrieve sklearn image
image_uri = sagemaker.image_uris.retrieve(
    framework="sklearn",
    region=region,
    version="1.2-1",
    py_version="py3",
    instance_type="ml.t2.medium",
)

In [15]:
#Bucket for model artifacts
default_bucket = sagemaker_session.default_bucket()
print(default_bucket)

#Upload tar.gz to bucket
model_artifacts = f"s3://{default_bucket}/model.tar.gz"
response = s3.meta.client.upload_file('model.tar.gz', default_bucket, 'model.tar.gz')

sagemaker-us-east-1-381492003207


In [16]:
#Step 1: Model Creation
model_name = "MidMalwareDetection"
print("Model name: " + model_name)
create_model_response = client.create_model(
    ModelName=model_name,
    Containers=[
        {
            "Image": image_uri,
            "Mode": "SingleModel",
            "ModelDataUrl": model_artifacts,
            "Environment": {'SAGEMAKER_SUBMIT_DIRECTORY': model_artifacts,
                           'SAGEMAKER_PROGRAM': 'inference.py'} 
        }
    ],
    ExecutionRoleArn=role,
)
print("Model Arn: " + create_model_response["ModelArn"])

Model name: MidMalwareDetection
Model Arn: arn:aws:sagemaker:us-east-1:381492003207:model/MidMalwareDetection


In [19]:
#Step 2: EPC Creation
sklearn_epc_name = "sklearn-epc" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + "midterm-malware-detection"
endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=sklearn_epc_name,
    ProductionVariants=[
        {
            "VariantName": "sklearnvariant",
            "ModelName": model_name,
            "InstanceType": "ml.t2.medium",
            "InitialInstanceCount": 1
        },
    ],
)
print("Endpoint Configuration Arn: " + endpoint_config_response["EndpointConfigArn"])

Endpoint Configuration Arn: arn:aws:sagemaker:us-east-1:381492003207:endpoint-config/sklearn-epc2024-05-05-05-32-19midterm-malware-detection


In [20]:
#Step 3: EP Creation
endpoint_name = "sklearn-local-ep" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + "mid-malware-detection"
create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=sklearn_epc_name,
)
print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

Endpoint Arn: arn:aws:sagemaker:us-east-1:381492003207:endpoint/sklearn-local-ep2024-05-05-05-33-14mid-malware-detection


In [21]:
#Monitor creation
describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
while describe_endpoint_response["EndpointStatus"] == "Creating":
    describe_endpoint_response = client.describe_endpoint(EndpointName=endpoint_name)
    print(describe_endpoint_response["EndpointStatus"])
    time.sleep(15)
print(describe_endpoint_response)

Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
Creating
InService
{'EndpointName': 'sklearn-local-ep2024-05-05-05-33-14mid-malware-detection', 'EndpointArn': 'arn:aws:sagemaker:us-east-1:381492003207:endpoint/sklearn-local-ep2024-05-05-05-33-14mid-malware-detection', 'EndpointConfigName': 'sklearn-epc2024-05-05-05-32-19midterm-malware-detection', 'ProductionVariants': [{'VariantName': 'sklearnvariant', 'DeployedImages': [{'SpecifiedImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3', 'ResolvedImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn@sha256:20bb6714740d1b80a6b39b6ae06b928c59b40b394b083f72435b87e59dda0364', 'ResolutionTime': datetime.datetime(2024, 5, 5, 5, 33, 15, 160000, tzinfo=tzlocal())}], 'CurrentWeight': 1.0, 'DesiredWeight': 1.0, 'CurrentInstanceCount': 1, 'DesiredInstanceCount': 1}], 'EndpointStatus': 'InService', 'CreationTime

In [23]:
import json
import boto3

# Initialize the SageMaker runtime client
runtime = boto3.client('sagemaker-runtime')

# Define your input data
"""vanderbilt.rivals.com/viewcoach.asp?coach=2079&sport=1&year=2011	benign
http://peluqueriadeautor.com/index.php?option=com_virtuemart&page=shop.browse&category_id=31&Itemid=70	defacement
movies.yahoo.com/shop?d=hv&cf=info&id=1800340831	benign
cyndislist.com/us/pa/counties	benign
http://www.824555.com/app/member/SportOption.php?uid=guest&langx=gb	malware
http://www.raci.it/component/user/reset.html	defacement
https://docs.google.com/spreadsheet/viewform?formkey=dGg2Z1lCUHlSdjllTVNRUW50TFIzSkE6MQ	phishing
psychology.wikia.com/wiki/Phonemes	benign
"""
input_data = {
    'url': ["http://peluqueriadeautor.com/index.php?option=com_virtuemart&page=shop.browse&category_id=31&Itemid=70"]
}

# Convert input data to JSON string
payload = json.dumps(input_data)

# Specify the endpoint name
endpoint_name = 'sklearn-local-ep2024-05-05-05-33-14mid-malware-detection'

# Call the endpoint
response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                   ContentType='application/json',
                                   Body=payload)

# Decode and print the response
result = json.loads(response['Body'].read().decode())
print(result)

{'type': 'defacement'}
