# Put Customer Reviews On Kinesis Data Firehose

<img src="img/use_case_1_analytics.png" width="80%" align="left">

In [1]:
import boto3
import sagemaker
import pandas as pd
import json

# The SageMaker session supplies the default bucket; the execution role is
# looked up through the SageMaker SDK.
sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Build one boto3 session and reuse it for the region lookup and for every
# service client, instead of constructing a new session for each call.
boto_session = boto3.Session()
region = boto_session.region_name

sm = boto_session.client(service_name='sagemaker', region_name=region)
firehose = boto_session.client(service_name='firehose', region_name=region)
kinesis_analytics = boto_session.client(service_name='kinesisanalytics', region_name=region)


In [2]:
# Restore `firehose_name` from IPython's %store persistence into this kernel.
%store -r firehose_name

In [3]:
# Warn (without raising) when `firehose_name` is not defined in this kernel.
if 'firehose_name' not in globals():
    print('+++++++++++++++++++++++++++++++',
          '[ERROR] Please run the notebooks in this section before you continue.',
          '+++++++++++++++++++++++++++++++',
          sep='\n')

In [4]:
# Show the delivery stream name that later cells pass as DeliveryStreamName.
print(firehose_name)

dsoaws-kinesis-data-firehose


In [5]:
# Restore `firehose_arn` from IPython's %store persistence into this kernel.
%store -r firehose_arn

In [6]:
# Warn (without raising) when `firehose_arn` is not defined in this kernel.
if 'firehose_arn' not in globals():
    print('+++++++++++++++++++++++++++++++',
          '[ERROR] Please run the notebooks in this section before you continue.',
          '+++++++++++++++++++++++++++++++',
          sep='\n')

In [7]:
# Show the restored delivery stream ARN.
print(firehose_arn)

arn:aws:firehose:us-west-2:085964654406:deliverystream/dsoaws-kinesis-data-firehose


In [8]:
# Restore `iam_role_kinesis_arn` from IPython's %store persistence into this kernel.
%store -r iam_role_kinesis_arn

In [9]:
# Warn (without raising) when `iam_role_kinesis_arn` is not defined in this kernel.
if 'iam_role_kinesis_arn' not in globals():
    print('+++++++++++++++++++++++++++++++',
          '[ERROR] Please run the notebooks in this section before you continue.',
          '+++++++++++++++++++++++++++++++',
          sep='\n')

In [10]:
# Show the restored IAM role ARN.
print(iam_role_kinesis_arn)

arn:aws:iam::085964654406:role/DSOAWS_Kinesis


In [11]:
# Restore `kinesis_data_analytics_app_name` from IPython's %store persistence into this kernel.
%store -r kinesis_data_analytics_app_name

In [12]:
# Warn (without raising) when `kinesis_data_analytics_app_name` is not defined in this kernel.
if 'kinesis_data_analytics_app_name' not in globals():
    print('+++++++++++++++++++++++++++++++',
          '[ERROR] Please run the notebooks in this section before you continue.',
          '+++++++++++++++++++++++++++++++',
          sep='\n')

In [13]:
# Show the application name that later cells pass as ApplicationName.
print(kinesis_data_analytics_app_name)

dsoaws-kinesis-data-analytics-sql-app


In [14]:
# Restore `lambda_fn_name` from IPython's %store persistence into this kernel.
%store -r lambda_fn_name

In [15]:
# Warn (without raising) when `lambda_fn_name` is not defined in this kernel.
if 'lambda_fn_name' not in globals():
    print('+++++++++++++++++++++++++++++++',
          '[ERROR] Please run the notebooks in this section before you continue.',
          '+++++++++++++++++++++++++++++++',
          sep='\n')

In [16]:
# Show the Lambda function name used later to build the Lambda logs link.
print(lambda_fn_name)

DeliverKinesisAnalyticsToCloudWatch


In [17]:
# Ask Firehose for its DirectPut delivery streams and pretty-print the raw response.
firehoses = firehose.list_delivery_streams(
    DeliveryStreamType='DirectPut'
)

print(json.dumps(firehoses, sort_keys=True, indent=4, default=str))

{
    "DeliveryStreamNames": [
        "dsoaws-kinesis-data-firehose"
    ],
    "HasMoreDeliveryStreams": false,
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "87",
            "content-type": "application/x-amz-json-1.1",
            "date": "Sat, 26 Sep 2020 20:57:30 GMT",
            "x-amz-id-2": "yC9sVy2y/mbcGgzMHvq4CG9ccmUJLDrXc2uY3Jmob03G3/DCMzEGoSdxJk8xX4i65wauQyQ9Qj6Cfs7dpuL04rw22i30CeH5",
            "x-amzn-requestid": "f006ac87-25f4-a7d0-a7ef-01fcb952f80c"
        },
        "HTTPStatusCode": 200,
        "RequestId": "f006ac87-25f4-a7d0-a7ef-01fcb952f80c",
        "RetryAttempts": 0
    }
}


# Download Dataset

In [18]:
# Copy the gzipped Digital Software reviews TSV from S3 into ./data/ using the AWS CLI.
!aws s3 cp 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz' ./data/

download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz to data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz


In [19]:
import csv
import pandas as pd

# Load the gzipped, tab-separated reviews file. QUOTE_NONE turns off quote
# handling so quote characters in the review text are read as ordinary text.
df = pd.read_csv(
    './data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
    sep='\t',
    compression='gzip',
    quoting=csv.QUOTE_NONE,
)
df.shape

(102084, 15)

In [20]:
# Display the first five reviews as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,17747349,R2EI7QLPK4LF7U,B00U7LCE6A,106182406,CCleaner Free [Download],Digital_Software,4,0,0,N,Y,Four Stars,So far so good,2015-08-31
1,US,10956619,R1W5OMFK1Q3I3O,B00HRJMOM4,162269768,ResumeMaker Professional Deluxe 18,Digital_Software,3,0,0,N,Y,Three Stars,Needs a little more work.....,2015-08-31
2,US,13132245,RPZWSYWRP92GI,B00P31G9PQ,831433899,Amazon Drive Desktop [PC],Digital_Software,1,1,2,N,Y,One Star,Please cancel.,2015-08-31
3,US,35717248,R2WQWM04XHD9US,B00FGDEPDY,991059534,Norton Internet Security 1 User 3 Licenses,Digital_Software,5,0,0,N,Y,Works as Expected!,Works as Expected!,2015-08-31
4,US,17710652,R1WSPK2RA2PDEF,B00FZ0FK0U,574904556,SecureAnywhere Intermet Security Complete 5 De...,Digital_Software,4,1,2,N,Y,Great antivirus. Worthless customer support,I've had Webroot for a few years. It expired a...,2015-08-31


In [21]:
# Preview the record format: the first review only, restricted to the four
# columns sent downstream, serialized as header-less TSV.
df_star_rating_and_review_body = df[
    ['review_id', 'star_rating', 'product_category', 'review_body']
].head(1)

df_star_rating_and_review_body.to_csv(index=False, header=None, sep='\t')

'R2EI7QLPK4LF7U\t4\tDigital_Software\tSo far so good\n'

# Check that Kinesis Data Analytics Application Is Running

In [22]:
# Fetch the current description of the Kinesis Data Analytics application.
response = kinesis_analytics.describe_application(
    ApplicationName=kinesis_data_analytics_app_name
)

In [23]:
%%time

import time

app_status = response['ApplicationDetail']['ApplicationStatus']

while app_status != 'RUNNING':
    time.sleep(5)
    response = kinesis_analytics.describe_application(
        ApplicationName=kinesis_data_analytics_app_name)
    app_status = response['ApplicationDetail']['ApplicationStatus']
    print('Application status {}'.format(app_status))

print('Application status {}'.format(app_status))

Application status RUNNING
CPU times: user 62 µs, sys: 6 µs, total: 68 µs
Wall time: 61 µs


# _Wait For The Application Status ^^ Running ^^_

# Simulate Producer Application Writing Records to the Stream

# Open Lambda Logs

In [24]:
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the /aws/lambda/<function> CloudWatch log group in the notebook's region.
display(HTML('<b>Review <a target="top" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=%252Faws%252Flambda%252F{}">Lambda Logs</a></b>'.format(region, lambda_fn_name)))


# _Keep That ^^ Link ^^ Open In Your Browser_

# Open Custom CloudWatch Metrics

In [25]:
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the AVGStarRating custom metric graph. The region is
# substituted twice: once for the console itself and once for the graph
# definition's own region setting, so the graph is not pinned to a fixed region.
display(HTML("""<b>Review <a target="top" href="https://console.aws.amazon.com/cloudwatch/home?region={}#metricsV2:graph=~(metrics~(~(~'kinesis*2fanalytics*2fAVGStarRating~'AVGStarRating~'Product*20Category~'All))~view~'timeSeries~stacked~false~start~'-PT5M~end~'P0D~region~'{}~liveData~true~stat~'Average~period~1~title~'Avg*20Star*20Rating);query=~'*7bkinesis*2fanalytics*2fAVGStarRating*2c*22Product*20Category*22*7d">CloudWatch Metrics</a></b>""".format(region, region)))


# _Keep That ^^ Link ^^ Open In Your Browser_

# Open Kinesis Data Analytics Console UI

In [26]:
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the Kinesis Data Analytics console editor for this application.
display(HTML('<b>Review <a target="top" href="https://console.aws.amazon.com/kinesisanalytics/home?region={}#/wizard/editor?applicationName={}"> Kinesis Data Analytics App</a></b>'.format(region, kinesis_data_analytics_app_name)))


# _Keep That ^^ Link ^^ Open In Your Browser To See Live Records Coming In After Running Next Cells_

# Put Records onto Firehose

In [27]:
# Fetch the delivery stream's description and pretty-print the raw response.
firehose_response = firehose.describe_delivery_stream(DeliveryStreamName=firehose_name)

print(json.dumps(firehose_response, sort_keys=True, indent=4, default=str))

{
    "DeliveryStreamDescription": {
        "CreateTimestamp": "2020-09-26 20:39:08.017000+00:00",
        "DeliveryStreamARN": "arn:aws:firehose:us-west-2:085964654406:deliverystream/dsoaws-kinesis-data-firehose",
        "DeliveryStreamEncryptionConfiguration": {
            "Status": "DISABLED"
        },
        "DeliveryStreamName": "dsoaws-kinesis-data-firehose",
        "DeliveryStreamStatus": "ACTIVE",
        "DeliveryStreamType": "DirectPut",
        "Destinations": [
            {
                "DestinationId": "destinationId-000000000001",
                "ExtendedS3DestinationDescription": {
                    "BucketARN": "arn:aws:s3:::sagemaker-us-west-2-085964654406",
                    "BufferingHints": {
                        "IntervalInSeconds": 300,
                        "SizeInMBs": 5
                    },
                    "CloudWatchLoggingOptions": {
                        "Enabled": false
                    },
                    "CompressionForma

In [28]:
# Number of consecutive reviews serialized into each Firehose record.
step = 1

# Select the payload columns once, outside the loop, rather than re-projecting
# the full DataFrame on every iteration.
df_reviews_to_send = df[['review_id',
                         'star_rating',
                         'product_category',
                         'review_body']]

# Send the first 10,000 row positions, `step` rows per put_record call.
for start_idx in range(0, 10000, step):
    end_idx = start_idx + step

    df_star_rating_and_review_body = df_reviews_to_send[start_idx:end_idx]

    # Header-less, index-less TSV: one line per review.
    reviews_tsv = df_star_rating_and_review_body.to_csv(sep='\t',
                                                        header=None,
                                                        index=False)

    response = firehose.put_record(
        Record={
            'Data': reviews_tsv.encode('utf-8')
        },
        DeliveryStreamName=firehose_name
    )

In [29]:
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the Kinesis Data Analytics console editor for this application.
display(HTML('<b>Review <a target="top" href="https://console.aws.amazon.com/kinesisanalytics/home?region={}#/wizard/editor?applicationName={}"> Kinesis Data Analytics App</a></b>'.format(region, kinesis_data_analytics_app_name)))


# Go To Kinesis Analytics UI: 

# _Note: If You See This Error `No rows in source stream`:_

<img src="img/no_rows_in_source_kinesis_firehose_stream.png" width="80%" align="left">

## _Click On The `Source` Or `Real-time analytics` Tab, Or Re-Run The `Put Records onto Firehose` Cell ^^ Above ^^_

# Go To Kinesis Analytics UI: 

In [30]:
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the Kinesis Data Analytics console editor for this application.
display(HTML('<b>Go To UI <a target="top" href="https://console.aws.amazon.com/kinesisanalytics/home?region={}#/wizard/editor?applicationName={}"> Kinesis Data Analytics App</a></b>'.format(region, kinesis_data_analytics_app_name)))


# ---- You can see our streaming review data coming in under the `Source` tab:

<img src="img/kinesis_analytics_1.png" width="80%" align="left">

# ---- Go to `Real-time analytics` tab and select `AVG_STAR_RATING_SQL_STREAM`

<img src="img/kinesis_analytics_5.png" width="80%" align="left">

# ---- Go to `Real-time analytics` tab and select `APPROXIMATE_COUNT_SQL_STREAM`

<img src="img/kinesis_analytics_4.png" width="80%" align="left">

# Go To Kinesis Analytics UI and Check Anomaly Detection Score

In [31]:
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the Kinesis Data Analytics console editor for this application.
display(HTML('<b>Go To <a target="top" href="https://console.aws.amazon.com/kinesisanalytics/home?region={}#/wizard/editor?applicationName={}"> Kinesis Data Analytics App</a></b>'.format(region, kinesis_data_analytics_app_name)))


# Create and Put Anomaly Data Onto Stream

In [32]:
import time

anomaly_step = 1

# Send 10,000 synthetic single-row records, one put_record call per record.
for start_idx in range(0, 10000, anomaly_step):
    # Epoch time in whole seconds doubles as the synthetic review_id.
    timestamp = int(time.time())

    # One-row frame with star_rating 100, far above the ratings seen in the
    # sample review rows.
    df_anomalies = pd.DataFrame(
        [
            {
                'review_id': str(timestamp),
                'star_rating': 100,
                'product_category': 'Digital_Software',
                'review_body': 'blahblah',
            },
        ],
        columns=['review_id', 'star_rating', 'product_category', 'review_body'],
    )

    # Same header-less, index-less TSV layout as the real review records.
    reviews_tsv_anomalies = df_anomalies.to_csv(index=False, header=None, sep='\t')

    response = firehose.put_record(
        DeliveryStreamName=firehose_name,
        Record={'Data': reviews_tsv_anomalies.encode('utf-8')},
    )

In [33]:
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the Kinesis Data Analytics console editor for this application.
display(HTML('<b>Go To <a target="top" href="https://console.aws.amazon.com/kinesisanalytics/home?region={}#/wizard/editor?applicationName={}"> Kinesis Data Analytics App</a></b>'.format(region, kinesis_data_analytics_app_name)))


# ---- Go to `Real-time analytics` tab and select `ANOMALY_SCORE_SQL_STREAM`

<img src="img/kinesis_analytics_3.png" width="80%" align="left">

In [34]:
#%%javascript
#Jupyter.notebook.save_checkpoint();
#Jupyter.notebook.session.delete();