# Explore Data In Redshift

In [None]:
# Install SQL Alchemy
!pip install -q SQLAlchemy==1.3.13

### Set Redshift Connection Parameters

In [None]:
# Cluster-level settings: which Redshift cluster to use and how to reach it.
redshift_cluster_identifier = "dsoaws"
redshift_host = "dsoaws"
redshift_port = "5439"  # kept as a string, not an int

# Database-level settings: the database and the schema the queries run against.
redshift_database = "dsoaws"
redshift_schema = "redshift"

# Table names, one per year, referenced as `<schema>.<table>` in the queries.
redshift_table_2014 = "amazon_reviews_tsv_2014"
redshift_table_2015 = "amazon_reviews_tsv_2015"

### Load the Redshift Secrets from Secrets Manager

In [None]:
import json
import boto3

# Fetch the stored Redshift login from AWS Secrets Manager.
secretsmanager = boto3.client("secretsmanager")
secret = secretsmanager.get_secret_value(SecretId="dsoaws_redshift_login")

# The secret string is JSON. It is indexed below as a sequence whose first element
# holds the "username" key and whose second element holds the "password" key.
cred = json.loads(secret["SecretString"])

username_entry = cred[0]
redshift_username = username_entry["username"]

password_entry = cred[1]
redshift_pw = password_entry["password"]

In [None]:
# Ask the Redshift service for the cluster's metadata.
redshift = boto3.client("redshift")
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)

# Read the endpoint address of the first cluster in the response.
first_cluster = response["Clusters"][0]
redshift_endpoint_address = first_cluster["Endpoint"]["Address"]

print(redshift_endpoint_address)

## Create the Redshift Connection

In [None]:
import awswrangler as wr

# Open a Redshift Data API connection through awswrangler, identified by
# cluster, database, and the database user read from the secret.
connection_settings = {
    "cluster_id": redshift_cluster_identifier,
    "database": redshift_database,
    "db_user": redshift_username,
}
con_redshift = wr.data_api.redshift.connect(**connection_settings)

### Using APPROXIMATE COUNT for 'blazing fast' results

The COUNT function counts the rows defined by the expression.

The COUNT function has three variations. COUNT ( * ) counts all the rows in the target table whether they include nulls or not. COUNT ( expression ) computes the number of rows with non-NULL values in a specific column or expression. COUNT ( DISTINCT expression ) computes the number of distinct non-NULL values in a column or expression.

When used with APPROXIMATE, a COUNT ( DISTINCT expression ) function uses a HyperLogLog algorithm to approximate the number of distinct non-NULL values in a column or expression. Queries that use the APPROXIMATE keyword execute much faster, with a low relative error of around 2%. Approximation is warranted for queries that return a large number of distinct values, in the millions or more per query, or per group, if there is a group by clause. For smaller sets of distinct values, in the thousands, approximation might be slower than a precise count. APPROXIMATE can only be used with COUNT ( DISTINCT ).

#### Compare the query execution times of the two queries below.

In [None]:
%%time
# Approximate number of distinct customers per product category.
approx_count_sql = """SELECT approximate count(distinct customer_id)
                        FROM {}.{}
                        GROUP BY product_category""".format(
    redshift_schema, redshift_table_2015
)
df = wr.data_api.redshift.read_sql_query(sql=approx_count_sql, con=con_redshift)

In [None]:
%%time
# Exact number of distinct customers per product category, for timing comparison
# against the APPROXIMATE variant.
exact_count_sql = """SELECT count(distinct customer_id)
                                FROM {}.{}
                                GROUP BY product_category""".format(
    redshift_schema, redshift_table_2015
)
df = wr.data_api.redshift.read_sql_query(sql=exact_count_sql, con=con_redshift)

### Let's do some Visualizations

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Query template: number of star ratings per product category, largest first.
# The two placeholders receive the schema and the table name.
ratings_per_category_template = """
SELECT product_category,
COUNT(star_rating) AS count_star_rating
FROM {}.{}
GROUP BY product_category
ORDER BY count_star_rating DESC
"""

statement = ratings_per_category_template.format(redshift_schema, redshift_table_2015)

print(statement)

In [None]:
# Run the statement over the Redshift connection and keep the result as `df`.
df = wr.data_api.redshift.read_sql_query(sql=statement, con=con_redshift)

In [None]:
# Preview the first five rows of the result.
df.head(n=5)

In [None]:
# The query groups by product_category, so the row count is the category count.
num_categories = len(df)
print(num_categories)

In [None]:
# Largest per-category rating count; the plotting cell uses it to pick the x-axis scale.
ratings_per_category = df["count_star_rating"]
max_ratings = ratings_per_category.max()
print(max_ratings)

In [None]:
# Use a taller figure when there are many categories so the bars do not get cramped.
fig_height = 10 if num_categories > 10 else 5
plt.figure(figsize=(10, fig_height))

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this matplotlib installation provides.
whitegrid_style = "seaborn-v0_8-whitegrid"
if whitegrid_style not in plt.style.available:
    whitegrid_style = "seaborn-whitegrid"
plt.style.use(whitegrid_style)

# Create Seaborn barplot (horizontal: one bar per product category)
barplot = sns.barplot(y="product_category", x="count_star_rating", data=df, saturation=1)

# Set title
plt.title("Number of Ratings per Product Category (Redshift)")

# Choose x-axis ticks and limits that match the magnitude of the largest category:
# up to 80K, up to 200K, or up to 20m ratings.
if max_ratings <= 80000:
    plt.xticks(
        [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],
        ["10K", "20K", "30K", "40K", "50K", "60K", "70K", "80K"],
    )
    plt.xlim(0, 80000)
elif max_ratings <= 200000:
    plt.xticks([50000, 100000, 150000, 200000], ["50K", "100K", "150K", "200K"])
    plt.xlim(0, 200000)
else:
    plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ["100K", "1m", "5m", "10m", "15m", "20m"])
    plt.xlim(0, 20000000)

plt.xlabel("Number of Ratings")
plt.ylabel("Product Category")

plt.tight_layout()

# Export plot if needed
# plt.savefig('ratings_per_category.png', dpi=300)

# Show the barplot. `plt.show` takes no figure/axes argument; passing the Axes
# positionally only worked by accident on the inline backend.
plt.show()

## Query Athena using Redshift Spectrum

In [None]:
# Schema and table name used by the Athena-via-Redshift-Spectrum query.
athena_schema, athena_table_name = "athena", "amazon_reviews_tsv"

In [None]:
# Query template: number of star ratings per product category, largest first.
# The two placeholders receive the Athena schema and table name.
athena_ratings_template = """
SELECT product_category, COUNT(star_rating) AS count_star_rating
FROM {}.{}
GROUP BY product_category
ORDER BY count_star_rating DESC
"""

statement = athena_ratings_template.format(athena_schema, athena_table_name)

print(statement)

In [None]:
# Run the statement over the same Redshift connection, then preview the first five rows.
df = wr.data_api.redshift.read_sql_query(sql=statement, con=con_redshift)
df.head(n=5)

In [None]:
# Derive the sizing inputs from the result currently held in `df` rather than reusing
# `num_categories` / `max_ratings`, which were computed from an earlier query result.
athena_num_categories = len(df)
athena_max_ratings = df["count_star_rating"].max()

# Use a taller figure when there are many categories so the bars do not get cramped.
fig_height = 10 if athena_num_categories > 10 else 5
plt.figure(figsize=(10, fig_height))

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this matplotlib installation provides.
whitegrid_style = "seaborn-v0_8-whitegrid"
if whitegrid_style not in plt.style.available:
    whitegrid_style = "seaborn-whitegrid"
plt.style.use(whitegrid_style)

# Create Seaborn barplot (horizontal: one bar per product category)
barplot = sns.barplot(y="product_category", x="count_star_rating", data=df, saturation=1)

# Set title
plt.title("Number of Ratings per Product Category (Athena via Redshift Spectrum)")

# Choose x-axis ticks and limits that match the magnitude of the largest category:
# up to 80K, up to 200K, or up to 20m ratings.
if athena_max_ratings <= 80000:
    plt.xticks(
        [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000],
        ["10K", "20K", "30K", "40K", "50K", "60K", "70K", "80K"],
    )
    plt.xlim(0, 80000)
elif athena_max_ratings <= 200000:
    plt.xticks([50000, 100000, 150000, 200000], ["50K", "100K", "150K", "200K"])
    plt.xlim(0, 200000)
else:
    plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ["100K", "1m", "5m", "10m", "15m", "20m"])
    plt.xlim(0, 20000000)

plt.xlabel("Number of Ratings")
plt.ylabel("Product Category")

plt.tight_layout()

# Export plot if needed
# plt.savefig('ratings_per_category.png', dpi=300)

# Show the barplot. `plt.show` takes no figure/axes argument; passing the Axes
# positionally only worked by accident on the inline backend.
plt.show()

In [None]:
# Release Resources

In [None]:
%%javascript

// Save a checkpoint of the notebook, then delete its session.
// NOTE(review): presumably deleting the session is what frees the kernel in the
// classic Notebook UI, where the `Jupyter` global exists — confirm.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // Deliberately ignored: any error thrown above is swallowed so the cell never fails.
}

In [None]:
%%html

<!-- Notice shown to the reader, followed by a hidden button tied to the "kernelmenu:shutdown" command. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Look up elements with the hidden button's class name and click the first match.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // Deliberately ignored: any error thrown above (e.g. no matching element) is swallowed.
}
</script>