# Part B: Athena Queries

Create a database in Athena from the partitioned *Quién es Quién en los Precios* (QQP) data stored in S3, covering 2018 to 2024.

@roman

21 apr 2024

---
# Settings

In [6]:
import os
import boto3
from pyathena import connect
import awswrangler as wr
import pandas as pd
import yaml
from datetime import datetime
from tqdm import tqdm

In [7]:
# Load the project configuration (the S3 and Athena settings read below)
# from the YAML file two directories above this notebook.
with open("../../config.yaml", mode="r") as config_stream:
    config = yaml.safe_load(config_stream)

In [14]:
# AWS settings
#
# The default region is exported BEFORE the session and any client are
# created: a client only sees the region that is configured at the moment it
# is built, so setting the variable afterwards would not affect it.
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

# Session bound to the local "arquitectura" credentials profile; every AWS
# client in this notebook is derived from it.
session = boto3.Session(profile_name="arquitectura")
s3 = session.client("s3")

# Bucket that holds the data
BUCKET_NAME = config["s3"]["bucket"]

# Folder (key prefix) inside the bucket
FOLDER_NAME = config["s3"]["folder"]

# Data set name under the folder; together with the two values above it forms
# the table location s3://{BUCKET_NAME}/{FOLDER_NAME}/{FILE}/
FILE = config["s3"]["file"]

# Database name
DATABASE_NAME = config["athena"]["db_name"]

# Table name
TABLE_NAME = config["athena"]["table_name"]

# Bucket configured under athena.bucket_queries
# (presumably where Athena query results are written -- confirm)
BUCKET_QUERY = config["athena"]["bucket_queries"]

---
# Configure Database in Athena

## S1: DB

In [10]:
# Glue client built from the profile-bound session; used below to create the
# database and the table.
glue_client = session.client(service_name="glue")

In [11]:
# Create the Glue database for the QQP tables.
#
# On a re-run of the notebook the database already exists and Glue raises
# AlreadyExistsException; that specific case is reported instead of aborting
# the run. Any other error still propagates.
try:
    response = glue_client.create_database(
        DatabaseInput={
            "Name": DATABASE_NAME,
            "Description": "Database for QQP"
        }
    )
    print(response)
except glue_client.exceptions.AlreadyExistsException:
    print(f"Glue database '{DATABASE_NAME}' already exists; skipping creation.")

{'ResponseMetadata': {'RequestId': '27ad870a-6479-4da6-8ec7-60a744dc7015', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 22 Apr 2024 02:53:57 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '27ad870a-6479-4da6-8ec7-60a744dc7015'}, 'RetryAttempts': 0}}


In [12]:
# Create the S3 bucket named by BUCKET_QUERY
# (presumably the Athena query-results bucket -- confirm) and show the reply.
response = s3.create_bucket(
    Bucket=BUCKET_QUERY,
)
print(response)

{'ResponseMetadata': {'RequestId': '898ZVP4E0MNSAGW6', 'HostId': 'dte10Xm+FzLibrBSP/dOpsGfnmHkIk+kdFoTNdYAbrluevKxfFwjyml4hZgilodBZIs2YmD1cV8=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'dte10Xm+FzLibrBSP/dOpsGfnmHkIk+kdFoTNdYAbrluevKxfFwjyml4hZgilodBZIs2YmD1cV8=', 'x-amz-request-id': '898ZVP4E0MNSAGW6', 'date': 'Mon, 22 Apr 2024 02:54:01 GMT', 'location': '/itam-analytics-roman-queries', 'server': 'AmazonS3', 'content-length': '0'}, 'RetryAttempts': 0}, 'Location': '/itam-analytics-roman-queries'}


## S2: Tables

### Create Table

In [None]:
-- External Parquet table over the QQP data in S3, partitioned by `catalog`.
-- The partition column is declared ONLY in PARTITIONED BY: listing it in the
-- regular column list as well makes the statement fail with a repeated-column
-- error.
-- NOTE(review): partitions presumably still need to be registered after
-- creation (e.g. MSCK REPAIR TABLE) -- confirm against the S3 layout.
CREATE EXTERNAL TABLE IF NOT EXISTS `profeco_db`.`profeco` (
  `product` string,
  `presentation` string,
  `brand` string,
  `category` string,
  `price` double,
  `created_at` date,
  `store` string,
  `type_of_store` string,
  `branch` string,
  `direction` string,
  `state` string,
  `city` string,
  `latitude` string,
  `longitude` string
) COMMENT 'Quien es Quien en Precios Data; from 2018 to 2024'
PARTITIONED BY (`catalog` string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 's3://itam-analytics-roman/qqp/qqp/'
TBLPROPERTIES ('classification' = 'parquet');

In [28]:
# Table schema as a list of {"Name": ..., "Type": ...} records, built from
# (column name, column type) pairs in table order.
dict_qux = [
    {"Name": col_name, "Type": col_type}
    for col_name, col_type in (
        ("product", "string"),
        ("presentation", "string"),
        ("brand", "string"),
        ("category", "string"),
        ("catalog", "string"),
        ("price", "double"),
        ("created_at", "date"),
        ("store", "string"),
        ("type_of_store", "string"),
        ("branch", "string"),
        ("direction", "string"),
        ("state", "string"),
        ("city", "string"),
        ("latitude", "string"),
        ("longitude", "string"),
    )
]

# Render every record as "<name> <type>", e.g. "price double"
columns = [" ".join((spec["Name"], spec["Type"])) for spec in dict_qux]
print(columns)

['product string', 'presentation string', 'brand string', 'category string', 'catalog string', 'price double', 'created_at date', 'store string', 'type_of_store string', 'branch string', 'direction string', 'state string', 'city string', 'latitude string', 'longitude string']


In [29]:
# Number of column records in the schema list
len(dict_qux)

15

In [None]:
product string, presentation string, brand string, category string, catalog string, price double, created_at date, store string, type_of_store string, branch string, direction string, state string, city string, latitude string, longitude string

In [23]:
# Register the QQP data set as an external Parquet table in the Glue catalog.
#
# The storage descriptor uses the Parquet input/output formats that belong to
# ParquetHiveSerDe (the same trio as the DDL above). Pairing the Parquet SerDe
# with the text input/output formats describes a table that does not match
# the Parquet files at the location.
#
# NOTE(review): the notebook intro describes the S3 data as partitioned, but
# no "PartitionKeys" are declared here and `catalog` is a regular column.
# If the files are laid out by catalog, move `catalog` from "Columns" to
# "PartitionKeys" and register the partitions -- confirm against the bucket.
response = glue_client.create_table(
    DatabaseName=DATABASE_NAME,
    TableInput={
        "Name": TABLE_NAME,
        "Description": "Quien es Quien en Precios Data; from 2018 to 2024",
        "TableType": "EXTERNAL_TABLE",
        "Parameters": {
            "classification": "parquet",   # Set classification as parquet
            "EXTERNAL": "TRUE"             # Specify it as an external table
            # "skip.header.line.count" is intentionally not set: it is a
            # text-file property and Parquet files have no header line.
        },
        "StorageDescriptor": {
            "Columns": [   # Define columns
                {"Name": "product", "Type": "string"},
                {"Name": "presentation", "Type": "string"},
                {"Name": "brand", "Type": "string"},
                {"Name": "category", "Type": "string"},
                {"Name": "catalog", "Type": "string"},
                {"Name": "price", "Type": "double"},
                {"Name": "created_at", "Type": "date"},
                {"Name": "store", "Type": "string"},
                {"Name": "type_of_store", "Type": "string"},
                {"Name": "branch", "Type": "string"},
                {"Name": "direction", "Type": "string"},
                {"Name": "state", "Type": "string"},
                {"Name": "city", "Type": "string"},
                {"Name": "latitude", "Type": "string"},
                {"Name": "longitude", "Type": "string"}
            ],
            "Location": f"s3://{BUCKET_NAME}/{FOLDER_NAME}/{FILE}/",
            "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
            "SerdeInfo": {
                "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
                "Parameters": {
                    "serialization.format": "1"
                }
            }
        }
    }
)

print(response)

{'ResponseMetadata': {'RequestId': '78c3a0f3-fdaf-41c0-864d-462f3f740b44', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 22 Apr 2024 03:51:23 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '78c3a0f3-fdaf-41c0-864d-462f3f740b44'}, 'RetryAttempts': 0}}


In [22]:
f"s3://{BUCKET_NAME}/{FOLDER_NAME}/{FILE}/"

's3://itam-analytics-roman/qqp/qqp/'