Skip to content

Commit

Permalink
Factor sql maps into files and build less SQL in python
Browse files Browse the repository at this point in the history
  • Loading branch information
cgoodfred committed Dec 13, 2023
1 parent a9e27af commit 593c1dd
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 80 deletions.
86 changes: 12 additions & 74 deletions koku/subs/subs_data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,51 +32,6 @@
Provider.PROVIDER_AZURE: AZURE_TABLE,
}

ID_COLUMN_MAP = {
Provider.PROVIDER_AWS: "lineitem_usageaccountid",
Provider.PROVIDER_AZURE: "COALESCE(NULLIF(subscriptionid, ''), subscriptionguid)",
}

RECORD_FILTER_MAP = {
Provider.PROVIDER_AWS: (
" lineitem_productcode = 'AmazonEC2' AND lineitem_lineitemtype IN ('Usage', 'SavingsPlanCoveredUsage') "
"AND product_vcpu != '' AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0"
),
Provider.PROVIDER_AZURE: (
" metercategory = 'Virtual Machines' AND chargetype = 'Usage' "
"AND json_extract_scalar(lower(additionalinfo), '$.vcpus') IS NOT NULL "
"AND json_extract_scalar(lower(tags), '$.com_redhat_rhel') IS NOT NULL"
),
}

RESOURCE_ID_FILTER_MAP = {
Provider.PROVIDER_AWS: (
" AND lineitem_productcode = 'AmazonEC2' "
"AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0 AND lineitem_usageaccountid = {{usage_account}}"
),
Provider.PROVIDER_AZURE: (
" AND metercategory = 'Virtual Machines' "
"AND json_extract_scalar(lower(additionalinfo), '$.vcpus') IS NOT NULL "
"AND json_extract_scalar(lower(tags), '$.com_redhat_rhel') IS NOT NULL "
"AND (subscriptionid = {{usage_account}} or subscriptionguid = {{usage_account}}) "
),
}

RESOURCE_SELECT_MAP = {
Provider.PROVIDER_AWS: " SELECT lineitem_resourceid, max(lineitem_usagestartdate) ",
Provider.PROVIDER_AZURE: " SELECT coalesce(NULLIF(resourceid, ''), instancename), date_add('day', -1, max(coalesce(date, usagedatetime))) ", # noqa E501
}

RESOURCE_ID_GROUP_BY_MAP = {
Provider.PROVIDER_AWS: " GROUP BY lineitem_resourceid",
Provider.PROVIDER_AZURE: " GROUP BY resourceid, instancename",
}

RESOURCE_ID_EXCLUSION_CLAUSE_MAP = {
Provider.PROVIDER_AWS: " AND lineitem_resourceid NOT IN {{excluded_ids | inclause}} ",
Provider.PROVIDER_AZURE: " and coalesce(NULLIF(resourceid, ''), instancename) NOT IN {{excluded_ids | inclause}} ",
}

RESOURCE_ID_SQL_CLAUSE_MAP = {
Provider.PROVIDER_AWS: (
" ( lineitem_resourceid = {{{{ rid_{0} }}}} "
Expand Down Expand Up @@ -124,13 +79,7 @@ def __init__(self, tracing_id, context):
self.context = context
# The following variables all change depending on the provider type to run the correct SQL
self.table = TABLE_MAP.get(self.provider_type)
self.id_column = ID_COLUMN_MAP.get(self.provider_type)
self.provider_where_clause = RECORD_FILTER_MAP.get(self.provider_type)
self.resource_select_sql = RESOURCE_SELECT_MAP.get(self.provider_type)
self.resource_id_where_clause = RESOURCE_ID_FILTER_MAP.get(self.provider_type)
self.resource_id_group_by = RESOURCE_ID_GROUP_BY_MAP.get(self.provider_type)
self.resource_id_sql_clause = RESOURCE_ID_SQL_CLAUSE_MAP.get(self.provider_type)
self.resource_id_exclusion_clause = RESOURCE_ID_EXCLUSION_CLAUSE_MAP.get(self.provider_type)
self.post_or_clause_sql = POST_OR_CLAUSE_SQL_MAP.get(self.provider_type)

@cached_property
Expand Down Expand Up @@ -177,23 +126,18 @@ def determine_ids_for_provider(self, year, month):
excluded_ids = list(
SubsIDMap.objects.exclude(source_uuid=self.provider_uuid).values_list("usage_id", flat=True)
)
sql = (
"SELECT DISTINCT {{id_column | sqlsafe}} FROM hive.{{schema | sqlsafe}}.{{table | sqlsafe}} WHERE"
" source={{source_uuid}} AND year={{year}} AND month={{month}}"
)
if excluded_ids:
sql += " AND {{id_column | sqlsafe}} NOT IN {{excluded_ids | inclause}}"
sql_file = f"trino_sql/{self.provider_type.lower()}_determine_ids.sql"
summary_sql = pkgutil.get_data("subs", sql_file)
summary_sql = summary_sql.decode("utf-8")
sql_params = {
"schema": self.schema,
"source_uuid": self.provider_uuid,
"year": year,
"month": month,
"excluded_ids": excluded_ids,
"id_column": self.id_column,
"table": self.table,
}
ids = self._execute_trino_raw_sql_query(
sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_ids_for_provider"
summary_sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_ids_for_provider"
)
id_list = []
bulk_maps = []
Expand All @@ -213,15 +157,15 @@ def determine_line_item_count(self, where_clause, sql_params):

def determine_where_clause_and_params(self, year, month):
"""Determine the where clause to use when processing subs data"""
where_clause = "WHERE source={{source_uuid}} AND year={{year}} AND month={{month}} AND"
# different provider types have different required filters here
where_clause += self.provider_where_clause
sql_file = f"trino_sql/{self.provider_type.lower()}_subs_where_clause.sql"
where_clause_sql = pkgutil.get_data("subs", sql_file)
where_clause_sql = where_clause_sql.decode("utf-8")
sql_params = {
"source_uuid": self.provider_uuid,
"year": year,
"month": month,
}
return where_clause, sql_params
return where_clause_sql, sql_params

def get_resource_ids_for_usage_account(self, usage_account, year, month):
"""Determine the relevant resource ids and end time to process to for each resource id."""
Expand All @@ -230,25 +174,19 @@ def get_resource_ids_for_usage_account(self, usage_account, year, month):
excluded_ids = list(
SubsLastProcessed.objects.exclude(source_uuid=self.provider_uuid).values_list("resource_id", flat=True)
)
sql = self.resource_select_sql + (
" FROM hive.{{schema | sqlsafe}}.{{table | sqlsafe}} WHERE"
" source={{source_uuid}} AND year={{year}} AND month={{month}}"
)
sql += self.resource_id_where_clause
if excluded_ids:
sql += self.resource_id_exclusion_clause
sql += self.resource_id_group_by
sql_file = f"trino_sql/{self.provider_type.lower()}_determine_rids_for_account.sql"
summary_sql = pkgutil.get_data("subs", sql_file)
summary_sql = summary_sql.decode("utf-8")
sql_params = {
"schema": self.schema,
"source_uuid": self.provider_uuid,
"year": year,
"month": month,
"excluded_ids": excluded_ids,
"usage_account": usage_account,
"table": self.table,
}
ids = self._execute_trino_raw_sql_query(
sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_rids_for_provider"
summary_sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_rids_for_provider"
)
return ids

Expand Down
11 changes: 5 additions & 6 deletions koku/subs/test/test_subs_data_extractor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import pkgutil
import uuid
from datetime import timedelta
from unittest.mock import MagicMock
Expand Down Expand Up @@ -92,13 +93,11 @@ def test_determine_where_clause_and_params(self):
"year": year,
"month": month,
}
expected_clause = (
"WHERE source={{source_uuid}} AND year={{year}} AND month={{month}} AND"
" lineitem_productcode = 'AmazonEC2' AND lineitem_lineitemtype IN ('Usage', 'SavingsPlanCoveredUsage') AND"
" product_vcpu != '' AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0"
)
sql_file = f"trino_sql/{self.extractor.provider_type.lower()}_subs_where_clause.sql"
expected_clause_sql = pkgutil.get_data("subs", sql_file)
expected_clause_sql = expected_clause_sql.decode("utf-8")
actual_clause, actual_params = self.extractor.determine_where_clause_and_params(year, month)
self.assertEqual(expected_clause, actual_clause)
self.assertEqual(expected_clause_sql, actual_clause)
self.assertEqual(expected_sql_params, actual_params)

@patch("subs.subs_data_extractor.SUBSDataExtractor.bulk_update_latest_processed_time")
Expand Down
11 changes: 11 additions & 0 deletions koku/subs/trino_sql/aws_determine_ids.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
DISTINCT lineitem_usageaccountid
FROM
hive.{{schema | sqlsafe}}.aws_line_items
WHERE
source = {{source_uuid}}
AND year = {{year}}
AND month = {{month}}
{ % if excluded_ids % }
AND lineitem_usageaccountid NOT IN {{excluded_ids | inclause}}
{ % endif % }
20 changes: 20 additions & 0 deletions koku/subs/trino_sql/aws_determine_rids_for_account.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
SELECT
lineitem_resourceid,
max(lineitem_usagestartdate)
FROM
hive.{{schema | sqlsafe}}.aws_line_items
WHERE
source = {{source_uuid}}
AND year = {{year}}
AND month = {{month}}
AND lineitem_productcode = 'AmazonEC2'
AND strpos(
lower(resourcetags),
'com_redhat_rhel'
) > 0
AND lineitem_usageaccountid = {{usage_account}}
{ % if excluded_ids % }
AND lineitem_resourceid NOT IN {{excluded_ids | inclause}}
{ % endif % }
GROUP BY
lineitem_resourceid
13 changes: 13 additions & 0 deletions koku/subs/trino_sql/aws_subs_where_clause.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
WHERE
source = {{source_uuid}}
AND year = {{year}}
AND month = {{month}}
AND lineitem_productcode = 'AmazonEC2'
AND lineitem_lineitemtype IN (
'Usage', 'SavingsPlanCoveredUsage'
)
AND product_vcpu != ''
AND strpos(
lower(resourcetags),
'com_redhat_rhel'
) > 0
17 changes: 17 additions & 0 deletions koku/subs/trino_sql/azure_determine_ids.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
SELECT
DISTINCT COALESCE(
NULLIF(subscriptionid, ''),
subscriptionguid
)
FROM
hive.{{schema | sqlsafe}}.azure_line_items
WHERE
source = {{source_uuid}}
AND year = {{year}}
AND month = {{month}}
{ % if excluded_ids % }
AND COALESCE(
NULLIF(subscriptionid, ''),
subscriptionguid
) NOT IN {{excluded_ids | inclause}}
{ % endif % }
40 changes: 40 additions & 0 deletions koku/subs/trino_sql/azure_determine_rids_for_account.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
SELECT
coalesce(
NULLIF(resourceid, ''),
instancename
),
date_add(
'day',
-1,
max(
coalesce(date, usagedatetime)
)
)
FROM
hive.{{schema | sqlsafe}}.azure_line_items
WHERE
source = {{source_uuid}}
AND year = {{year}}
AND month = {{month}}
AND metercategory = 'Virtual Machines'
AND json_extract_scalar(
lower(additionalinfo),
'$.vcpus'
) IS NOT NULL
AND json_extract_scalar(
lower(tags),
'$.com_redhat_rhel'
) IS NOT NULL
AND (
subscriptionid = {{usage_account}}
or subscriptionguid = {{usage_account}}
)
{ % if excluded_ids % }
AND coalesce(
NULLIF(resourceid, ''),
instancename
) NOT IN {{excluded_ids | inclause}}
{ % endif % }
GROUP BY
resourceid,
instancename
14 changes: 14 additions & 0 deletions koku/subs/trino_sql/azure_subs_where_clause.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
WHERE
source = {{source_uuid}}
AND year = {{year}}
AND month = {{month}}
AND metercategory = 'Virtual Machines'
AND chargetype = 'Usage'
AND json_extract_scalar(
lower(additionalinfo),
'$.vcpus'
) IS NOT NULL
AND json_extract_scalar(
lower(tags),
'$.com_redhat_rhel'
) IS NOT NULL

0 comments on commit 593c1dd

Please sign in to comment.