Skip to content

Commit

Permalink
[COST-4481] use StringDtype(storage="pyarrow") (#4826)
Browse files Browse the repository at this point in the history
  • Loading branch information
maskarb committed Dec 7, 2023
1 parent 61de77c commit 9999949
Show file tree
Hide file tree
Showing 5 changed files with 8 additions and 5 deletions.
2 changes: 1 addition & 1 deletion koku/masu/external/downloader/aws/aws_report_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def create_daily_archives(
local_file,
chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE,
usecols=lambda x: x in use_cols,
- dtype="str",
+ dtype=pd.StringDtype(storage="pyarrow"),
) as reader:
for i, data_frame in enumerate(reader):
if data_frame.empty:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ def create_daily_archives(
{"UsageDateTime", "Date", "date", "usagedatetime"}
)[0]
with pd.read_csv(
- local_file, chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE, parse_dates=[time_interval], dtype="str"
+ local_file,
+ chunksize=settings.PARQUET_PROCESSING_BATCH_SIZE,
+ parse_dates=[time_interval],
+ dtype=pd.StringDtype(storage="pyarrow"),
) as reader:
for i, data_frame in enumerate(reader):
if data_frame.empty:
Expand Down
2 changes: 1 addition & 1 deletion koku/masu/external/downloader/gcp/gcp_report_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class GCPReportDownloaderError(Exception):

def pd_read_csv(local_file_path):
try:
- return pd.read_csv(local_file_path, dtype="str")
+ return pd.read_csv(local_file_path, dtype=pd.StringDtype(storage="pyarrow"))
except Exception as error:
LOG.error(log_json(msg="file could not be parsed", file_path=local_file_path), exc_info=error)
raise GCPReportDownloaderError(error)
Expand Down
2 changes: 1 addition & 1 deletion koku/masu/external/downloader/oci/oci_report_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def divide_csv_monthly(file_path, filename):
directory = os.path.dirname(file_path)

try:
- data_frame = pd.read_csv(file_path, dtype="str")
+ data_frame = pd.read_csv(file_path, dtype=pd.StringDtype(storage="pyarrow"))
except Exception as error:
LOG.error(f"File {file_path} could not be parsed. Reason: {error}")
raise error
Expand Down
2 changes: 1 addition & 1 deletion koku/masu/external/kafka_msg_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def divide_csv_daily(file_path: os.PathLike, manifest_id: int):
daily_files = []

try:
- data_frame = pd.read_csv(file_path, dtype="str")
+ data_frame = pd.read_csv(file_path, dtype=pd.StringDtype(storage="pyarrow"))
except Exception as error:
LOG.error(f"File {file_path} could not be parsed. Reason: {str(error)}")
raise error
Expand Down

0 comments on commit 9999949

Please sign in to comment.