In [1]:
import boto3
import pandas as pd
import io
from s3_fetcher import S3DataFetcher

  model = BedrockChat(


In [2]:
def parse_s3_path(s3_path: str):
    """
    Split an S3 path of the form 's3://bucket-name/path/to/folder'
    into its bucket and prefix.

    Args:
        s3_path: Path that begins with the 's3://' scheme.

    Returns:
        A (bucket, prefix) tuple. The prefix is everything after the first
        '/' that follows the bucket name, returned unchanged (a trailing '/'
        is kept). It is "" when the path names only a bucket.

    Raises:
        ValueError: If the path does not start with 's3://', or if no
            bucket name follows the scheme (e.g. 's3://' or 's3:///key').
    """
    scheme = "s3://"
    if not s3_path.startswith(scheme):
        raise ValueError("S3 path must start with 's3://'")
    # partition() splits on the first '/' only, so any further slashes stay
    # inside the prefix; with no '/' at all the prefix is simply "".
    bucket, _, prefix = s3_path[len(scheme):].partition("/")
    if not bucket:
        # Fail here with a clear message instead of handing an empty bucket
        # name to whatever consumes the result.
        raise ValueError("S3 path must include a bucket name after 's3://'")
    return bucket, prefix

In [3]:
# S3 folder to inspect. It is split into the bucket name and key prefix,
# which the object listing further down passes as Bucket= and Prefix=.
s3_folder_path = "s3://prod-ai-and-automation/usecase_CTVCopilot/BI/"
bucket_name, prefix = parse_s3_path(s3_folder_path)
# Echo both parts so the parse result can be checked by eye.
print("Bucket:", bucket_name)
print("Prefix:", prefix)

Bucket: prod-ai-and-automation
Prefix: usecase_CTVCopilot/BI/


In [4]:
# Collect the key of every object under the prefix by iterating over each
# page the paginator yields; a page without 'Contents' contributes nothing.
s3 = boto3.client('s3')
paginator = s3.get_paginator('list_objects_v2')
object_keys = [
    entry['Key']
    for listing_page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
    for entry in listing_page.get('Contents', [])
]

# Report the total first, then one key per line.
print(f"\nFound {len(object_keys)} objects under '{s3_folder_path}':")
for key in object_keys:
    print(" -", key)


Found 15 objects under 's3://prod-ai-and-automation/usecase_CTVCopilot/BI/':
 - usecase_CTVCopilot/BI/_SUCCESS
 - usecase_CTVCopilot/BI/_committed_4837909289728916165
 - usecase_CTVCopilot/BI/_committed_6367247307692206839
 - usecase_CTVCopilot/BI/_committed_6975755882774218683
 - usecase_CTVCopilot/BI/_committed_7913142500625727807
 - usecase_CTVCopilot/BI/_committed_vacuum2396813374514155360
 - usecase_CTVCopilot/BI/_started_4837909289728916165
 - usecase_CTVCopilot/BI/part-00000-tid-4837909289728916165-5ea6c318-c143-46a7-81aa-53e280c72c4a-739-1-c000.snappy.parquet
 - usecase_CTVCopilot/BI/part-00001-tid-4837909289728916165-5ea6c318-c143-46a7-81aa-53e280c72c4a-740-1-c000.snappy.parquet
 - usecase_CTVCopilot/BI/part-00002-tid-4837909289728916165-5ea6c318-c143-46a7-81aa-53e280c72c4a-741-1-c000.snappy.parquet
 - usecase_CTVCopilot/BI/part-00003-tid-4837909289728916165-5ea6c318-c143-46a7-81aa-53e280c72c4a-742-1-c000.snappy.parquet
 - usecase_CTVCopilot/BI/part-00004-tid-4837909289728916

In [5]:
# One of the parquet part files from the listing, used as the sample to inspect.
sample_key = "usecase_CTVCopilot/BI/part-00000-tid-4837909289728916165-5ea6c318-c143-46a7-81aa-53e280c72c4a-739-1-c000.snappy.parquet"

fetcher = S3DataFetcher()

# Ask the fetcher for five rows of the sample file and print them under a
# heading; any failure is reported as a message rather than raised.
try:
    df_sample = fetcher.sample_rows(bucket_name, sample_key, num_rows=5)
    print("Sample rows from file:", df_sample, sep="\n")
except Exception as exc:
    print(f"Error sampling rows from '{sample_key}': {exc}")

Sample rows from file:
     DSP      Period  Client_ID                          Client  \
0  yahoo  2024-01-01       3804                         MBMG-LA   
1  yahoo  2024-01-01       3804                         MBMG-LA   
2  dv360  2024-03-01         76                  LMO - Virginia   
3    ttd  2024-01-01        638  Time and Space Media - Halifax   
4    ttd  2024-01-01       1274                 iprospect (Van)   

               Agency           SF_Adv_ID                SF_Adv  Lab_Adv_ID  \
0             MBMG-LA  a0r1v00001Ovv0JAAR  Keck Medicine of USC       14017   
1             MBMG-LA  a0r1v00001Ovv0JAAR  Keck Medicine of USC       14017   
2                 LMO  a0r0Y00000ZHarDQAT           Coast Guard        2069   
3  Time + Space Media  a0r1v00001MIaBQAA1    Medavie Blue Cross       11981   
4           Iprospect  a0r1v00001I5GB2AAN      BC Liquor Stores       10612   

                Lab_Adv DSP_Adv_ID  \
0  Keck Medicine of USC   14625301   
1  Keck Medicine of USC

In [6]:
# Print the column listing the fetcher reports for the sample file; any
# failure is reported as a message rather than raised.
# NOTE(review): the helper is said to rely on a JSON conversion internally —
# not visible from this notebook, confirm in s3_fetcher.
try:
    columns_info = fetcher.list_columns_in_file(bucket_name, sample_key)
    print("Column information:", columns_info, sep="\n")
except Exception as exc:
    print(f"Error listing columns for '{sample_key}': {exc}")

Column information:
|-- DSP: object (nullable = false)
|-- Period: object (nullable = false)
|-- Client_ID: int64 (nullable = false)
|-- Client: object (nullable = false)
|-- Agency: object (nullable = false)
|-- SF_Adv_ID: object (nullable = false)
|-- SF_Adv: object (nullable = false)
|-- Lab_Adv_ID: int64 (nullable = false)
|-- Lab_Adv: object (nullable = false)
|-- DSP_Adv_ID: object (nullable = false)
|-- DSP_Adv: object (nullable = false)
|-- DSP_IO_ID: object (nullable = false)
|-- DSP_IO_Name: object (nullable = false)
|-- SF_IO_Line_Item_ID: object (nullable = false)
|-- SF_IO_Line_Item: object (nullable = false)
|-- SF_Start_Date: object (nullable = false)
|-- SF_End_Date: object (nullable = false)
|-- SF_Country: object (nullable = false)

