In [9]:

import boto3
import pandas as pd
from faker import Faker
import random
from io import StringIO
from datetime import datetime

In [10]:
# Initialize the module-level helpers shared by the cells below:
#   fake - Faker instance used by generate_cdc_order_data for ids and dates
#   s3   - boto3 S3 client used by upload_to_s3
# No seed is set in the visible cells, so generated values differ between runs.
# NOTE(review): boto3 presumably picks up credentials/region from the
# environment or AWS config - confirm before running.
fake = Faker()
s3 = boto3.client('s3')

In [11]:
def generate_cdc_order_data(num_rows=500):
    """Build a DataFrame of synthetic CDC-style order records.

    Each row holds random ids from the module-level ``fake`` Faker instance,
    a random status/quantity/price, a derived ``total_amount`` and a
    ``cdc_timestamp`` taken from ``datetime.now()`` when the row is created
    (naive local time).

    Args:
        num_rows: Number of order rows to generate. Zero or a negative value
            produces an empty frame.

    Returns:
        pandas.DataFrame with the columns ``order_id``, ``customer_id``,
        ``order_date``, ``status``, ``product_id``, ``quantity``, ``price``,
        ``total_amount`` and ``cdc_timestamp``, in that order. The columns are
        present even when no rows are generated, so a CSV written from the
        result always carries a header.
    """
    columns = [
        'order_id',
        'customer_id',
        'order_date',
        'status',
        'product_id',
        'quantity',
        'price',
        'total_amount',
        'cdc_timestamp',
    ]
    statuses = ['CREATED', 'SHIPPED', 'DELIVERED', 'CANCELLED']

    records = []
    for _ in range(num_rows):
        # Keep the call order of the random/Faker generators stable so that
        # seeded runs keep producing the same sequence of values.
        order = {
            'order_id': fake.uuid4(),
            'customer_id': fake.uuid4(),
            'order_date': fake.date_this_year(),
            'status': random.choice(statuses),
            'product_id': fake.uuid4(),
            'quantity': random.randint(1, 5),
            'price': round(random.uniform(10.0, 500.0), 2),
        }
        # Rounded again because the float product can carry more than 2 decimals.
        order['total_amount'] = round(order['quantity'] * order['price'], 2)
        order['cdc_timestamp'] = datetime.now()  # Simulate CDC timestamp
        records.append(order)

    # Passing `columns` fixes the schema and column order even for zero rows.
    return pd.DataFrame(records, columns=columns)

In [12]:
# Function to upload data to S3
def upload_to_s3(bucket_name, file_name, df):
    """Write ``df`` as CSV text to ``s3://<bucket_name>/<file_name>``.

    Args:
        bucket_name: Name of the destination S3 bucket.
        file_name: Object key to write inside the bucket.
        df: DataFrame to serialize; the index is not written.

    The upload goes through the module-level ``s3`` client, and a
    confirmation line is printed afterwards.
    """
    # With no target buffer, to_csv returns the CSV content as a string.
    csv_payload = df.to_csv(index=False)
    s3.put_object(
        Bucket=bucket_name,
        Key=file_name,
        Body=csv_payload,
    )
    print(f"Data uploaded to s3://{bucket_name}/{file_name}")

# Generate 20 rows of fake CDC order data
df_cdc_order_data = generate_cdc_order_data(num_rows=20)

# Define S3 bucket and file path (placeholders - replace with real values before running)
bucket_name = 'your S3 bucket'
file_name = 'file_name' # object key; upload_to_s3 always writes CSV content, so prefer a .csv key

# Upload the generated data to S3
upload_to_s3(bucket_name, file_name, df_cdc_order_data)

Data uploaded to s3://ckdatatech/orders/fake_cdc_order_data_3.csv
