In [1]:
import pandas as pd
import os

# Load the raw fashion-retail sales export into a DataFrame.
df = pd.read_csv('../data/Fashion_Retail_Sales.csv')

# Report the row count and the column names, then preview the leading rows.
record_count = df.shape[0]
column_names = df.columns.tolist()
print(f"Total sales records: {record_count}")
print(f"Columns: {column_names}")
print(f"\nFirst few records:")
# Bare last expression so the notebook renders the preview as a rich table.
df.head()

Total sales records: 3400
Columns: ['Customer Reference ID', 'Item Purchased', 'Purchase Amount (USD)', 'Date Purchase', 'Review Rating', 'Payment Method']

First few records:


Unnamed: 0,Customer Reference ID,Item Purchased,Purchase Amount (USD),Date Purchase,Review Rating,Payment Method
0,4018,Handbag,4619.0,2023-02-05,,Credit Card
1,4115,Tunic,2456.0,2023-07-11,2.0,Credit Card
2,4019,Tank Top,2102.0,2023-03-23,4.1,Cash
3,4097,Leggings,3126.0,2023-03-15,3.2,Cash
4,3997,Wallet,3003.0,2022-11-27,4.7,Cash


In [2]:
# Make sure the output folder for the simulated stream exists.
# exist_ok=True keeps the cell re-runnable: an already-present folder is not an error.
os.makedirs(name='../streaming/sales', exist_ok=True)
print("✓ Created streaming/sales/ directory")

✓ Created streaming/sales/ directory


In [3]:
# Split the sales data into N_CHUNKS newline-delimited JSON files for the
# streaming simulation. `df` is the sales DataFrame loaded earlier in the notebook.
N_CHUNKS = 10  # number of files to emit; the single place to change it

# Base number of rows per file, plus the leftover rows that do not divide evenly.
chunk_size, remainder = divmod(len(df), N_CHUNKS)

# NOTE: files left over from an earlier run with a larger N_CHUNKS are not
# removed here; clear the directory first if the count is reduced.
records_written = 0
end = 0
for i in range(N_CHUNKS):
    start = end
    # The first `remainder` files take one extra row each, so file sizes differ
    # by at most one. Previously the last file absorbed the whole remainder,
    # and every row when len(df) < N_CHUNKS.
    end = start + chunk_size + (1 if i < remainder else 0)
    chunk = df.iloc[start:end]

    # Save as JSON (one record per line - required for PySpark streaming)
    filename = f'../streaming/sales/sales_{i+1:03d}.json'
    chunk.to_json(filename, orient='records', lines=True)
    records_written += len(chunk)
    print(f"✓ Created {filename} with {len(chunk)} records")

# Every row must land in exactly one file; fail loudly instead of reporting
# a total that was never checked.
assert records_written == len(df), f"wrote {records_written} of {len(df)} records"

print(f"\n✅ Successfully created {N_CHUNKS} streaming JSON files in streaming/sales/!")
print(f"Total records distributed: {records_written}")

✓ Created ../streaming/sales/sales_001.json with 340 records
✓ Created ../streaming/sales/sales_002.json with 340 records
✓ Created ../streaming/sales/sales_003.json with 340 records
✓ Created ../streaming/sales/sales_004.json with 340 records
✓ Created ../streaming/sales/sales_005.json with 340 records
✓ Created ../streaming/sales/sales_006.json with 340 records
✓ Created ../streaming/sales/sales_007.json with 340 records
✓ Created ../streaming/sales/sales_008.json with 340 records
✓ Created ../streaming/sales/sales_009.json with 340 records
✓ Created ../streaming/sales/sales_010.json with 340 records

✅ Successfully created 10 streaming JSON files in streaming/sales/!
Total records distributed: 3400


In [4]:
# List the generated streaming files in name order as a quick sanity check.
import glob

streaming_files = glob.glob('../streaming/sales/*.json')
streaming_files.sort()

file_count = len(streaming_files)
print(f"Found {file_count} streaming files:")
for path in streaming_files:
    print(f"  - {path}")

Found 10 streaming files:
  - ../streaming/sales/sales_001.json
  - ../streaming/sales/sales_002.json
  - ../streaming/sales/sales_003.json
  - ../streaming/sales/sales_004.json
  - ../streaming/sales/sales_005.json
  - ../streaming/sales/sales_006.json
  - ../streaming/sales/sales_007.json
  - ../streaming/sales/sales_008.json
  - ../streaming/sales/sales_009.json
  - ../streaming/sales/sales_010.json


In [6]:
# Peek at the first record of the first file to confirm the one-JSON-object-per-line layout.
with open('../streaming/sales/sales_001.json', 'r') as sample_file:
    # Only the first line is needed; the file is closed as soon as it is read.
    first_line = sample_file.readline()

print("First record in streaming file:")
# first_line keeps its trailing newline, so a blank line follows the record.
print(first_line)

First record in streaming file:
{"Customer Reference ID":4018,"Item Purchased":"Handbag","Purchase Amount (USD)":4619.0,"Date Purchase":"2023-02-05","Review Rating":null,"Payment Method":"Credit Card"}

