# Generate Data

# In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Pin both random sources (stdlib `random` and NumPy's global generator) to a
# fixed seed so repeated runs start from the same RNG state.
random.seed(42)
np.random.seed(42)

# Generation parameters
NUM_ROWS = 1000
CATEGORIES = [
    'ART',
    'MEMORABILIA',
    'FURNITURE',
    'DOCUMENTS',
    'SCULPTURE',
]
LOCATIONS = [
    'Goldenberg',
    'Warehouse-A',
    'Storage-Facility-7',
    'Gallery-East',
]
# Weighted pool of 100 labels: a uniform pick from it returns ACTIVE 85% of
# the time, INACTIVE 10% and UNDER_REVIEW 5%.
STATUS_DIST = [
    label
    for label, weight in (('ACTIVE', 85), ('INACTIVE', 10), ('UNDER_REVIEW', 5))
    for _ in range(weight)
]
ACQUISITION_MODES = [
    'Donation',
    'Purchase',
    'Transfer',
]

# Column-oriented accumulator for the synthetic records: every column name
# maps to its own (initially empty) list. Key order is the column order.
data = {
    column: []
    for column in (
        'accession_id',
        'amo_number',
        'description',
        'quantity',
        'category',
        'current_market_value',
        'asset_location',
        'room_number',
        'box_number',
        'mode_of_acquisition',
        'acquisition_date',
        'acquisition_price',
        'remarks',
        'dimensions',
        'status',
        'date_created',
        'date_modified',
    )
}

# Build NUM_ROWS synthetic inventory records, appending exactly one value to
# every column list per iteration (i is the 1-based row number).
# NOTE: the RNGs are seeded, so the order of the random draws below determines
# the generated values; reordering statements changes the dataset.
for i in range(1, NUM_ROWS + 1):
    category = random.choice(CATEGORIES)
    location = random.choice(LOCATIONS)
    # Every 100th row gets a random historical year (1800-2023); all other
    # rows use 2024.
    year = 2024 if i % 100 != 0 else random.randint(1800, 2023)
    
    # Generate required fields
    # accession_id format: <year>-<category>-<row number, zero-padded to 4>-
    # <location text before its first '-'>
    data['accession_id'].append(f"{year}-{category}-{i:04d}-{location.split('-')[0]}")
    data['amo_number'].append(f"AMO-{random.randint(10000, 99999)}")
    data['description'].append(f"{category} item #{i} with historical significance")
    # Every 100th row has quantity 1; other rows draw uniformly from 1-100.
    data['quantity'].append(1 if i % 100 == 0 else random.randint(1, 100))
    data['category'].append(category)
    data['asset_location'].append(location)
    # room_number format: <first two characters of location><1-5>-<A|B|C>
    data['room_number'].append(f"{location[:2]}{random.randint(1, 5)}-{random.choice(['A', 'B', 'C'])}")
    data['mode_of_acquisition'].append(random.choice(ACQUISITION_MODES))
    data['status'].append(random.choice(STATUS_DIST))
    
    # Generate fields with missing values (10-15% nulls)
    # In each line the random.random() check is evaluated first; the value is
    # only drawn when the check passes, otherwise None is stored.
    # Normal draw with a standard deviation (25000) larger than the mean
    # (5000), so negative values occur.
    # NOTE(review): confirm negative market values are intended.
    data['current_market_value'].append(np.random.normal(5000, 25000) if random.random() > 0.12 else None)
    data['box_number'].append(random.randint(1, 200) if random.random() > 0.15 else None)
    # Day is limited to 1-28 so the date is valid for every month; stored as
    # an MM/DD/YYYY string.
    data['acquisition_date'].append(datetime(year, random.randint(1, 12), random.randint(1, 28)).strftime("%m/%d/%Y") if random.random() > 0.1 else None)
    # Normal draw (mean 1000, standard deviation 5000); negative prices occur.
    # NOTE(review): confirm negative acquisition prices are intended.
    data['acquisition_price'].append(np.random.normal(1000, 5000) if random.random() > 0.1 else None)
    data['remarks'].append(f"Remark for item {i}" if random.random() > 0.15 else None)
    data['dimensions'].append(f"{random.randint(10, 200)} cm x {random.randint(10, 200)} cm" if random.random() > 0.1 else None)
    
    # Timestamps
    # created: 2024-01-01 plus 0-365 whole days; modified: 0-30 whole days
    # after created. No time of day is added, so the formatted time part is
    # always midnight (12:00:00 AM).
    created = datetime(2024, 1, 1) + timedelta(days=random.randint(0, 365))
    modified = created + timedelta(days=random.randint(0, 30))
    data['date_created'].append(created.strftime("%m/%d/%Y %I:%M:%S %p"))
    data['date_modified'].append(modified.strftime("%m/%d/%Y %I:%M:%S %p"))

# Inject anomalous quantities: overwrite randomly chosen rows with fixed
# values. (No duplicate rows are added by this step.)
# The index bound is derived from the rows actually generated instead of a
# hard-coded 999, so the step keeps working (and can reach every row) when the
# row count changes. With 1000 rows the draws are identical to randint(0, 999).
row_count = len(data['quantity'])
if row_count:  # randint(0, -1) would raise on an empty dataset
    for _ in range(5):
        # NOTE(review): 19 lies inside the normal 1-100 quantity range, so it
        # is not a statistical outlier -- confirm the intended value.
        data['quantity'][random.randint(0, row_count - 1)] = 19  # Outliers
        data['quantity'][random.randint(0, row_count - 1)] = 1000  # Bulk items

# Assemble the column lists into a DataFrame and export it without the
# pandas index column.
df = pd.DataFrame(data)
df.to_csv("museum_inventory.csv", index=False)