In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import uuid
from faker import Faker
import json

# Pin the seed of every random source used below (NumPy, the stdlib
# ``random`` module and Faker) so repeated runs emit the same dataset
np.random.seed(42)
random.seed(42)

fake = Faker()
Faker.seed(42)

class LicenseDatasetGenerator:
    def __init__(self, num_customers=100, num_products=20, num_features=50):
        self.num_customers = num_customers
        self.num_products = num_products
        self.num_features = num_features
        
        # License models
        self.license_models = ['Perpetual', 'Subscription', 'Usage-based', 'Concurrent', 'Named User']
        
        # Product categories
        self.product_categories = ['Security', 'Analytics', 'Development', 'Communication', 'Productivity']
        
        # Feature types
        self.feature_types = [
            'Authentication', 'Encryption', 'Reporting', 'API Access', 'Mobile Support',
            'Cloud Integration', 'Machine Learning', 'Real-time Analytics', 'Multi-language',
            'Advanced Visualization', 'Audit Logging', 'Single Sign-On', 'Data Export',
            'Custom Dashboards', 'Workflow Automation', 'Advanced Security', 'Backup & Recovery',
            'Third-party Integration', 'Advanced Permissions', 'Premium Support'
        ]
        
        # Device types
        self.device_types = ['Windows', 'macOS', 'Linux', 'iOS', 'Android', 'Web Browser']
        
        # Generate base data
        self.customers = self.generate_customers()
        self.features = self.generate_features()
        self.products = self.generate_products()
        self.product_features = self.generate_product_features()
        self.entitlements = self.generate_entitlements()
        self.activations = self.generate_activations()
        self.users = self.generate_users()
        self.renewals = self.generate_renewals()
    
    def generate_customers(self):
        """Build the customer table, one synthetic company per row.

        Returns:
            pandas.DataFrame with ``self.num_customers`` rows.
        """
        industries = ['Technology', 'Healthcare', 'Finance', 'Manufacturing', 'Retail', 'Education']
        company_sizes = ['Small (1-50)', 'Medium (51-500)', 'Large (501-5000)', 'Enterprise (5000+)']
        status_values = ['Active', 'Inactive', 'Suspended']
        status_weights = [0.85, 0.1, 0.05]

        rows = []
        for seq in range(1, self.num_customers + 1):
            # Fields are filled in a fixed order because every call advances
            # one of the seeded random streams
            rows.append({
                'customer_id': f'CUST-{seq:04d}',
                'name': fake.company(),
                'location': f'{fake.city()}, {fake.country()}',
                'contact_email': fake.company_email(),
                'phone': fake.phone_number(),
                'industry': fake.random_element(industries),
                'company_size': fake.random_element(company_sizes),
                'created_date': fake.date_between(start_date='-3y', end_date='today'),
                'status': np.random.choice(status_values, p=status_weights),
            })

        return pd.DataFrame(rows)
    
    def generate_features(self):
        """Generate feature records.

        Each feature gets a sequential id, a display name drawn from
        ``self.feature_types`` with a random ``vX.Y`` suffix, a random
        three-part version, a license model, a feature type and a
        weighted-random status. The modified date never precedes the
        creation date.

        Returns:
            pandas.DataFrame with ``self.num_features`` rows.
        """
        features = []

        for i in range(self.num_features):
            # Faker's relative offsets are case-sensitive: 'M' is months while
            # 'm' is minutes. The previous '-6m' therefore allowed creation
            # dates right up to today instead of stopping six months back.
            created_date = fake.date_between(start_date='-2y', end_date='-6M')
            modified_date = fake.date_between(start_date=created_date, end_date='today')

            feature = {
                'feature_id': f'FEAT-{str(i+1).zfill(3)}',
                'name': fake.random_element(self.feature_types) + f' v{random.randint(1,5)}.{random.randint(0,9)}',
                'description': fake.text(max_nb_chars=200),
                'version': f'{random.randint(1,5)}.{random.randint(0,9)}.{random.randint(0,20)}',
                'created_date': created_date,
                'modified_date': modified_date,
                'license_model': fake.random_element(self.license_models),
                'feature_type': fake.random_element(['Core', 'Premium', 'Enterprise', 'Add-on']),
                'status': np.random.choice(['Active', 'Deprecated', 'Suspended'], p=[0.80, 0.15, 0.05]),
            }
            features.append(feature)

        return pd.DataFrame(features)
    
    def generate_products(self):
        """Generate product records.

        Each product gets a sequential id, a name built from a random
        category plus an edition label, a random three-part version, a price
        tier and a weighted-random status. The modified date never precedes
        the creation date.

        Returns:
            pandas.DataFrame with ``self.num_products`` rows.
        """
        products = []

        for i in range(self.num_products):
            # Faker's relative offsets are case-sensitive: 'M' is months while
            # 'm' is minutes. The previous '-3m' therefore allowed creation
            # dates right up to today instead of stopping three months back.
            created_date = fake.date_between(start_date='-2y', end_date='-3M')
            modified_date = fake.date_between(start_date=created_date, end_date='today')

            category = fake.random_element(self.product_categories)
            product = {
                'product_id': f'PROD-{str(i+1).zfill(3)}',
                'name': f'{category} Suite {fake.random_element(["Pro", "Enterprise", "Standard", "Premium"])}',
                'description': fake.text(max_nb_chars=300),
                'version': f'{random.randint(1,10)}.{random.randint(0,9)}.{random.randint(0,50)}',
                'created_date': created_date,
                'modified_date': modified_date,
                'category': category,
                'price_tier': fake.random_element(['Basic', 'Standard', 'Professional', 'Enterprise']),
                'status': np.random.choice(['Active', 'End-of-Life', 'Beta'], p=[0.85, 0.10, 0.05]),
            }
            products.append(product)

        return pd.DataFrame(products)
    
    def generate_product_features(self):
        """Generate the product-to-feature mapping table.

        Every product is linked to a random selection of distinct features.
        The selection size is drawn from 3-15 and then capped at the number
        of features that actually exist, so small feature tables no longer
        make ``random.sample`` raise ``ValueError``.

        Returns:
            pandas.DataFrame with one row per (product, feature) pair.
        """
        mappings = []

        # Materialise the candidate ids once instead of once per product. A
        # feature table built from zero rows has no 'feature_id' column.
        if 'feature_id' in self.features:
            feature_ids = list(self.features['feature_id'])
        else:
            feature_ids = []

        for _, product in self.products.iterrows():
            # Each product has 3-15 features, never more than are available
            num_features = min(random.randint(3, 15), len(feature_ids))
            selected_features = random.sample(feature_ids, num_features)

            for feature_id in selected_features:
                mapping = {
                    'product_id': product['product_id'],
                    'feature_id': feature_id,
                    'is_core_feature': np.random.choice([True, False], p=[0.30, 0.70]),
                    # A feature cannot be attached before the product exists
                    'added_date': fake.date_between(start_date=product['created_date'], end_date='today'),
                }
                mappings.append(mapping)

        return pd.DataFrame(mappings)
    
    def generate_entitlements(self):
        """Generate entitlement records.

        Every customer receives 1-5 entitlements, each for one randomly
        sampled product. Half of the entitlements are term-limited (end date
        30, 90, 365 or 730 days after the start) and half are open-ended
        (``entitlement_end_date`` is ``None``).

        The license model is chosen so that it cannot contradict the dates:
        a term-limited entitlement is never labelled 'Perpetual' and an
        open-ended one is never labelled 'Subscription'.

        Returns:
            pandas.DataFrame with one row per entitlement.
        """
        entitlements = []
        entitlement_counter = 1

        # License models compatible with each kind of term
        term_models = [m for m in self.license_models if m != 'Perpetual']
        open_ended_models = [m for m in self.license_models if m != 'Subscription']

        for _, customer in self.customers.iterrows():
            # Each customer has 1-5 entitlements
            num_entitlements = random.randint(1, 5)

            for _ in range(num_entitlements):
                product = self.products.sample(1).iloc[0]
                purchase_date = fake.date_between(start_date='-2y', end_date='today')

                # The entitlement always starts on the purchase date
                start_date = purchase_date
                if random.choice([True, False]):  # Term-limited
                    end_date = start_date + timedelta(days=random.choice([30, 90, 365, 730]))  # 1 month to 2 years
                    license_model = fake.random_element(term_models)
                else:  # Open-ended
                    end_date = None
                    license_model = fake.random_element(open_ended_models)

                entitlement = {
                    'entitlement_id': f'ENT-{str(entitlement_counter).zfill(5)}',
                    'customer_id': customer['customer_id'],
                    'product_id': product['product_id'],
                    'purchase_date': purchase_date,
                    'purchase_quantity': random.randint(1, 100),
                    'entitlement_start_date': start_date,
                    'entitlement_end_date': end_date,
                    'license_model': license_model,
                    'purchase_price': round(random.uniform(100, 50000), 2),
                    'status': np.random.choice(['Active', 'Expired', 'Suspended'], p=[0.70, 0.20, 0.10]),
                }
                entitlements.append(entitlement)
                entitlement_counter += 1

        return pd.DataFrame(entitlements)
    
    def generate_activations(self):
        """Build the activation table from the entitlement table.

        Each entitlement yields between one and three activations, never more
        than its purchased quantity. Activation dates fall between the
        entitlement start date and today.

        Returns:
            pandas.DataFrame with one row per activation.
        """
        activation_types = ['Online', 'Offline', 'Manual']
        status_values = ['Active', 'Deactivated', 'Expired']
        status_weights = [0.80, 0.15, 0.05]

        rows = []
        for _, ent in self.entitlements.iterrows():
            purchased = ent['purchase_quantity']
            how_many = random.randint(1, min(3, purchased))

            for _ in range(how_many):
                activated_on = fake.date_between(
                    start_date=ent['entitlement_start_date'],
                    end_date='today',
                )
                rows.append({
                    # Ids are sequential across the whole table
                    'activation_id': f'ACT-{len(rows) + 1:06d}',
                    'entitlement_id': ent['entitlement_id'],
                    'quantity': random.randint(1, min(10, purchased)),
                    'activation_date': activated_on,
                    'device_fingerprint': str(uuid.uuid4()),
                    'activation_type': fake.random_element(activation_types),
                    'status': np.random.choice(status_values, p=status_weights),
                })

        return pd.DataFrame(rows)
    
    def generate_users(self):
        """Generate user records.

        Every activation receives 1-5 users. Each user is placed in one of a
        fixed list of real cities, with a small random offset applied to the
        city's coordinates. The first login falls between the activation date
        and today; the last login falls within the past 30 days and is never
        earlier than the first login.

        Returns:
            pandas.DataFrame with one row per user.
        """
        users = []
        user_counter = 1

        # Earliest date a "recent" last login may take
        recent_cutoff = datetime.today().date() - timedelta(days=30)

        # Real city coordinates as (latitude, longitude, city, country)
        real_cities = [
            # North America
            (40.7128, -74.0060, "New York", "United States"),
            (34.0522, -118.2437, "Los Angeles", "United States"),
            (41.8781, -87.6298, "Chicago", "United States"),
            (29.7604, -95.3698, "Houston", "United States"),
            (43.6532, -79.3832, "Toronto", "Canada"),
            (45.5017, -73.5673, "Montreal", "Canada"),
            (49.2827, -123.1207, "Vancouver", "Canada"),

            # Europe
            (51.5074, -0.1278, "London", "United Kingdom"),
            (48.8566, 2.3522, "Paris", "France"),
            (52.5200, 13.4050, "Berlin", "Germany"),
            (41.9028, 12.4964, "Rome", "Italy"),
            (40.4168, -3.7038, "Madrid", "Spain"),
            (59.9139, 10.7522, "Oslo", "Norway"),
            (55.7558, 37.6176, "Moscow", "Russia"),

            # Asia
            (35.6762, 139.6503, "Tokyo", "Japan"),
            (37.5665, 126.9780, "Seoul", "South Korea"),
            (39.9042, 116.4074, "Beijing", "China"),
            (31.2304, 121.4737, "Shanghai", "China"),
            (28.6139, 77.2090, "New Delhi", "India"),
            (19.0760, 72.8777, "Mumbai", "India"),
            (1.3521, 103.8198, "Singapore", "Singapore"),
            (13.7563, 100.5018, "Bangkok", "Thailand"),

            # Australia & Oceania
            (-33.8688, 151.2093, "Sydney", "Australia"),
            (-37.8136, 144.9631, "Melbourne", "Australia"),
            (-36.8485, 174.7633, "Auckland", "New Zealand"),

            # South America
            (-23.5505, -46.6333, "São Paulo", "Brazil"),
            (-22.9068, -43.1729, "Rio de Janeiro", "Brazil"),
            (-34.6118, -58.3960, "Buenos Aires", "Argentina"),
            (-12.0464, -77.0428, "Lima", "Peru"),

            # Africa
            (-26.2041, 28.0473, "Johannesburg", "South Africa"),
            (-33.9249, 18.4241, "Cape Town", "South Africa"),
            (30.0444, 31.2357, "Cairo", "Egypt"),
            (-1.2921, 36.8219, "Nairobi", "Kenya"),
            (6.5244, 3.3792, "Lagos", "Nigeria"),

            # Middle East
            (25.2048, 55.2708, "Dubai", "United Arab Emirates"),
            (31.7683, 35.2137, "Jerusalem", "Israel"),
            (35.6892, 51.3890, "Tehran", "Iran"),
            (33.8938, 35.5018, "Beirut", "Lebanon"),
        ]

        for _, activation in self.activations.iterrows():
            # Each activation has 1-5 users
            num_users = random.randint(1, 5)

            for _ in range(num_users):
                # Select a random real city
                lat, lon, city, country = random.choice(real_cities)

                # Add small random offset to simulate different locations within the city
                latitude = round(lat + random.uniform(-0.1, 0.1), 6)
                longitude = round(lon + random.uniform(-0.1, 0.1), 6)

                username = fake.user_name()
                email = fake.email()

                first_login = fake.date_between(start_date=activation['activation_date'], end_date='today')
                # Drawing the last login independently from the past 30 days
                # could place it before the first login; start the window at
                # whichever of the two dates is later.
                last_login = fake.date_between(start_date=max(first_login, recent_cutoff), end_date='today')

                user = {
                    'user_id': f'USR-{str(user_counter).zfill(6)}',
                    'activation_id': activation['activation_id'],
                    'username': username,
                    'email': email,
                    'first_login_date': first_login,
                    'last_login_date': last_login,
                    'device_type': fake.random_element(self.device_types),
                    'device_id': str(uuid.uuid4()),
                    'latitude': latitude,
                    'longitude': longitude,
                    'ip_address': fake.ipv4(),
                    'country': country,  # Use real country from the selected city
                    'city': city,        # Use real city from the selected city
                    'timezone': fake.timezone(),
                    'user_role': fake.random_element(['Admin', 'Power User', 'Standard User', 'Read Only']),
                    'status': np.random.choice(['Active', 'Inactive', 'Locked'], p=[0.85, 0.10, 0.05]),
                }
                users.append(user)
                user_counter += 1

        return pd.DataFrame(users)
    
    def generate_renewals(self):
        """Generate renewal records"""
        renewals = []
        
        # Generate renewals for subscription-based entitlements
        subscription_entitlements = self.entitlements[
            self.entitlements['entitlement_end_date'].notna()
        ]
        
        for _, entitlement in subscription_entitlements.iterrows():
            # 60% chance of renewal
            if random.random() < 0.6:
                # Calculate renewal dates
                original_end = pd.to_datetime(entitlement['entitlement_end_date'])
                renewal_start = original_end
                
                # Determine renewal period (same as original or different)
                original_period = (original_end - pd.to_datetime(entitlement['entitlement_start_date'])).days
                renewal_period = random.choice([original_period, 365, 730])  # Keep same or go to 1-2 years
                renewal_end = renewal_start + timedelta(days=renewal_period)
                
                renewal = {
                    'renewal_id': f'REN-{str(len(renewals)+1).zfill(5)}',
                    'entitlement_id': entitlement['entitlement_id'],
                    'entitlement_start_date': renewal_start,
                    'entitlement_end_date': renewal_end,
                    'license_model': entitlement['license_model'],
                    'renewal_date': renewal_start - timedelta(days=random.randint(1, 30)),  # Renewed before expiry
                    'renewal_price': round(random.uniform(0.8, 1.2) * entitlement.get('purchase_price', 1000), 2),
                    'discount_applied': round(random.uniform(0, 25), 2),  # Discount percentage
                    'status': np.random.choice(['Active', 'Pending', 'Cancelled'], p=[0.80, 0.10, 0.10])
                    
                }
                renewals.append(renewal)
        
        return pd.DataFrame(renewals)
    
    def save_to_csv(self, output_dir='dataset'):
        """Save all tables to CSV files"""
        import os
        os.makedirs(output_dir, exist_ok=True)
        
        tables = {
            'customers': self.customers,
            'products': self.products,
            'features': self.features,
            'product_features': self.product_features,
            'entitlements': self.entitlements,
            'activations': self.activations,
            'users': self.users,
            'renewals': self.renewals
        }
        
        for table_name, df in tables.items():
            filename = f'{output_dir}/{table_name}.csv'
            df.to_csv(filename, index=False)
            print(f'Saved {filename} with {len(df)} records')
    
    def generate_summary_report(self):
        """Generate a summary report of the dataset"""
        report = {
            'Dataset Summary': {
                'Total Customers': len(self.customers),
                'Total Products': len(self.products),
                'Total Features': len(self.features),
                'Total Entitlements': len(self.entitlements),
                'Total Activations': len(self.activations),
                'Total Users': len(self.users),
                'Total Renewals': len(self.renewals)
            },
            'Data Quality Metrics': {
                'Average Features per Product': round(len(self.product_features) / len(self.products), 2),
                'Average Entitlements per Customer': round(len(self.entitlements) / len(self.customers), 2),
                'Average Activations per Entitlement': round(len(self.activations) / len(self.entitlements), 2),
                'Average Users per Activation': round(len(self.users) / len(self.activations), 2),
                'Renewal Rate': round(len(self.renewals) / len(self.entitlements[self.entitlements['entitlement_end_date'].notna()]) * 100, 2)
            }
        }
        
        return report

# Script entry point: build the dataset, write it out and print a summary
def main():
    """Generate the synthetic dataset, save it as CSV files and print a report."""
    rule = "=" * 50
    print("Generating synthetic license management dataset...")

    # Table sizes for this run
    sizes = {
        'num_customers': 150,
        'num_products': 25,
        'num_features': 60,
    }
    generator = LicenseDatasetGenerator(**sizes)

    # Written to the default 'dataset' directory
    generator.save_to_csv()

    report = generator.generate_summary_report()
    print("\n" + rule)
    print("DATASET GENERATION COMPLETE")
    print(rule)

    for section in report:
        print(f"\n{section}:")
        for metric, value in report[section].items():
            print(f"  {metric}: {value}")

    print("\nAll CSV files have been saved to the 'dataset' directory.")
    print("Tables generated:")
    generated_tables = (
        'customers',
        'products',
        'features',
        'product_features',
        'entitlements',
        'activations',
        'users',
        'renewals',
    )
    for table in generated_tables:
        print(f"  - {table}.csv")


if __name__ == "__main__":
    main()

Generating synthetic license management dataset...
Saved dataset/customers.csv with 150 records
Saved dataset/products.csv with 25 records
Saved dataset/features.csv with 60 records
Saved dataset/product_features.csv with 215 records
Saved dataset/entitlements.csv with 448 records
Saved dataset/activations.csv with 898 records
Saved dataset/users.csv with 2738 records
Saved dataset/renewals.csv with 121 records

DATASET GENERATION COMPLETE

Dataset Summary:
  Total Customers: 150
  Total Products: 25
  Total Features: 60
  Total Entitlements: 448
  Total Activations: 898
  Total Users: 2738
  Total Renewals: 121

Data Quality Metrics:
  Average Features per Product: 8.6
  Average Entitlements per Customer: 2.99
  Average Activations per Entitlement: 2.0
  Average Users per Activation: 3.05
  Renewal Rate: 62.37

All CSV files have been saved to the 'dataset' directory.
Tables generated:
  - customers.csv
  - products.csv
  - features.csv
  - product_features.csv
  - entitlements.csv
  

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import uuid
from faker import Faker
import json

# Pin the seed of every random source used below (NumPy, the stdlib
# ``random`` module and Faker) so repeated runs emit the same dataset
np.random.seed(42)
random.seed(42)

fake = Faker()
Faker.seed(42)

class LicenseDatasetGenerator:
    def __init__(self, num_customers=100, num_products=20, num_features=50):
        self.num_customers = num_customers
        self.num_products = num_products
        self.num_features = num_features
        
        # License models
        self.license_models = ['Perpetual', 'Subscription', 'Usage-based', 'Concurrent', 'Named User']
        
        # Product categories
        self.product_categories = ['Security', 'Analytics', 'Development', 'Communication', 'Productivity']
        
        # Feature types
        self.feature_types = [
            'Authentication', 'Encryption', 'Reporting', 'API Access', 'Mobile Support',
            'Cloud Integration', 'Machine Learning', 'Real-time Analytics', 'Multi-language',
            'Advanced Visualization', 'Audit Logging', 'Single Sign-On', 'Data Export',
            'Custom Dashboards', 'Workflow Automation', 'Advanced Security', 'Backup & Recovery',
            'Third-party Integration', 'Advanced Permissions', 'Premium Support'
        ]
        
        # Device types
        self.device_types = ['Windows', 'macOS', 'Linux', 'iOS', 'Android', 'Web Browser']
        
        # Generate base data
        self.customers = self.generate_customers()
        self.features = self.generate_features()
        self.products = self.generate_products()
        self.product_features = self.generate_product_features()
        self.entitlements = self.generate_entitlements()
        self.activations = self.generate_activations()
        self.users = self.generate_users()
        self.renewals = self.generate_renewals()
    
    def generate_customers(self):
        """Generate customer records.

        Each customer gets a sequential id plus fake company details and a
        weighted-random status (85% Active, 10% Inactive, 5% Suspended).

        Returns:
            pandas.DataFrame with ``self.num_customers`` rows.
        """
        customers = []

        for i in range(self.num_customers):
            customer = {
                'customer_id': f'CUST-{str(i+1).zfill(4)}',
                'name': fake.company(),
                'location': fake.city() + ', ' + fake.country(),
                'contact_email': fake.company_email(),
                'phone': fake.phone_number(),
                'industry': fake.random_element(['Technology', 'Healthcare', 'Finance', 'Manufacturing', 'Retail', 'Education']),
                'company_size': fake.random_element(['Small (1-50)', 'Medium (51-500)', 'Large (501-5000)', 'Enterprise (5000+)']),
                'created_date': fake.date_between(start_date='-3y', end_date='today'),
                # Faker's random_element() has no ``weights`` argument (passing
                # one raises TypeError), so the weighted draw uses NumPy
                'status': np.random.choice(['Active', 'Inactive', 'Suspended'], p=[0.85, 0.10, 0.05]),
            }
            customers.append(customer)

        return pd.DataFrame(customers)
    
    def generate_features(self):
        """Generate feature records.

        Each feature gets a sequential id, a display name drawn from
        ``self.feature_types`` with a random ``vX.Y`` suffix, a random
        three-part version, a license model, a feature type and a
        weighted-random status. The modified date never precedes the
        creation date.

        Returns:
            pandas.DataFrame with ``self.num_features`` rows.
        """
        features = []

        for i in range(self.num_features):
            # Faker's relative offsets are case-sensitive: 'M' is months while
            # 'm' is minutes. The previous '-6m' therefore allowed creation
            # dates right up to today instead of stopping six months back.
            created_date = fake.date_between(start_date='-2y', end_date='-6M')
            modified_date = fake.date_between(start_date=created_date, end_date='today')

            feature = {
                'feature_id': f'FEAT-{str(i+1).zfill(3)}',
                'name': fake.random_element(self.feature_types) + f' v{random.randint(1,5)}.{random.randint(0,9)}',
                'description': fake.text(max_nb_chars=200),
                'version': f'{random.randint(1,5)}.{random.randint(0,9)}.{random.randint(0,20)}',
                'created_date': created_date,
                'modified_date': modified_date,
                'license_model': fake.random_element(self.license_models),
                'feature_type': fake.random_element(['Core', 'Premium', 'Enterprise', 'Add-on']),
                # Faker's random_element() has no ``weights`` argument (passing
                # one raises TypeError), so the weighted draw uses NumPy
                'status': np.random.choice(['Active', 'Deprecated', 'Beta'], p=[0.80, 0.15, 0.05]),
            }
            features.append(feature)

        return pd.DataFrame(features)
    
    def generate_products(self):
        """Generate product records.

        Each product gets a sequential id, a name built from a random
        category plus an edition label, a random three-part version, a price
        tier and a weighted-random status. The modified date never precedes
        the creation date.

        Returns:
            pandas.DataFrame with ``self.num_products`` rows.
        """
        products = []

        for i in range(self.num_products):
            # Faker's relative offsets are case-sensitive: 'M' is months while
            # 'm' is minutes. The previous '-3m' therefore allowed creation
            # dates right up to today instead of stopping three months back.
            created_date = fake.date_between(start_date='-2y', end_date='-3M')
            modified_date = fake.date_between(start_date=created_date, end_date='today')

            category = fake.random_element(self.product_categories)
            product = {
                'product_id': f'PROD-{str(i+1).zfill(3)}',
                'name': f'{category} Suite {fake.random_element(["Pro", "Enterprise", "Standard", "Premium"])}',
                'description': fake.text(max_nb_chars=300),
                'version': f'{random.randint(1,10)}.{random.randint(0,9)}.{random.randint(0,50)}',
                'created_date': created_date,
                'modified_date': modified_date,
                'category': category,
                'price_tier': fake.random_element(['Basic', 'Standard', 'Professional', 'Enterprise']),
                # Faker's random_element() has no ``weights`` argument (passing
                # one raises TypeError), so the weighted draw uses NumPy
                'status': np.random.choice(['Active', 'End-of-Life', 'Beta'], p=[0.85, 0.10, 0.05]),
            }
            products.append(product)

        return pd.DataFrame(products)
    
    def generate_product_features(self):
        """Generate the product-to-feature mapping table.

        Every product is linked to a random selection of distinct features.
        The selection size is drawn from 3-15 and then capped at the number
        of features that actually exist, so small feature tables no longer
        make ``random.sample`` raise ``ValueError``.

        Returns:
            pandas.DataFrame with one row per (product, feature) pair.
        """
        mappings = []

        # Materialise the candidate ids once instead of once per product. A
        # feature table built from zero rows has no 'feature_id' column.
        if 'feature_id' in self.features:
            feature_ids = list(self.features['feature_id'])
        else:
            feature_ids = []

        for _, product in self.products.iterrows():
            # Each product has 3-15 features, never more than are available
            num_features = min(random.randint(3, 15), len(feature_ids))
            selected_features = random.sample(feature_ids, num_features)

            for feature_id in selected_features:
                mapping = {
                    'product_id': product['product_id'],
                    'feature_id': feature_id,
                    # Faker's random_element() has no ``weights`` argument
                    # (passing one raises TypeError), so the weighted draw
                    # uses NumPy
                    'is_core_feature': np.random.choice([True, False], p=[0.30, 0.70]),
                    # A feature cannot be attached before the product exists
                    'added_date': fake.date_between(start_date=product['created_date'], end_date='today'),
                }
                mappings.append(mapping)

        return pd.DataFrame(mappings)
    
    def generate_entitlements(self):
        """Generate entitlement records.

        Every customer receives 1-5 entitlements, each for one randomly
        sampled product. Half of the entitlements are term-limited (end date
        30, 90, 365 or 730 days after the start) and half are open-ended
        (``entitlement_end_date`` is ``None``).

        The license model is chosen so that it cannot contradict the dates:
        a term-limited entitlement is never labelled 'Perpetual' and an
        open-ended one is never labelled 'Subscription'.

        Returns:
            pandas.DataFrame with one row per entitlement.
        """
        entitlements = []
        entitlement_counter = 1

        # License models compatible with each kind of term
        term_models = [m for m in self.license_models if m != 'Perpetual']
        open_ended_models = [m for m in self.license_models if m != 'Subscription']

        for _, customer in self.customers.iterrows():
            # Each customer has 1-5 entitlements
            num_entitlements = random.randint(1, 5)

            for _ in range(num_entitlements):
                product = self.products.sample(1).iloc[0]
                purchase_date = fake.date_between(start_date='-2y', end_date='today')

                # The entitlement always starts on the purchase date
                start_date = purchase_date
                if random.choice([True, False]):  # Term-limited
                    end_date = start_date + timedelta(days=random.choice([30, 90, 365, 730]))  # 1 month to 2 years
                    license_model = fake.random_element(term_models)
                else:  # Open-ended
                    end_date = None
                    license_model = fake.random_element(open_ended_models)

                entitlement = {
                    'entitlement_id': f'ENT-{str(entitlement_counter).zfill(5)}',
                    'customer_id': customer['customer_id'],
                    'product_id': product['product_id'],
                    'purchase_date': purchase_date,
                    'purchase_quantity': random.randint(1, 100),
                    'entitlement_start_date': start_date,
                    'entitlement_end_date': end_date,
                    'license_model': license_model,
                    'purchase_price': round(random.uniform(100, 50000), 2),
                    # Faker's random_element() has no ``weights`` argument
                    # (passing one raises TypeError), so the weighted draw
                    # uses NumPy
                    'status': np.random.choice(['Active', 'Expired', 'Suspended'], p=[0.70, 0.20, 0.10]),
                }
                entitlements.append(entitlement)
                entitlement_counter += 1

        return pd.DataFrame(entitlements)
    
    def generate_activations(self):
        """Generate activation records.

        Each entitlement yields between one and three activations, never more
        than its purchased quantity. Activation dates fall between the
        entitlement start date and today.

        Returns:
            pandas.DataFrame with one row per activation.
        """
        activations = []
        activation_counter = 1

        for _, entitlement in self.entitlements.iterrows():
            # Each entitlement has 1-3 activations
            max_activations = min(3, entitlement['purchase_quantity'])
            num_activations = random.randint(1, max_activations)

            for _ in range(num_activations):
                activation_date = fake.date_between(
                    start_date=entitlement['entitlement_start_date'],
                    end_date='today'
                )

                activation = {
                    'activation_id': f'ACT-{str(activation_counter).zfill(6)}',
                    'entitlement_id': entitlement['entitlement_id'],
                    # A single activation covers at most 10 seats
                    'quantity': random.randint(1, min(10, entitlement['purchase_quantity'])),
                    'activation_date': activation_date,
                    'device_fingerprint': str(uuid.uuid4()),
                    'activation_type': fake.random_element(['Online', 'Offline', 'Manual']),
                    # Faker's random_element() has no ``weights`` argument
                    # (passing one raises TypeError), so the weighted draw
                    # uses NumPy
                    'status': np.random.choice(['Active', 'Deactivated', 'Expired'], p=[0.80, 0.15, 0.05]),
                }
                activations.append(activation)
                activation_counter += 1

        return pd.DataFrame(activations)
    
    def generate_users(self):
        """Generate user records.

        Every activation receives 1-5 users. Each user is placed in one of a
        fixed list of real cities, with a small random offset applied to the
        city's coordinates. The first login falls between the activation date
        and today; the last login falls within the past 30 days and is never
        earlier than the first login.

        Returns:
            pandas.DataFrame with one row per user.
        """
        users = []
        user_counter = 1

        # Earliest date a "recent" last login may take
        recent_cutoff = datetime.today().date() - timedelta(days=30)

        # Real city coordinates as (latitude, longitude, city, country)
        real_cities = [
            # North America
            (40.7128, -74.0060, "New York", "United States"),
            (34.0522, -118.2437, "Los Angeles", "United States"),
            (41.8781, -87.6298, "Chicago", "United States"),
            (29.7604, -95.3698, "Houston", "United States"),
            (43.6532, -79.3832, "Toronto", "Canada"),
            (45.5017, -73.5673, "Montreal", "Canada"),
            (49.2827, -123.1207, "Vancouver", "Canada"),

            # Europe
            (51.5074, -0.1278, "London", "United Kingdom"),
            (48.8566, 2.3522, "Paris", "France"),
            (52.5200, 13.4050, "Berlin", "Germany"),
            (41.9028, 12.4964, "Rome", "Italy"),
            (40.4168, -3.7038, "Madrid", "Spain"),
            (59.9139, 10.7522, "Oslo", "Norway"),
            (55.7558, 37.6176, "Moscow", "Russia"),

            # Asia
            (35.6762, 139.6503, "Tokyo", "Japan"),
            (37.5665, 126.9780, "Seoul", "South Korea"),
            (39.9042, 116.4074, "Beijing", "China"),
            (31.2304, 121.4737, "Shanghai", "China"),
            (28.6139, 77.2090, "New Delhi", "India"),
            (19.0760, 72.8777, "Mumbai", "India"),
            (1.3521, 103.8198, "Singapore", "Singapore"),
            (13.7563, 100.5018, "Bangkok", "Thailand"),

            # Australia & Oceania
            (-33.8688, 151.2093, "Sydney", "Australia"),
            (-37.8136, 144.9631, "Melbourne", "Australia"),
            (-36.8485, 174.7633, "Auckland", "New Zealand"),

            # South America
            (-23.5505, -46.6333, "São Paulo", "Brazil"),
            (-22.9068, -43.1729, "Rio de Janeiro", "Brazil"),
            (-34.6118, -58.3960, "Buenos Aires", "Argentina"),
            (-12.0464, -77.0428, "Lima", "Peru"),

            # Africa
            (-26.2041, 28.0473, "Johannesburg", "South Africa"),
            (-33.9249, 18.4241, "Cape Town", "South Africa"),
            (30.0444, 31.2357, "Cairo", "Egypt"),
            (-1.2921, 36.8219, "Nairobi", "Kenya"),
            (6.5244, 3.3792, "Lagos", "Nigeria"),

            # Middle East
            (25.2048, 55.2708, "Dubai", "United Arab Emirates"),
            (31.7683, 35.2137, "Jerusalem", "Israel"),
            (35.6892, 51.3890, "Tehran", "Iran"),
            (33.8938, 35.5018, "Beirut", "Lebanon"),
        ]

        for _, activation in self.activations.iterrows():
            # Each activation has 1-5 users
            num_users = random.randint(1, 5)

            for _ in range(num_users):
                # Select a random real city
                lat, lon, city, country = random.choice(real_cities)

                # Add small random offset to simulate different locations within the city
                latitude = round(lat + random.uniform(-0.1, 0.1), 6)
                longitude = round(lon + random.uniform(-0.1, 0.1), 6)

                username = fake.user_name()
                email = fake.email()

                first_login = fake.date_between(start_date=activation['activation_date'], end_date='today')
                # Drawing the last login independently from the past 30 days
                # could place it before the first login; start the window at
                # whichever of the two dates is later.
                last_login = fake.date_between(start_date=max(first_login, recent_cutoff), end_date='today')

                user = {
                    'user_id': f'USR-{str(user_counter).zfill(6)}',
                    'activation_id': activation['activation_id'],
                    'username': username,
                    'email': email,
                    'first_login_date': first_login,
                    'last_login_date': last_login,
                    'device_type': fake.random_element(self.device_types),
                    'device_id': str(uuid.uuid4()),
                    'latitude': latitude,
                    'longitude': longitude,
                    'ip_address': fake.ipv4(),
                    'country': country,  # Use real country from the selected city
                    'city': city,        # Use real city from the selected city
                    'timezone': fake.timezone(),
                    'user_role': fake.random_element(['Admin', 'Power User', 'Standard User', 'Read Only']),
                    'status': np.random.choice(['Active', 'Inactive', 'Locked'], p=[0.85, 0.10, 0.05]),
                }
                users.append(user)
                user_counter += 1

        return pd.DataFrame(users)
    
    def generate_renewals(self):
        """Generate renewal records"""
        renewals = []
        
        # Generate renewals for subscription-based entitlements
        subscription_entitlements = self.entitlements[
            self.entitlements['entitlement_end_date'].notna()
        ]
        
        for _, entitlement in subscription_entitlements.iterrows():
            # 60% chance of renewal
            if random.random() < 0.6:
                # Calculate renewal dates
                original_end = pd.to_datetime(entitlement['entitlement_end_date'])
                renewal_start = original_end
                
                # Determine renewal period (same as original or different)
                original_period = (original_end - pd.to_datetime(entitlement['entitlement_start_date'])).days
                renewal_period = random.choice([original_period, 365, 730])  # Keep same or go to 1-2 years
                renewal_end = renewal_start + timedelta(days=renewal_period)
                
                renewal = {
                    'renewal_id': f'REN-{str(len(renewals)+1).zfill(5)}',
                    'entitlement_id': entitlement['entitlement_id'],
                    'entitlement_start_date': renewal_start,
                    'entitlement_end_date': renewal_end,
                    'license_model': entitlement['license_model'],
                    'renewal_date': renewal_start - timedelta(days=random.randint(1, 30)),  # Renewed before expiry
                    'renewal_price': round(random.uniform(0.8, 1.2) * entitlement.get('purchase_price', 1000), 2),
                    'discount_applied': round(random.uniform(0, 25), 2),  # Discount percentage
                    'status': fake.random_element(['Active', 'Pending', 'Cancelled'], weights=[80, 10, 10])
                }
                renewals.append(renewal)
        
        return pd.DataFrame(renewals)
    
    def save_to_csv(self, output_dir='dataset'):
        """Save all tables to CSV files"""
        import os
        os.makedirs(output_dir, exist_ok=True)
        
        tables = {
            'customers': self.customers,
            'products': self.products,
            'features': self.features,
            'product_features': self.product_features,
            'entitlements': self.entitlements,
            'activations': self.activations,
            'users': self.users,
            'renewals': self.renewals
        }
        
        for table_name, df in tables.items():
            filename = f'{output_dir}/{table_name}.csv'
            df.to_csv(filename, index=False)
            print(f'Saved {filename} with {len(df)} records')
    
    def generate_summary_report(self):
        """Generate a summary report of the dataset"""
        report = {
            'Dataset Summary': {
                'Total Customers': len(self.customers),
                'Total Products': len(self.products),
                'Total Features': len(self.features),
                'Total Entitlements': len(self.entitlements),
                'Total Activations': len(self.activations),
                'Total Users': len(self.users),
                'Total Renewals': len(self.renewals)
            },
            'Data Quality Metrics': {
                'Average Features per Product': round(len(self.product_features) / len(self.products), 2),
                'Average Entitlements per Customer': round(len(self.entitlements) / len(self.customers), 2),
                'Average Activations per Entitlement': round(len(self.activations) / len(self.entitlements), 2),
                'Average Users per Activation': round(len(self.users) / len(self.activations), 2),
                'Renewal Rate': round(len(self.renewals) / len(self.entitlements[self.entitlements['entitlement_end_date'].notna()]) * 100, 2)
            }
        }
        
        return report

# Generate the dataset
def main():
    """Generate the dataset, write it to CSV and print a summary.

    Builds a ``LicenseDatasetGenerator`` with 150 customers, 25 products and
    60 features, saves every table via ``save_to_csv()`` (default output
    directory), then prints the summary report followed by the list of
    generated CSV files.
    """
    print("Generating synthetic license management dataset...")

    # Create generator instance
    generator = LicenseDatasetGenerator(
        num_customers=150,
        num_products=25,
        num_features=60,
    )

    # Save to CSV files
    generator.save_to_csv()

    # Generate and print summary report
    report = generator.generate_summary_report()
    banner = "=" * 50
    print("\n" + banner)
    print("DATASET GENERATION COMPLETE")
    print(banner)

    for section in report:
        print(f"\n{section}:")
        for metric, value in report[section].items():
            print(f"  {metric}: {value}")

    print("\nAll CSV files have been saved to the 'dataset' directory.")
    print("Tables generated:")
    table_lines = (
        "  - customers.csv",
        "  - products.csv",
        "  - features.csv",
        "  - product_features.csv",
        "  - entitlements.csv",
        "  - activations.csv",
        "  - users.csv",
        "  - renewals.csv",
    )
    for line in table_lines:
        print(line)

# Entry point: run the generator only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Generating synthetic license management dataset...


TypeError: BaseProvider.random_element() got an unexpected keyword argument 'weights'