In [51]:
import json
import random
import warnings
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Any, Optional, Union

import numpy as np
import pandas as pd
# NOTE(review): 'ignore' suppresses every warning category for the whole kernel
# session, which can hide pandas/NumPy deprecation and numeric warnings.
warnings.filterwarnings('ignore')

# Seed both NumPy's global RNG and the stdlib `random` module: the generators in
# this notebook draw from both, so both must be seeded for reproducible output.
np.random.seed(42)
random.seed(42)

print("✅ Libraries imported successfully!")
print("📊 Ready to generate synthetic sales data")

✅ Libraries imported successfully!
📊 Ready to generate synthetic sales data


## Step 2: Schema parser class

In [52]:
class SchemaParser:
    """
    Read-only view over a BigQuery schema dictionary.

    The schema is expected to carry a ``table_name`` string and a ``columns``
    mapping of column name -> metadata dict. Both are optional; missing keys
    fall back to ``'unknown_table'`` and an empty mapping respectively.
    """

    def __init__(self, schema_json: Dict):
        self.schema = schema_json
        self.table_name = schema_json.get('table_name', 'unknown_table')
        self.columns = schema_json.get('columns', {})

    def get_column_info(self, column_name: str) -> Dict:
        """Return the metadata dict of ``column_name`` (empty dict if unknown)."""
        return self.columns.get(column_name, {})

    def get_categorical_columns(self) -> List[str]:
        """Names of columns whose metadata has a ``categorical_values`` entry."""
        return [name for name, meta in self.columns.items() if 'categorical_values' in meta]

    def get_numerical_columns(self) -> List[str]:
        """Names of columns whose metadata has a ``numerical_stats`` entry."""
        return [name for name, meta in self.columns.items() if 'numerical_stats' in meta]

    def get_primary_key_columns(self) -> List[str]:
        """Names of columns flagged with a truthy ``is_primary_key``."""
        return [name for name, meta in self.columns.items() if meta.get('is_primary_key', False)]

    def get_nullable_columns(self) -> List[str]:
        """Names of columns flagged with a truthy ``nullable``."""
        return [name for name, meta in self.columns.items() if meta.get('nullable', False)]

    def print_summary(self):
        """Print table-level counts followed by one descriptive line per column."""
        print(f"📋 Table: {self.table_name}")
        print(f"📊 Total Columns: {len(self.columns)}")
        print(f"🏷️  Categorical Columns: {len(self.get_categorical_columns())}")
        print(f"🔢 Numerical Columns: {len(self.get_numerical_columns())}")
        print(f"🔑 Primary Key Columns: {len(self.get_primary_key_columns())}")
        print(f"❓ Nullable Columns: {len(self.get_nullable_columns())}")

        print("\n📋 Column Details:")
        for name, meta in self.columns.items():
            type_label = meta.get('data_type', 'UNKNOWN')
            null_mark = "✓" if meta.get('nullable', False) else "✗"
            key_mark = "🔑" if meta.get('is_primary_key', False) else ""

            # Kind-specific detail: category count, numeric range, or nothing.
            if 'categorical_values' in meta:
                detail = f" | Categories: {len(meta['categorical_values'])}"
            elif 'numerical_stats' in meta:
                num_stats = meta['numerical_stats']
                low = num_stats.get('min_value', 'N/A')
                high = num_stats.get('max_value', 'N/A')
                detail = f" | Range: {low}-{high}"
            else:
                detail = ""

            prefix = f"  • {name:<20} | {type_label:<10} | Nullable: {null_mark}"
            print(f"{prefix}{detail} {key_mark}")



## Step 3: Data generators for categorical, numerical, and other data types

In [53]:
class DataGenerators:
    """
    Collection of stateless data generators, one per column kind.

    Every generator takes the column's schema metadata and a row count and
    returns a plain Python list of ``num_rows`` values, with ``None`` used for
    NULLs. Randomness comes from NumPy's global RNG and the stdlib ``random``
    module, so seed both for reproducible output.
    """

    @staticmethod
    def _stat(stats: Dict, key: str, default: Any) -> Any:
        """Return ``stats[key]``, or ``default`` when the key is missing or None."""
        value = stats.get(key)
        return default if value is None else value

    @staticmethod
    def _inject_nulls(data: List[Any], fraction: float) -> List[Any]:
        """
        Overwrite a random ``fraction`` of ``data`` with None, in place.

        The number of NULLs is ``int(len(data) * fraction)``, capped at the list
        length so that fractions above 1.0 cannot ask for more indices than exist.
        """
        num_rows = len(data)
        if num_rows == 0 or fraction <= 0:
            return data

        null_count = min(num_rows, int(num_rows * fraction))
        null_indices = np.random.choice(num_rows, size=null_count, replace=False)
        for idx in null_indices:
            data[idx] = None
        return data

    @staticmethod
    def generate_categorical_data(column_info: Dict, num_rows: int) -> List[Any]:
        """
        Generate categorical data based on value distribution.

        Values are sampled with probabilities proportional to the counts in
        ``value_distribution``. When that mapping is missing, empty, or sums to
        zero, values are drawn uniformly from ``categorical_values`` instead.
        Nullable columns get ~5% of their rows replaced by None.

        Returns:
            List of ``num_rows`` values; all None if the column declares no
            categorical values.
        """
        categorical_values = column_info.get('categorical_values', [])
        value_distribution = column_info.get('value_distribution') or {}
        nullable = column_info.get('nullable', False)

        if not categorical_values:
            return [None] * num_rows

        values = list(value_distribution.keys())
        weights = list(value_distribution.values())
        total_weight = sum(weights)

        if values and total_weight > 0:
            # Normalize weights to probabilities
            probabilities = [w / total_weight for w in weights]
        else:
            # No usable distribution: fall back to a uniform draw over the
            # declared categories instead of dividing by zero.
            values = list(categorical_values)
            probabilities = None

        # Sample positions rather than the values themselves so that mixed-type
        # categories are not coerced to a common NumPy dtype (e.g. all strings).
        chosen = np.random.choice(len(values), size=num_rows, p=probabilities)
        generated_data = [values[i] for i in chosen]

        if nullable:
            # Introduce ~5% NULL values for nullable columns
            DataGenerators._inject_nulls(generated_data, 0.05)

        return generated_data

    @staticmethod
    def generate_numerical_data(column_info: Dict, num_rows: int) -> List[Any]:
        """
        Generate numerical data based on statistical parameters.

        Reads ``numerical_stats`` (min_value, max_value, mean_value, std_dev,
        null_percentage, is_actually_categorical, distinct_count). Missing or
        None entries fall back to defaults: range 0-100, mean at the midpoint,
        std_dev of one sixth of the range, no NULLs, ``num_rows`` distinct values.

        Columns flagged as categorical or with fewer than 50 distinct values are
        drawn uniformly from evenly spaced points in [min, max]; all others come
        from a normal distribution clipped to [min, max]. INTEGER / INT64
        columns are rounded to whole numbers.
        """
        stats = column_info.get('numerical_stats') or {}
        data_type = column_info.get('data_type', 'FLOAT64')
        nullable = column_info.get('nullable', False)
        is_integer = str(data_type).upper() in ('INTEGER', 'INT64')

        min_val = DataGenerators._stat(stats, 'min_value', 0)
        max_val = DataGenerators._stat(stats, 'max_value', 100)
        mean_val = DataGenerators._stat(stats, 'mean_value', (min_val + max_val) / 2)
        std_dev = DataGenerators._stat(stats, 'std_dev', (max_val - min_val) / 6)
        null_percentage = DataGenerators._stat(stats, 'null_percentage', 0)
        is_categorical = bool(stats.get('is_actually_categorical', False))
        distinct_count = DataGenerators._stat(stats, 'distinct_count', num_rows)

        # Handle low-cardinality numerics as categorical
        if is_categorical or distinct_count < 50:
            if is_integer:
                # An integer column cannot hold more distinct values than there
                # are integers in [min, max].
                unique_values = min(distinct_count, int(max_val - min_val + 1))
            else:
                # Floats are not limited by the integer span of the range.
                unique_values = distinct_count
            # linspace/choice need at least one candidate (distinct_count may be 0
            # for an all-NULL source column).
            unique_values = max(1, int(unique_values))

            possible_values = np.linspace(min_val, max_val, unique_values)
            if is_integer:
                # round() avoids truncating e.g. 6.999999 down to 6.
                possible_values = [int(round(v)) for v in possible_values]

            generated_data = np.random.choice(possible_values, size=num_rows).tolist()
        else:
            # Generate continuous values using normal distribution, clipped to min/max
            generated_data = np.random.normal(mean_val, std_dev, num_rows)
            generated_data = np.clip(generated_data, min_val, max_val)

            if is_integer:
                generated_data = np.round(generated_data).astype(int)

            generated_data = generated_data.tolist()

        # Handle NULL values based on null_percentage
        if nullable:
            DataGenerators._inject_nulls(generated_data, null_percentage / 100)

        return generated_data

    @staticmethod
    def generate_primary_key_data(column_info: Dict, num_rows: int, start_id: int = 1) -> List[Any]:
        """
        Generate sequential primary keys starting at ``start_id``.

        STRING columns get zero-padded ``ID_000001``-style identifiers; every
        other data type gets plain consecutive integers.
        """
        data_type = column_info.get('data_type', 'INTEGER')

        if data_type == 'STRING':
            return [f"ID_{start_id + i:06d}" for i in range(num_rows)]
        return list(range(start_id, start_id + num_rows))

    @staticmethod
    def generate_datetime_data(column_info: Dict, num_rows: int,
                             start_date: Optional[datetime] = None,
                             end_date: Optional[datetime] = None) -> List[Any]:
        """
        Generate datetimes at whole-day offsets from ``start_date``.

        Offsets are drawn from ``[0, days_between)`` so ``end_date`` itself is
        excluded; when both dates fall on the same day every value equals
        ``start_date``. Defaults to 2022-01-01 .. 2025-05-31. Nullable columns
        get ~2% of their rows replaced by None.

        Raises:
            ValueError: if ``end_date`` is earlier than ``start_date``.
        """
        if start_date is None:
            start_date = datetime(2022, 1, 1)
        if end_date is None:
            end_date = datetime(2025, 5, 31)

        nullable = column_info.get('nullable', False)

        days_between_dates = (end_date - start_date).days
        if days_between_dates < 0:
            raise ValueError(
                f"end_date ({end_date}) must not be earlier than start_date ({start_date})"
            )
        # randrange(0) raises, so a zero-length window degrades to offset 0.
        day_span = max(days_between_dates, 1)

        generated_data = [
            start_date + timedelta(days=random.randrange(day_span))
            for _ in range(num_rows)
        ]

        if nullable:
            DataGenerators._inject_nulls(generated_data, 0.02)

        return generated_data

class DataValidator:
    """
    Validates generated data against schema constraints.

    Checks performed per column: NULLs in non-nullable columns, values outside
    the declared ``categorical_values``, and values outside the
    ``[min_value, max_value]`` range of ``numerical_stats``.
    """

    @staticmethod
    def _is_null(value: Any) -> bool:
        """
        Return True for None and for pandas/NumPy missing markers (NaN, NaT, pd.NA).

        ``DataFrame[col].tolist()`` turns NULLs in numeric and datetime columns
        into NaN/NaT rather than None, so an ``is None`` test alone misses them.
        """
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Array-like cell values have no single truth value; treat as non-null.
            return False

    @staticmethod
    def validate_column(data: List[Any], column_info: Dict, column_name: str) -> Dict[str, Any]:
        """
        Validate a single column's data against its schema.

        Args:
            data: Column values; None/NaN/NaT count as NULL.
            column_info: Schema metadata for the column.
            column_name: Name reported back in the result.

        Returns:
            Dict with ``column_name``, ``total_rows``, ``null_count``,
            ``null_percentage`` (0.0 for an empty column), ``issues`` (list of
            human-readable messages) and ``is_valid`` (True when no issues).
        """
        total_rows = len(data)
        null_flags = [DataValidator._is_null(x) for x in data]
        null_count = sum(null_flags)
        non_null_data = [x for x, is_null in zip(data, null_flags) if not is_null]

        validation_results = {
            'column_name': column_name,
            'total_rows': total_rows,
            'null_count': null_count,
            # Guard the empty column: there is nothing to divide by.
            'null_percentage': (null_count / total_rows) * 100 if total_rows else 0.0,
            'issues': []
        }

        nullable = column_info.get('nullable', False)

        # Check nullable constraint
        if not nullable and null_count > 0:
            validation_results['issues'].append(f"Non-nullable column has {null_count} NULL values")

        if 'categorical_values' in column_info:
            expected_values = set(column_info['categorical_values'])
            unexpected_values = set(non_null_data) - expected_values
            if unexpected_values:
                validation_results['issues'].append(f"Unexpected categorical values: {unexpected_values}")

        if 'numerical_stats' in column_info:
            stats = column_info['numerical_stats']
            min_val = stats.get('min_value')
            max_val = stats.get('max_value')

            # The range check only applies when both bounds are declared.
            if min_val is not None and max_val is not None:
                out_of_range = [x for x in non_null_data if x < min_val or x > max_val]
                if out_of_range:
                    validation_results['issues'].append(f"Values out of range [{min_val}, {max_val}]: {len(out_of_range)} values")

        validation_results['is_valid'] = len(validation_results['issues']) == 0

        return validation_results

    @staticmethod
    def validate_dataframe(df: pd.DataFrame, schema: "SchemaParser") -> Dict[str, Any]:
        """
        Validate entire dataframe against schema.

        Only columns present in both ``df`` and ``schema.columns`` are checked;
        extra dataframe columns are skipped silently.

        Returns:
            Dict with overall counts, a ``validation_timestamp``, per-column
            results under ``column_results``, a ``summary`` of valid/invalid
            columns and total issues, and an overall ``is_valid`` flag.
        """
        overall_results = {
            'total_columns': len(df.columns),
            'total_rows': len(df),
            'validation_timestamp': datetime.now().isoformat(),
            'column_results': {},
            'summary': {'valid_columns': 0, 'invalid_columns': 0, 'total_issues': 0}
        }
        summary = overall_results['summary']

        for column_name in df.columns:
            if column_name not in schema.columns:
                continue

            column_info = schema.get_column_info(column_name)
            column_data = df[column_name].tolist()

            validation_result = DataValidator.validate_column(column_data, column_info, column_name)
            overall_results['column_results'][column_name] = validation_result

            if validation_result['is_valid']:
                summary['valid_columns'] += 1
            else:
                summary['invalid_columns'] += 1
                summary['total_issues'] += len(validation_result['issues'])

        overall_results['is_valid'] = summary['invalid_columns'] == 0

        return overall_results

# Cell status message: reaching these prints means the class definitions above ran without error.
print("✅ Data Generators and Validators created successfully!")
print("🔧 Ready to generate and validate synthetic data")

✅ Data Generators and Validators created successfully!
🔧 Ready to generate and validate synthetic data


## Step 4: Main synthetic data generator with validation


In [54]:
class SyntheticDataGenerator:
    """
    Main class for generating synthetic data based on schema.

    Wraps a ``SchemaParser`` for metadata lookup, ``DataGenerators`` for the
    per-column value generation and ``DataValidator`` for the optional
    post-generation check. The most recent validation report is kept on the
    instance for ``show_validation_details``.
    """

    # Substrings that mark a column as temporal when its lower-cased name contains one.
    _DATETIME_NAME_HINTS = ('date', 'time', 'created', 'updated')
    # Schema data types that are generated as datetimes whatever the column is called.
    _TEMPORAL_DATA_TYPES = frozenset({'DATE', 'DATETIME', 'TIMESTAMP'})

    def __init__(self, schema_json: Dict):
        self.parser = SchemaParser(schema_json)
        self.generators = DataGenerators()
        self.validator = DataValidator()
        # Report of the latest validated generation run; None until one exists.
        self._last_validation = None

    def _is_datetime_column(self, column_name: str, column_info: Dict) -> bool:
        """Return True when the declared data type or the column name indicates a date/time."""
        data_type = str(column_info.get('data_type', '')).upper()
        if data_type in self._TEMPORAL_DATA_TYPES:
            return True
        lowered = column_name.lower()
        return any(keyword in lowered for keyword in self._DATETIME_NAME_HINTS)

    def generate_synthetic_data(self, num_rows: int = 1000,
                              start_date: datetime = None,
                              end_date: datetime = None,
                              validate_output: bool = True) -> pd.DataFrame:
        """
        Generate synthetic data based on schema

        Each column is routed to the first matching generator, in this order:
        primary key, categorical, numerical, datetime (by declared data type or
        by name pattern), and finally a generic ``Value_<i>`` string.

        Args:
            num_rows: Number of rows to generate
            start_date: Start date for datetime columns
            end_date: End date for datetime columns
            validate_output: Whether to validate generated data

        Returns:
            pandas DataFrame with synthetic data
        """
        # Drop the previous report so show_validation_details() can never
        # describe a frame other than the one generated by this call.
        self._last_validation = None

        print(f"🚀 Generating {num_rows:,} rows of synthetic data...")

        synthetic_data = {}

        for column_name, column_info in self.parser.columns.items():
            print(f"   📝 Generating data for column: {column_name}")

            if column_info.get('is_primary_key', False):
                values = self.generators.generate_primary_key_data(column_info, num_rows)
            elif 'categorical_values' in column_info:
                values = self.generators.generate_categorical_data(column_info, num_rows)
            elif 'numerical_stats' in column_info:
                values = self.generators.generate_numerical_data(column_info, num_rows)
            elif self._is_datetime_column(column_name, column_info):
                values = self.generators.generate_datetime_data(
                    column_info, num_rows, start_date, end_date
                )
            else:
                # No usable metadata: fall back to generic string data
                values = [f"Value_{i}" for i in range(num_rows)]

            synthetic_data[column_name] = values

        df = pd.DataFrame(synthetic_data)

        print(f"✅ Generated DataFrame with shape: {df.shape}")

        if validate_output:
            print("🔍 Validating generated data...")
            validation_results = self.validator.validate_dataframe(df, self.parser)

            if validation_results['is_valid']:
                print("✅ Data validation passed!")
            else:
                print(f"⚠️  Data validation found {validation_results['summary']['total_issues']} issues")
                print("   Use show_validation_details() to see details")

            # Store validation results for later access
            self._last_validation = validation_results

        return df

    def show_validation_details(self):
        """Show detailed validation results from last generation"""
        # getattr keeps this safe for instances created without __init__.
        results = getattr(self, '_last_validation', None)
        if results is None:
            print("❌ No validation results available. Generate data first with validate_output=True")
            return

        summary = results['summary']
        print(f"\n📊 Validation Summary:")
        print(f"   ✅ Valid columns: {summary['valid_columns']}")
        print(f"   ❌ Invalid columns: {summary['invalid_columns']}")
        print(f"   ⚠️  Total issues: {summary['total_issues']}")

        if summary['invalid_columns'] > 0:
            print(f"\n📋 Column-level Issues:")
            for col_name, col_result in results['column_results'].items():
                if not col_result['is_valid']:
                    print(f"   🔴 {col_name}:")
                    for issue in col_result['issues']:
                        print(f"      • {issue}")

    def generate_sample_data(self, sample_size: int = 10) -> pd.DataFrame:
        """Generate a small, validated sample for quick testing"""
        return self.generate_synthetic_data(num_rows=sample_size, validate_output=True)

    def get_column_statistics(self, df: pd.DataFrame) -> Dict:
        """
        Get statistics for generated data.

        Every column reports ``dtype``, ``null_count``, ``null_percentage`` and
        ``unique_count``. Numeric (non-boolean) columns of any width, including
        pandas nullable integer/float dtypes, additionally report ``min``,
        ``max``, ``mean``, ``std`` and ``median``.
        """
        stats = {}

        for column in df.columns:
            series = df[column]
            null_count = series.isnull().sum()

            col_stats = {
                'dtype': str(series.dtype),
                'null_count': null_count,
                'null_percentage': (null_count / len(df)) * 100,
                'unique_count': series.nunique()
            }

            is_numeric = (pd.api.types.is_numeric_dtype(series)
                          and not pd.api.types.is_bool_dtype(series))
            if is_numeric:
                col_stats.update({
                    'min': series.min(),
                    'max': series.max(),
                    'mean': series.mean(),
                    'std': series.std(),
                    'median': series.median()
                })

            stats[column] = col_stats

        return stats

    def compare_with_original_schema(self, df: pd.DataFrame) -> Dict:
        """
        Compare generated data statistics with original schema.

        Returns a dict keyed by column name (schema columns present in ``df``
        only). Each entry has ``generated`` statistics, plus ``original``
        numeric statistics and/or ``original_distribution`` (percentages) when
        the schema provides them.
        """
        generated_stats = self.get_column_statistics(df)
        comparison = {}

        for column_name, col_info in self.parser.columns.items():
            if column_name not in df.columns:
                continue

            entry = {'generated': generated_stats[column_name]}

            if 'numerical_stats' in col_info:
                original_stats = col_info['numerical_stats']
                entry['original'] = {
                    'min': original_stats.get('min_value'),
                    'max': original_stats.get('max_value'),
                    'mean': original_stats.get('mean_value'),
                    'std': original_stats.get('std_dev'),
                    'null_percentage': original_stats.get('null_percentage', 0)
                }

            if 'value_distribution' in col_info:
                distribution = col_info['value_distribution']
                original_total = sum(distribution.values())
                # An all-zero distribution maps to 0% everywhere rather than
                # raising ZeroDivisionError.
                entry['original_distribution'] = {
                    value: (count / original_total) * 100 if original_total else 0.0
                    for value, count in distribution.items()
                }

            comparison[column_name] = entry

        return comparison



## Step 5: Testing with small samples

In [55]:
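# NOTE: `generator` is only created in Step 6 below, so that cell must run first if this one is re-enabled.
# # Test with a small sample first
# print("🧪 Testing with 100 sample rows...")
# sample_df = generator.generate_sample_data(sample_size=100)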

# print("\n📋 Sample Data Preview:")
# print(sample_df)

# print("\n📊 Data Types:")
# print(sample_df.dtypes)

# print("\n📈 Basic Statistics:")
# stats = generator.get_column_statistics(sample_df)
# for col, col_stats in stats.items():
#     print(f"\n🔸 {col}:")
#     print(f"   Type: {col_stats['dtype']}")
#     print(f"   Nulls: {col_stats['null_count']} ({col_stats['null_percentage']:.1f}%)")
#     print(f"   Unique: {col_stats['unique_count']}")
    
#     if col_stats['dtype'] in ['int64', 'float64'] and 'min' in col_stats:
#         print(f"   Range: {col_stats['min']} - {col_stats['max']}")
#         print(f"   Mean: {col_stats['mean']:.2f}")

# print("\n✅ Small sample generation successful!")

## Step 6: Full dataset generation

In [56]:
# Load one table schema, build the generator, create the full synthetic dataset,
# print a per-column summary and export the result to CSV.

# Schema files sit one directory above the notebook. Only the file name differs
# between tables, so switch tables by un-commenting a different SCHEMA_FILENAME.
# SCHEMA_FILENAME = "sales_data_schema_for_synthetic_Product.json"
SCHEMA_FILENAME = "sales_data_schema_for_synthetic_layer4_FT_VW_FACT_DAILY_SALES_DATA_ANALYSIS.json"
# SCHEMA_FILENAME = "sales_data_schema_for_synthetic_layer4_FT_VW_FACT_SECONDARY_SALES_DATA.json"
# SCHEMA_FILENAME = "sales_data_schema_for_synthetic_CUSTOMER.json"
# SCHEMA_FILENAME = "sales_data_schema_for_synthetic_VW_FT_FACT_EXTERNAL_SALES.json"

# Number of rows in the full synthetic dataset.
NUM_ROWS = 100_000

# pathlib builds the path with the platform's own separator; the previous
# "..\sales_..." literal relied on the invalid escape "\s" and on Windows paths.
schema_path = Path("..") / SCHEMA_FILENAME
file_path = str(schema_path)   # kept as a plain string for any later cell that reads it
filename = schema_path.stem    # schema file name without ".json"; reused for the CSV export

# Explicit encoding: JSON is UTF-8, but open() defaults to the locale encoding.
with open(schema_path, 'r', encoding='utf-8') as file:
    schema_json: Dict[str, Any] = json.load(file)

print("✅ JSON file loaded successfully!")

# Initialize the generator
generator = SyntheticDataGenerator(schema_json)
print("✅ Synthetic Data Generator initialized successfully!")
print("🎯 Ready to generate pharmaceutical product data")


# Initialize and test the schema parser
parser = SchemaParser(schema_json)
parser.print_summary()
print(f"🚀 Generating full synthetic dataset ({NUM_ROWS:,} rows)...")
full_df = generator.generate_synthetic_data(num_rows=NUM_ROWS, validate_output=True)

print("\n📋 Full Dataset Info:")
print(f"Shape: {full_df.shape}")
print(f"Memory usage: {full_df.memory_usage(deep=True).sum() / 1024:.1f} KB")

print("\n📊 Column Summary:")
for col in full_df.columns:
    dtype = full_df[col].dtype
    nulls = full_df[col].isnull().sum()
    unique = full_df[col].nunique()
    print(f"  {col:<25} | {str(dtype):<10} | Nulls: {nulls:>3} | Unique: {unique:>3}")

# Show validation details if there are any issues
generator.show_validation_details()

print("\n🎯 Dataset generation completed successfully!")

# Export the generated frame next to the notebook, named after the schema file.
print("💾 Testing export utilities...")

full_df.to_csv(f'{filename}.csv', index=False)




✅ JSON file loaded successfully!
✅ Synthetic Data Generator initialized successfully!
🎯 Ready to generate pharmaceutical product data
📋 Table: BI_Sales_Chatbot.layer4_FT_VW_FACT_DAILY_SALES_DATA_ANALYSIS
📊 Total Columns: 203
🏷️  Categorical Columns: 90
🔢 Numerical Columns: 104
🔑 Primary Key Columns: 0
❓ Nullable Columns: 203

📋 Column Details:
  • DAILY_SALES_ID       | INTEGER    | Nullable: ✓ | Range: 1.0-10341903.0 
  • BU_ID                | STRING     | Nullable: ✓ | Categories: 9 
  • MOLECULE             | STRING     | Nullable: ✓ | Categories: 1213 
  • PTM                  | STRING     | Nullable: ✓ | Categories: 1 
  • PLI                  | STRING     | Nullable: ✓ | Categories: 2 
  • MWP                  | STRING     | Nullable: ✓ | Categories: 2 
  • COUNTRY_ID           | STRING     | Nullable: ✓ | Categories: 101 
  • ACTUAL_PLANNED_GROSS_SALES | FLOAT      | Nullable: ✓ | Range: 1.0-1.0 
  • FLAG                 | STRING     | Nullable: ✓ | Categories: 9 
  • FLAG_DESC

In [59]:
# Preview of the generated frame; the recorded output below is truncated to the first/last rows and columns.
full_df

Unnamed: 0,DAILY_SALES_ID,BU_ID,MOLECULE,PTM,PLI,MWP,COUNTRY_ID,ACTUAL_PLANNED_GROSS_SALES,FLAG,FLAG_DESCRIPTION,...,SHIPMENT_TYPE,INCO_TERMS,DOCUMENT_CURR,DISPATCH_MONTH,BU,BD,ENTITY_CURR,NET_AMOUNT_DC,TOTAL_AMOUNT_LC_LC2,HQ_NAME
0,,,VIGABATRIN,DUMMY,False,False,EUG||DE|SAP,,1,SAP_CE11,...,,,,,APSL,Ganesh,,,,TM HQ - Varanasi
1,1996854.0,EM,NICOTINE,DUMMY,False,False,EUG||ES|SAP,,1,LE_V02,...,,,,,APSL,Daniel,,,,TM HQ - CHENNAI
2,3875500.0,,RIVAROXABAN,,,False,NAG||US|SAP,,5,SAP_CE11K,...,,,,,APSL,Sivaram,,,,TM HQ - Gurgaon
3,5967811.0,EM,AMBROXOL,DUMMY,False,False,GGI||IN|SAP,,1,QTY ADJUSTMENT,...,,,,,APSL,Ganesh,,,,TM HQ - HOWRAH
4,1767652.0,GGINDIA,VOGLIBOSE,DUMMY,,False,NAG||US|SAP,,1,SAP_CE11,...,,,,,APSL,Sivaram,,,,TM HQ - Ahmedabad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1788994.0,NAG,KETOROLAC,DUMMY,False,False,EM|Australia|AU|IBP,,1,SAP_CE11K,...,,,,,,Sivaram,,,,TM HQ - Chennai
99996,,NAG,OMEPRAZOLE,DUMMY,False,,,1.0,1,LE,...,,,,,APSL,Sivaram,,,,TM HQ - JAIPUR
99997,3305411.0,NAG,DIVALPROEX SODIUM,DUMMY,False,False,EM|Brazil|BR|IBP,,1,DAILY_ENTITY_ADJUSTMENT,...,,,,,APSL,Sivaram,,,,
99998,,EUG,ZOLEDRONIC ACID,DUMMY,False,False,EUG||DE|SAP,,5,LE_V02,...,,,,,APSL,Sivaram,,,,TBM HQ - Rohtak


## Step 7: Data quality analysis and comparison with original

In [58]:
# Compare the generated data against the distributions and statistics recorded
# in the source schema, column by column.
print("🔍 Analyzing Generated Data Quality...")


def _format_stat(value, width: int = 8) -> str:
    """Right-align ``value`` with one decimal, or 'N/A' when the statistic is missing."""
    if value is None:
        return f"{'N/A':>{width}}"
    return f"{value:>{width}.1f}"


# Compare with original schema distributions
comparison = generator.compare_with_original_schema(full_df)

print("\n📊 Distribution Analysis:")

# Analyze categorical columns (every one; only the top 5 values of each are shown)
categorical_cols = generator.parser.get_categorical_columns()
for col in categorical_cols:
    if col not in full_df.columns:
        continue

    print(f"\n🏷️  {col} Distribution:")

    # Generated distribution in percent, NULLs included as their own bucket
    generated_dist = full_df[col].value_counts(normalize=True, dropna=False) * 100

    if col in comparison and 'original_distribution' in comparison[col]:
        original_dist = comparison[col]['original_distribution']

        print("   Original vs Generated:")
        for value, gen_pct in generated_dist.head(5).items():  # Top 5 values
            # Values absent from the schema distribution (e.g. NULL) report 0%.
            orig_pct = original_dist.get(value, 0)
            print(f"   {str(value):<30} | Orig: {orig_pct:>5.1f}% | Gen: {gen_pct:>5.1f}%")
    else:
        print("   Generated distribution:")
        for value, pct in generated_dist.head().items():
            print(f"   {str(value):<30} | {pct:>5.1f}%")

# Analyze numerical columns
numerical_cols = generator.parser.get_numerical_columns()
for col in numerical_cols:
    if col not in full_df.columns:
        continue

    print(f"\n🔢 {col} Statistics:")

    # min/max/mean are only present when the generated column has a numeric
    # dtype, so read them with .get() instead of indexing.
    generated_stats = comparison[col]['generated']

    if 'original' in comparison[col]:
        original_stats = comparison[col]['original']
        print(f"   Min:    Original={_format_stat(original_stats['min'])} | Generated={_format_stat(generated_stats.get('min'))}")
        print(f"   Max:    Original={_format_stat(original_stats['max'])} | Generated={_format_stat(generated_stats.get('max'))}")
        print(f"   Mean:   Original={_format_stat(original_stats['mean'])} | Generated={_format_stat(generated_stats.get('mean'))}")
        print(f"   Nulls:  Original={_format_stat(original_stats['null_percentage'], 6)}% | Generated={_format_stat(generated_stats['null_percentage'], 6)}%")
    else:
        generated_mean = generated_stats.get('mean')
        mean_text = 'N/A' if generated_mean is None else f"{generated_mean:.2f}"
        print(f"   Min: {generated_stats.get('min', 'N/A')}, Max: {generated_stats.get('max', 'N/A')}")
        print(f"   Mean: {mean_text}, Nulls: {generated_stats['null_percentage']:.1f}%")

print("\n✅ Data quality analysis completed!")

🔍 Analyzing Generated Data Quality...

📊 Distribution Analysis:

🏷️  BU_ID Distribution:
   Original vs Generated:
   EUG                            | Orig:  32.8% | Gen:  31.2%
   NAG                            | Orig:  32.6% | Gen:  30.9%
   GGINDIA                        | Orig:  25.5% | Gen:  24.2%
   EM                             | Orig:   6.7% | Gen:   6.3%
   None                           | Orig:   0.0% | Gen:   5.0%

🏷️  MOLECULE Distribution:
   Original vs Generated:
   None                           | Orig:   0.0% | Gen:   5.0%
   OMEPRAZOLE                     | Orig:   3.8% | Gen:   3.6%
   NICOTINE                       | Orig:   2.9% | Gen:   2.8%
   FEXOFENADINE                   | Orig:   2.1% | Gen:   2.0%
   OLANZAPINE                     | Orig:   1.7% | Gen:   1.6%

🏷️  PTM Distribution:
   Original vs Generated:
   DUMMY                          | Orig: 100.0% | Gen:  95.0%
   None                           | Orig:   0.0% | Gen:   5.0%

🏷️  PLI Distribution:
   