In [1]:
# %pip (not !pip) installs into the environment of the running kernel;
# versions are pinned so a fresh run resolves the same packages.
%pip install matplotlib==3.10.1 numpy==2.2.4 pandas==2.2.3 jupyter==1.1.1

Collecting matplotlib
  Using cached matplotlib-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy
  Using cached numpy-2.2.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting jupyter
  Using cached jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.57.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (102 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp312-

In [2]:
import pandas as pd
from itertools import product

def ipf_synthetic_population(marginals=None, population_size=None,
                             max_iterations=50, tolerance=1e-6,
                             random_state=None):
    """Generate a synthetic population with Iterative Proportional Fitting.

    A weight is fitted for every combination of categories so that the
    weighted totals of each dimension match the target marginals. Individuals
    are then drawn at random (with replacement) in proportion to those
    weights, so the counts in the returned sample only approximate the
    targets; they are not guaranteed to match them exactly.

    Args:
        marginals: Mapping ``{dimension: {category: target_count}}``. Every
            dimension must add up to the same positive total. Defaults to
            the built-in age / gender / education example (total 1500).
        population_size: Number of individuals to draw. Defaults to the
            rounded marginal total.
        max_iterations: Maximum number of IPF sweeps over all dimensions.
        tolerance: The fit is considered converged once every adjustment
            factor applied during a sweep is within this distance of 1.
        random_state: Forwarded to ``DataFrame.sample``; pass a fixed value
            to make the draw reproducible. ``None`` keeps the draw unseeded.

    Returns:
        DataFrame with an ``id`` column (1..population_size) followed by one
        column per dimension, one row per synthetic individual.

    Raises:
        ValueError: If ``marginals`` is empty, has a dimension without
            categories, uses a reserved column name, contains a negative
            target, or its dimensions do not share one positive total.
    """
    # 1. Define target marginal distributions
    if marginals is None:
        marginals = {
            'age': {'18-25': 500, '26-35': 1000},
            'gender': {'Male': 600, 'Female': 900},
            'education': {'School': 800, 'Degree': 700}
        }

    if not marginals or any(len(targets) == 0 for targets in marginals.values()):
        raise ValueError(
            "marginals must define at least one dimension, "
            "each with at least one category"
        )

    # These names are used for working/output columns and must stay free.
    reserved = {'id', 'weight', 'probability'} & set(marginals)
    if reserved:
        raise ValueError(f"Dimension names clash with reserved columns: {sorted(reserved)}")

    if any(target < 0 for targets in marginals.values() for target in targets.values()):
        raise ValueError("Marginal targets must be non-negative")

    # IPF can only converge when every dimension describes the same total;
    # otherwise each dimension keeps rescaling the overall sum.
    totals = {dimension: sum(targets.values()) for dimension, targets in marginals.items()}
    reference_total = next(iter(totals.values()))
    if reference_total <= 0:
        raise ValueError("Marginal totals must be positive")
    for dimension, total in totals.items():
        if abs(total - reference_total) > 1e-9 * max(1.0, abs(reference_total)):
            raise ValueError(
                f"Marginal totals disagree: dimension {dimension!r} sums to "
                f"{total}, expected {reference_total}"
            )

    if population_size is None:
        population_size = int(round(reference_total))

    # 2. Create all possible combinations of categories
    dimensions = list(marginals)
    combinations = list(product(*(list(marginals[dimension]) for dimension in dimensions)))

    # 3. Initialize DataFrame with uniform weights
    df = pd.DataFrame(combinations, columns=dimensions)
    df['weight'] = 1.0  # Float from the start: the weights become fractional

    # 4. IPF Algorithm
    converged = False

    for iteration in range(max_iterations):
        max_diff = 0.0

        # Adjust for each dimension in turn
        for dimension, targets in marginals.items():
            # Current weighted total of each category in this dimension
            current_margins = df.groupby(dimension)['weight'].sum()

            # Scale factor that moves each category onto its target
            adjustments = {}
            for category, target in targets.items():
                current = current_margins.get(category, 0)
                # A category with no weight left cannot be rescaled; leave it.
                adjustments[category] = target / current if current > 0 else 1.0

            df['weight'] *= df[dimension].map(adjustments)

            # Track the largest adjustment of this sweep for the convergence check
            current_diff = max(abs(1 - factor) for factor in adjustments.values())
            max_diff = max(max_diff, current_diff)

        # Converged once no dimension needed a noticeable adjustment
        if max_diff < tolerance:
            print(f"Converged after {iteration+1} iterations")
            converged = True
            break

    if not converged:
        print(f"Stopped after {max_iterations} iterations (max reached)")

    # 5. Generate synthetic population
    # Normalize weights to probabilities
    df['probability'] = df['weight'] / df['weight'].sum()

    # Sample individuals according to probabilities
    synthetic_pop = df.sample(
        n=population_size,
        weights='probability',
        replace=True,
        random_state=random_state
    ).reset_index(drop=True)

    # Add unique IDs
    synthetic_pop['id'] = range(1, len(synthetic_pop) + 1)

    # Return only needed columns
    return synthetic_pop[['id', *dimensions]]

# Build the population once, then inspect it
synthetic_pop = ipf_synthetic_population()

# Preview of the first sampled individuals
print("\nSynthetic population head:")
print(synthetic_pop.head())

# Per-attribute tallies of the sample; each heading literal carries its own
# spacing, so the printed text is the same as with three separate prints.
print("\nMarginal verification:")
for heading, column in (
    ("Age counts:\n", 'age'),
    ("\nGender counts:\n", 'gender'),
    ("\nEducation counts:\n", 'education'),
):
    print(heading, synthetic_pop[column].value_counts())

Converged after 2 iterations

Synthetic population head:
   id    age  gender education
0   1  26-35    Male    School
1   2  18-25    Male    Degree
2   3  26-35    Male    Degree
3   4  18-25  Female    Degree
4   5  26-35  Female    School

Marginal verification:
Age counts:
 age
26-35    986
18-25    514
Name: count, dtype: int64

Gender counts:
 gender
Female    916
Male      584
Name: count, dtype: int64

Education counts:
 education
School    788
Degree    712
Name: count, dtype: int64
