In [1]:
import pprint
import random
import math

# Synthetic Population Generation using Iterative Proportional Fitting (IPF)

This notebook demonstrates the creation of a realistic synthetic population by combining two key techniques:  
1) **Iterative Proportional Fitting (IPF)** to balance demographic constraints (age/gender distributions), and  
2) **Probabilistic attribute assignment** to model income distributions within each demographic subgroup.

The pipeline first generates balanced population counts through IPF, then creates individual synthetic records with statistically consistent characteristics. This approach is particularly valuable for  policy simulation and privacy-preserving data analysis where real population data cannot be shared.

In [2]:
survey_data = [
    {'age': 25, 'gender': 'male', 'income': 'low'},
    {'age': 42, 'gender': 'female', 'income': 'high'},
    {'age': 38, 'gender': 'male', 'income': 'medium'},
    {'age': 55, 'gender': 'female', 'income': 'low'},
    {'age': 22, 'gender': 'male', 'income': 'high'},
    {'age': 33, 'gender': 'female', 'income': 'medium'},
    {'age': 45, 'gender': 'male', 'income': 'low'},
    {'age': 50, 'gender': 'female', 'income': 'high'},
    {'age': 28, 'gender': 'male', 'income': 'medium'},
    {'age': 35, 'gender': 'female', 'income': 'low'},
    {'age': 48, 'gender': 'male', 'income': 'high'},
    {'age': 52, 'gender': 'female', 'income': 'medium'},
    {'age': 20, 'gender': 'male', 'income': 'low'},
    {'age': 30, 'gender': 'female', 'income': 'high'},
    {'age': 40, 'gender': 'male', 'income': 'medium'},
    {'age': 50, 'gender': 'female', 'income': 'low'},
    {'age': 60, 'gender': 'male', 'income': 'high'},
    {'age': 27, 'gender': 'female', 'income': 'medium'},
    {'age': 37, 'gender': 'male', 'income': 'low'},
    {'age': 47, 'gender': 'female', 'income': 'high'},
    {'age': 57, 'gender': 'male', 'income': 'medium'},
    {'age': 23, 'gender': 'female', 'income': 'low'},
    {'age': 32, 'gender': 'male', 'income': 'high'},
    {'age': 43, 'gender': 'female', 'income': 'medium'},
    {'age': 53, 'gender': 'male', 'income': 'low'},
    {'age': 29, 'gender': 'female', 'income': 'high'},
    {'age': 39, 'gender': 'male', 'income': 'medium'},
    {'age': 49, 'gender': 'female', 'income': 'low'},
    {'age': 59, 'gender': 'male', 'income': 'high'},
    {'age': 24, 'gender': 'female', 'income': 'medium'},
    {'age': 25, 'gender': 'male', 'income': 'low'},
    {'age': 42, 'gender': 'female', 'income': 'high'},
    {'age': 38, 'gender': 'male', 'income': 'medium'},
    {'age': 55, 'gender': 'female', 'income': 'low'},
    {'age': 22, 'gender': 'male', 'income': 'high'},
    {'age': 33, 'gender': 'female', 'income': 'medium'},
    {'age': 45, 'gender': 'male', 'income': 'low'},
    {'age': 50, 'gender': 'female', 'income': 'high'},
    {'age': 28, 'gender': 'male', 'income': 'medium'},
    {'age': 35, 'gender': 'female', 'income': 'low'},
    {'age': 48, 'gender': 'male', 'income': 'high'},
    {'age': 52, 'gender': 'female', 'income': 'medium'},
    {'age': 20, 'gender': 'male', 'income': 'low'},
    {'age': 30, 'gender': 'female', 'income': 'high'},
    {'age': 40, 'gender': 'male', 'income': 'medium'},
    {'age': 50, 'gender': 'female', 'income': 'low'},
    {'age': 60, 'gender': 'male', 'income': 'high'},
    {'age': 27, 'gender': 'female', 'income': 'medium'},
    {'age': 37, 'gender': 'male', 'income': 'low'},
    {'age': 47, 'gender': 'female', 'income': 'high'},
    {'age': 57, 'gender': 'male', 'income': 'medium'},
    {'age': 23, 'gender': 'female', 'income': 'low'},
    {'age': 32, 'gender': 'male', 'income': 'high'},
    {'age': 43, 'gender': 'female', 'income': 'medium'},
    {'age': 53, 'gender': 'male', 'income': 'low'},
    {'age': 29, 'gender': 'female', 'income': 'high'},
    {'age': 39, 'gender': 'male', 'income': 'medium'},
    {'age': 49, 'gender': 'female', 'income': 'low'},
    {'age': 59, 'gender': 'male', 'income': 'high'},
    {'age': 24, 'gender': 'female', 'income': 'medium'},
    {'age': 25, 'gender': 'male', 'income': 'low'},
    {'age': 42, 'gender': 'female', 'income': 'high'},
    {'age': 38, 'gender': 'male', 'income': 'medium'},
    {'age': 55, 'gender': 'female', 'income': 'low'},
    {'age': 22, 'gender': 'male', 'income': 'high'},
    {'age': 33, 'gender': 'female', 'income': 'medium'},
    {'age': 45, 'gender': 'male', 'income': 'low'},
    {'age': 50, 'gender': 'female', 'income': 'high'},
    {'age': 28, 'gender': 'male', 'income': 'medium'},
    {'age': 35, 'gender': 'female', 'income': 'low'},
    {'age': 48, 'gender': 'male', 'income': 'high'},
    {'age': 52, 'gender': 'female', 'income': 'medium'},
    {'age': 20, 'gender': 'male', 'income': 'low'},
    {'age': 30, 'gender': 'female', 'income': 'high'},
    {'age': 40, 'gender': 'male', 'income': 'medium'},
    {'age': 50, 'gender': 'female', 'income': 'low'},
    {'age': 60, 'gender': 'male', 'income': 'high'},
    {'age': 27, 'gender': 'female', 'income': 'medium'},
    {'age': 37, 'gender': 'male', 'income': 'low'},
    {'age': 47, 'gender': 'female', 'income': 'high'},
    {'age': 57, 'gender': 'male', 'income': 'medium'},
    {'age': 23, 'gender': 'female', 'income': 'low'},
    {'age': 32, 'gender': 'male', 'income': 'high'},
    {'age': 43, 'gender': 'female', 'income': 'medium'},
    {'age': 53, 'gender': 'male', 'income': 'low'},
    {'age': 29, 'gender': 'female', 'income': 'high'},
    {'age': 39, 'gender': 'male', 'income': 'medium'},
    {'age': 49, 'gender': 'female', 'income': 'low'},
    {'age': 59, 'gender': 'male', 'income': 'high'},
    {'age': 24, 'gender': 'female', 'income': 'medium'}
]

# Example usage:
age_categories = {
    '18-30': (18, 30),
    '31-50': (31, 50),
    '51+': (51, 200)  # Using 200 as upper bound for "51+"
}

target_table={"row":{"18-30": 30, "31-50": 50, "51+": 20},"col":{"male": 60, "female": 40}}

In [3]:
def create_contingency(data, row_var, col_var, categories=None):
    """
    Create a contingency table with optional categorization of row variable
    
    Parameters:
    - data: List of dictionaries containing the survey data
    - row_var: The variable to use as rows
    - col_var: The variable to use as columns
    - categories: Optional dictionary specifying how to categorize row_var values
                 Format: {'category_name': (min, max), ...}
                 Example: {'18-30': (18, 30), '31-50': (31, 50), '51+': (51, 200)}
    """
    processed_data = []
    
    # Apply categorization if specified
    if categories and row_var in data[0] and isinstance(data[0][row_var], (int, float)):
        for entry in data:
            value = entry[row_var]
            categorized_value = None
            
            # Find which category the value belongs to
            for cat_name, (min_val, max_val) in categories.items():
                if min_val <= value <= max_val:
                    categorized_value = cat_name
                    break
            
            # If no category matched and there's a default, use it
            if categorized_value is None and 'default' in categories:
                categorized_value = categories['default']
            
            if categorized_value is not None:
                new_entry = entry.copy()
                new_entry[row_var] = categorized_value
                processed_data.append(new_entry)
            else:
                processed_data.append(entry.copy())
    else:
        processed_data = data.copy()
    
    # Initialize counts
    row_categories = sorted({d[row_var] for d in processed_data})
    col_categories = sorted({d[col_var] for d in processed_data})
    
    # Create empty table
    table = {row: {col: 0 for col in col_categories} for row in row_categories}
    
    # Count occurrences
    for entry in processed_data:
        table[entry[row_var]][entry[col_var]] += 1
    
    # Calculate margins
    row_totals = {row: sum(cols.values()) for row, cols in table.items()}
    col_totals = {col: sum(table[row][col] for row in row_categories) 
                 for col in col_categories}
    grand_total = sum(row_totals.values())
    
    return {
        "table": table,
        "row_totals": row_totals,
        "col_totals": col_totals,
        "grand_total": grand_total
    }

# Example usage:


---

# Iterative Proportional Fitting (IPF) Algorithm Breakdown

## Overview
The `ipf()` function implements the **Iterative Proportional Fitting (IPF)** algorithm, which adjusts a contingency table (seed matrix) to match predefined row and column marginal totals (`row_targets` and `col_targets`). It iteratively scales rows and columns until convergence or a maximum iteration limit is reached.

---

## Input Parameters

| Parameter      | Type          | Description                                                                 |
|---------------|--------------|-----------------------------------------------------------------------------|
| `seed`        | `dict`       | Initial matrix as a nested dictionary: `{row: {col: value}}`                |
| `row_targets` | `dict`       | Target row sums: `{row: target_tablesum}`                                       |
| `col_targets` | `dict`       | Target column sums: `{col: target_tablesum}`                                    |
| `max_iter`    | `int`        | Maximum iterations (default: `10`).                                        |
| `tol`         | `float`      | Tolerance for convergence (default: `1e-6`).                               |

---

## Step-by-Step Execution

### 1. Initialization
- **Copy `seed`:**
  ```python
  current = {row: cols.copy() for row, cols in seed.items()}
  ```
  Creates a modifiable copy of the seed matrix.

- **Extract Categories:**
  ```python
  row_categories = list(row_targets.keys())
  col_categories = list(col_targets.keys())
  ```
  Lists all row and column keys from the target dictionaries.

---

### 2. Iterative Adjustment
For each iteration until `max_iter` or convergence:

#### **Row Adjustment**
For each row:
1. Compute current row sum:
   ```python
   row_sum = sum(current[row].values())
   ```
2. Skip if sum is zero (to avoid division by zero).  
3. Calculate scaling factor:
   ```python
   factor = row_targets[row] / row_sum
   ```
4. Scale all columns in the row:
   ```python
   current[row][col] *= factor
   ```

#### **Column Adjustment**
For each column:
1. Compute current column sum across all rows:
   ```python
   col_sum = sum(current[row][col] for row in row_categories)
   ```
2. Skip if sum is zero.  
3. Calculate scaling factor:
   ```python
   factor = col_targets[col] / col_sum
   ```
4. Scale the column in all rows:
   ```python
   current[row][col] *= factor
   ```

---

### 3. Convergence Check
After each iteration:
- **Row Check:**
  ```python
  abs(sum(current[row].values()) - row_targets[row]) > tol
  ```
  Checks if all row sums match targets within tolerance.

- **Column Check:**
  ```python
  abs(sum(current[row][col] for row in row_categories) - col_targets[col]) > tol
  ```
  Checks if all column sums match targets within tolerance.

If both checks pass, the loop breaks early.

---

## Output
Returns the adjusted matrix `current` with row/column sums matching the targets as closely as possible.

---

## Key Properties
- **Preservation of Structure**: Maintains zero entries (structural zeros) from the seed.
- **Convergence**: Guaranteed if targets are compatible (i.e., total row sums = total column sums).
- **Use Case**: Common in statistics for reweighting survey data or contingency tables.

---


In [4]:
def ipf(seed, row_targets, col_targets, max_iter=10, tol=1e-6):
    current = {row: cols.copy() for row, cols in seed.items()}
    row_categories = list(row_targets.keys())
    col_categories = list(col_targets.keys())
    
    for _ in range(max_iter):
        # Adjust rows
        for row in row_categories:
            row_sum = sum(current[row].values())
            if row_sum == 0: continue
            factor = row_targets[row] / row_sum
            for col in col_categories:
                current[row][col] *= factor
        
        # Adjust columns
        for col in col_categories:
            col_sum = sum(current[row][col] for row in row_categories)
            if col_sum == 0: continue
            factor = col_targets[col] / col_sum
            for row in row_categories:
                current[row][col] *= factor
        
        # Check convergence
        converged = True
        for row in row_categories:
            if abs(sum(current[row].values()) - row_targets[row]) > tol:
                converged = False
                break
        for col in col_categories:
            if abs(sum(current[row][col] for row in row_categories) - col_targets[col]) > tol:
                converged = False
                break
        if converged:
            break
    
    return current

In [5]:
def create_crosstab(data):
    # Initialize the crosstab dictionary
    crosstab = {}

    # Calculate the sums for each age group and round the values
    for age_group, values in data.items():
        rounded_values = {key: round(value) for key, value in values.items()}
        crosstab[age_group] = rounded_values
        # crosstab[age_group]['Total'] = sum(rounded_values.values())

    # Calculate the sums for each gender and round the values
    total_female = math.ceil(sum(values['female'] for values in data.values()))
    total_male = math.ceil(sum(values['male'] for values in data.values()))

    # Prepare the totals as a separate data structure
    totals = {
        'female': total_female,
        'male': total_male,
        'Total': total_female + total_male
    }

    return crosstab, totals


In [6]:

# Generate contingency table with age categories
results = create_contingency(
    survey_data, 
    row_var="age", 
    col_var="gender", 
    categories=age_categories
)

print("Contingency Table:")
for age_group, counts in results["table"].items():
    print(f"{age_group}: {counts}")
print("\nRow Totals:", results["row_totals"])
print("Column Totals:", results["col_totals"])
print("Grand Total:", results["grand_total"])
seed = results["table"]
print(seed)

Contingency Table:
18-30: {'female': 15, 'male': 12}
31-50: {'female': 24, 'male': 21}
51+: {'female': 6, 'male': 12}

Row Totals: {'18-30': 27, '31-50': 45, '51+': 18}
Column Totals: {'female': 45, 'male': 45}
Grand Total: 90
{'18-30': {'female': 15, 'male': 12}, '31-50': {'female': 24, 'male': 21}, '51+': {'female': 6, 'male': 12}}


In [7]:
result = ipf(seed, target_table["row"], target_table["col"])
ipf_result,totals = create_crosstab(result) # rounds the fractional values from the ipf
print(target_table)
print(ipf_result)
print(totals)

{'row': {'18-30': 30, '31-50': 50, '51+': 20}, 'col': {'male': 60, 'female': 40}}
{'18-30': {'female': 14, 'male': 16}, '31-50': {'female': 21, 'male': 29}, '51+': {'female': 5, 'male': 15}}
{'female': 40, 'male': 61, 'Total': 101}


In [8]:
def income_distribution(age_categories,survey_data):


    # Initialize the income distribution structure
    income_distribution = {}

    # Initialize counts for each age group and gender
    for age_group in age_categories:
        income_distribution[age_group] = {
            'male': {'low': 0, 'medium': 0, 'high': 0},
            'female': {'low': 0, 'medium': 0, 'high': 0}
        }

    # Count income occurrences
    for person in survey_data:
        age = person['age']
        gender = person['gender']
        income = person['income']
        
        # Find which age category this person belongs to
        age_group = None
        for group, (min_age, max_age) in age_categories.items():
            if min_age <= age <= max_age:
                age_group = group
                break
        
        if age_group:
            income_distribution[age_group][gender][income] += 1

    # Convert counts to probabilities
    for age_group in income_distribution:
        for gender in income_distribution[age_group]:
            total = sum(income_distribution[age_group][gender].values())
            if total > 0:  # Avoid division by zero
                for income in income_distribution[age_group][gender]:
                    income_distribution[age_group][gender][income] /= total

    return income_distribution

In [9]:


a_income_distribution = income_distribution(age_categories,survey_data)
print(a_income_distribution)

{'18-30': {'male': {'low': 0.5, 'medium': 0.25, 'high': 0.25}, 'female': {'low': 0.2, 'medium': 0.4, 'high': 0.4}}, '31-50': {'male': {'low': 0.2857142857142857, 'medium': 0.42857142857142855, 'high': 0.2857142857142857}, 'female': {'low': 0.375, 'medium': 0.25, 'high': 0.375}}, '51+': {'male': {'low': 0.25, 'medium': 0.25, 'high': 0.5}, 'female': {'low': 0.5, 'medium': 0.5, 'high': 0.0}}}


Here's a detailed markdown explanation of the `create_synthetic_population` function:

# Synthetic Population Generation Function

## Overview
This function creates a synthetic population dataset by combining:
1. Population counts from IPF (Iterative Proportional Fitting)
2. Income distribution probabilities

## Function Signature
```python
def create_synthetic_population(ipf_counts, income_dist):
```

## Parameters

| Parameter | Type | Description |
|-----------|------|-------------|
| `ipf_counts` | `dict` | Dictionary containing population counts by age group and gender |
| `income_dist` | `dict` | Dictionary containing income probability distributions by age group and gender |

## Data Structures

### Input Structure
- `ipf_counts`:
  ```python
  {
      'age_group1': {
          'male': count,
          'female': count
      },
      # ... other age groups
  }
  ```
  
- `income_dist`:
  ```python
  {
      'age_group1': {
          'male': {'low': prob, 'medium': prob, 'high': prob},
          'female': {'low': prob, 'medium': prob, 'high': prob}
      },
      # ... other age groups
  }
  ```

### Output Structure
Returns a list of synthetic individuals:
```python
[
    {
        'age': integer,
        'gender': string,
        'income': string,
        'age_group': string
    },
    # ... more individuals
]
```

## Step-by-Step Execution

### 1. Initialization and Verification
```python
synthetic_pop = []
ipf_total = sum(sum(gender_counts.values()) for gender_counts in ipf_counts.values())
```
- Creates empty list to store synthetic population
- Calculates total population count from IPF data

### 2. Population Generation Loop
```python
for age_group in ipf_counts:
    for gender in ipf_counts[age_group]:
        count = ipf_counts[age_group][gender]
        income_probs = income_dist[age_group][gender]
```
- Iterates through each age group and gender combination
- Gets count of individuals and income probabilities for each group

### 3. Income Assignment
```python
income_levels = list(income_probs.keys())
probabilities = list(income_probs.values())

for _ in range(int(count)):
    rand = random.random()
    cumulative_prob = 0
    income = income_levels[-1]  # default
    
    for i, prob in enumerate(probabilities):
        cumulative_prob += prob
        if rand <= cumulative_prob:
            income = income_levels[i]
            break
```
- For each individual:
  - Generates random number between 0 and 1
  - Uses cumulative probability to assign income level
  - Defaults to last category if no match (safeguard)

### 4. Age Assignment
```python
if age_group == '18-30':
    age = random.randint(18, 30)
elif age_group == '31-50':
    age = random.randint(31, 50)
else:  # 51+
    age = random.randint(51, 70)
```
- Randomly assigns age within predefined ranges for each age group
- Uses inclusive bounds (both min and max values possible)

### 5. Individual Creation
```python
synthetic_pop.append({
    'age': age,
    'gender': gender,
    'income': income,
    'age_group': age_group
})
```
- Creates dictionary with all attributes
- Includes age_group for easier analysis later

## Example Usage

```python
ipf_counts = {
    '18-30': {'male': 150, 'female': 160},
    '31-50': {'male': 200, 'female': 210},
    '51+': {'male': 180, 'female': 190}
}

income_dist = {
    '18-30': {
        'male': {'low': 0.4, 'medium': 0.35, 'high': 0.25},
        'female': {'low': 0.45, 'medium': 0.3, 'high': 0.25}
    },
    # ... similar for other age groups
}

synthetic_data = create_synthetic_population(ipf_counts, income_dist)
```

## Key Features

1. **Proportional Sampling**: Maintains exact population counts from IPF
2. **Probabilistic Income Assignment**: Respects income distribution probabilities
3. **Realistic Age Distribution**: Uniform distribution within age groups
4. **Complete Records**: Each individual has all required attributes
5. **Traceability**: Includes age_group for verification

## Output Interpretation
The output contains individual records with:
- Random ages within specified age group ranges
- Gender as specified in the IPF counts
- Income levels assigned according to the probability distributions
- Age group label for grouping/analysis purposes

This function is particularly useful for microsimulation modeling and synthetic data generation for privacy-preserving analytics.

In [10]:
def create_synthetic_population(ipf_counts, income_dist):
    print(ipf_counts)
    print(income_dist)
    synthetic_pop = []
    
    # First verify the IPF counts match the total population
    ipf_total = sum(sum(gender_counts.values()) for gender_counts in ipf_counts.values())
    
    
    for age_group in ipf_counts:
        for gender in ipf_counts[age_group]:
            count = ipf_counts[age_group][gender]
            
            # Get income probabilities for this group
            income_probs = income_dist[age_group][gender]
            income_levels = list(income_probs.keys())
            probabilities = list(income_probs.values())
            
            # Generate individuals for this group
            for _ in range(int(count)):
                # Select income based on probabilities
                rand = random.random()
                cumulative_prob = 0
                income = income_levels[-1]  # default to last category
                
                for i, prob in enumerate(probabilities):
                    cumulative_prob += prob
                    if rand <= cumulative_prob:
                        income = income_levels[i]
                        break
                
                # Generate random age within the age group range
                if age_group == '18-30':
                    age = random.randint(18, 30)
                elif age_group == '31-50':
                    age = random.randint(31, 50)
                else:  # 51+
                    age = random.randint(51, 70)
                
                synthetic_pop.append({
                    'age': age,
                    'gender': gender,
                    'income': income,
                    'age_group': age_group  # Adding for easier analysis
                })
    return synthetic_pop

In [11]:
population = create_synthetic_population(ipf_result, a_income_distribution)

{'18-30': {'female': 14, 'male': 16}, '31-50': {'female': 21, 'male': 29}, '51+': {'female': 5, 'male': 15}}
{'18-30': {'male': {'low': 0.5, 'medium': 0.25, 'high': 0.25}, 'female': {'low': 0.2, 'medium': 0.4, 'high': 0.4}}, '31-50': {'male': {'low': 0.2857142857142857, 'medium': 0.42857142857142855, 'high': 0.2857142857142857}, 'female': {'low': 0.375, 'medium': 0.25, 'high': 0.375}}, '51+': {'male': {'low': 0.25, 'medium': 0.25, 'high': 0.5}, 'female': {'low': 0.5, 'medium': 0.5, 'high': 0.0}}}


In [12]:



# Print results
print("Synthetic Population of 1000 people:")
selected_individuals = random.sample(population, 10)
for i, person in enumerate(selected_individuals):  # Print first 10 as example
    print(f"{i+1}. Age {person['age']} {person['gender']} - Income: {person['income']}")

# Print summary statistics
print("\nSummary Statistics:")
age_groups = {}
genders = {}
incomes = {}

for person in population:
    # Categorize age
    age = person['age']
    if 18 <= age <= 30:
        age_group = '18-30'
    elif 31 <= age <= 50:
        age_group = '31-50'
    else:
        age_group = '51+'
    
    age_groups[age_group] = age_groups.get(age_group, 0) + 1
    genders[person['gender']] = genders.get(person['gender'], 0) + 1
    incomes[person['income']] = incomes.get(person['income'], 0) + 1

print("\nAge Groups:")
for group, count in age_groups.items():
    print(f"{group}: {count} people ({count/100*100:.1f}%)")

print("\nGenders:")
for gender, count in genders.items():
    print(f"{gender}: {count} people ({count/100*100:.1f}%)")

print("\nIncome Levels:")
for income, count in incomes.items():
    print(f"{income}: {count} people ({count/100*100:.1f}%)")
print()
print("The target was:")    
print(target_table)

Synthetic Population of 1000 people:
1. Age 38 male - Income: medium
2. Age 69 male - Income: high
3. Age 29 male - Income: medium
4. Age 59 male - Income: high
5. Age 52 female - Income: low
6. Age 25 female - Income: medium
7. Age 37 female - Income: high
8. Age 43 female - Income: low
9. Age 34 male - Income: high
10. Age 49 male - Income: medium

Summary Statistics:

Age Groups:
18-30: 30 people (30.0%)
31-50: 50 people (50.0%)
51+: 20 people (20.0%)

Genders:
female: 40 people (40.0%)
male: 60 people (60.0%)

Income Levels:
low: 31 people (31.0%)
medium: 38 people (38.0%)
high: 31 people (31.0%)

The target was:
{'row': {'18-30': 30, '31-50': 50, '51+': 20}, 'col': {'male': 60, 'female': 40}}
