In [1]:
def create_contingency(data, row_var, col_var, categories=None):
    """
    Create a contingency table with optional categorization of row variable
    
    Parameters:
    - data: List of dictionaries containing the survey data
    - row_var: The variable to use as rows
    - col_var: The variable to use as columns
    - categories: Optional dictionary specifying how to categorize row_var values
                 Format: {'category_name': (min, max), ...}
                 Example: {'18-30': (18, 30), '31-50': (31, 50), '51+': (51, 200)}
    """
    processed_data = []
    
    # Apply categorization if specified
    if categories and row_var in data[0] and isinstance(data[0][row_var], (int, float)):
        for entry in data:
            value = entry[row_var]
            categorized_value = None
            
            # Find which category the value belongs to
            for cat_name, (min_val, max_val) in categories.items():
                if min_val <= value <= max_val:
                    categorized_value = cat_name
                    break
            
            # If no category matched and there's a default, use it
            if categorized_value is None and 'default' in categories:
                categorized_value = categories['default']
            
            if categorized_value is not None:
                new_entry = entry.copy()
                new_entry[row_var] = categorized_value
                processed_data.append(new_entry)
            else:
                processed_data.append(entry.copy())
    else:
        processed_data = data.copy()
    
    # Initialize counts
    row_categories = sorted({d[row_var] for d in processed_data})
    col_categories = sorted({d[col_var] for d in processed_data})
    
    # Create empty table
    table = {row: {col: 0 for col in col_categories} for row in row_categories}
    
    # Count occurrences
    for entry in processed_data:
        table[entry[row_var]][entry[col_var]] += 1
    
    # Calculate margins
    row_totals = {row: sum(cols.values()) for row, cols in table.items()}
    col_totals = {col: sum(table[row][col] for row in row_categories) 
                 for col in col_categories}
    grand_total = sum(row_totals.values())
    
    return {
        "table": table,
        "row_totals": row_totals,
        "col_totals": col_totals,
        "grand_total": grand_total
    }

# Example usage:


In [2]:
def ipf(seed, row_targets, col_targets, max_iter=10, tol=1e-6):
    current = {row: cols.copy() for row, cols in seed.items()}
    row_categories = list(row_targets.keys())
    col_categories = list(col_targets.keys())
    
    for _ in range(max_iter):
        # Adjust rows
        for row in row_categories:
            row_sum = sum(current[row].values())
            if row_sum == 0: continue
            factor = row_targets[row] / row_sum
            for col in col_categories:
                current[row][col] *= factor
        
        # Adjust columns
        for col in col_categories:
            col_sum = sum(current[row][col] for row in row_categories)
            if col_sum == 0: continue
            factor = col_targets[col] / col_sum
            for row in row_categories:
                current[row][col] *= factor
        
        # Check convergence
        converged = True
        for row in row_categories:
            if abs(sum(current[row].values()) - row_targets[row]) > tol:
                converged = False
                break
        for col in col_categories:
            if abs(sum(current[row][col] for row in row_categories) - col_targets[col]) > tol:
                converged = False
                break
        if converged:
            break
    
    return current

In [17]:
# Example usage
survey_data = [
    {'age': 25, 'gender': 'male', 'income': 'low'},
    {'age': 42, 'gender': 'female', 'income': 'high'},
    {'age': 38, 'gender': 'male', 'income': 'medium'},
    {'age': 55, 'gender': 'female', 'income': 'low'},
    {'age': 22, 'gender': 'male', 'income': 'high'},
    {'age': 33, 'gender': 'female', 'income': 'medium'},
    {'age': 45, 'gender': 'male', 'income': 'low'},
    {'age': 50, 'gender': 'female', 'income': 'high'},
    {'age': 28, 'gender': 'male', 'income': 'medium'},
    {'age': 35, 'gender': 'female', 'income': 'low'},
    {'age': 48, 'gender': 'male', 'income': 'high'},
    {'age': 52, 'gender': 'female', 'income': 'medium'},
    {'age': 20, 'gender': 'male', 'income': 'low'},
    {'age': 30, 'gender': 'female', 'income': 'high'},
    {'age': 40, 'gender': 'male', 'income': 'medium'},
    {'age': 50, 'gender': 'female', 'income': 'low'},
    {'age': 60, 'gender': 'male', 'income': 'high'},
    {'age': 27, 'gender': 'female', 'income': 'medium'},
    {'age': 37, 'gender': 'male', 'income': 'low'},
    {'age': 47, 'gender': 'female', 'income': 'high'},
    {'age': 57, 'gender': 'male', 'income': 'medium'},
    {'age': 23, 'gender': 'female', 'income': 'low'},
    {'age': 32, 'gender': 'male', 'income': 'high'},
    {'age': 43, 'gender': 'female', 'income': 'medium'},
    {'age': 53, 'gender': 'male', 'income': 'low'},
    {'age': 29, 'gender': 'female', 'income': 'high'},
    {'age': 39, 'gender': 'male', 'income': 'medium'},
    {'age': 49, 'gender': 'female', 'income': 'low'},
    {'age': 59, 'gender': 'male', 'income': 'high'},
    {'age': 24, 'gender': 'female', 'income': 'medium'}
]

# Example usage:
age_categories = {
    '18-30': (18, 30),
    '31-50': (31, 50),
    '51+': (51, 200)  # Using 200 as upper bound for "51+"
}

# Generate contingency table with age categories
results = create_contingency(
    survey_data, 
    row_var="age", 
    col_var="gender", 
    categories=age_categories
)

print("Contingency Table:")
for age_group, counts in results["table"].items():
    print(f"{age_group}: {counts}")
print("\nRow Totals:", results["row_totals"])
print("Column Totals:", results["col_totals"])
print("Grand Total:", results["grand_total"])
seed = results["table"]

Contingency Table:
18-30: {'female': 5, 'male': 4}
31-50: {'female': 8, 'male': 7}
51+: {'female': 2, 'male': 4}

Row Totals: {'18-30': 9, '31-50': 15, '51+': 6}
Column Totals: {'female': 15, 'male': 15}
Grand Total: 30


In [19]:
seed

{'18-30': {'female': 5, 'male': 4},
 '31-50': {'female': 8, 'male': 7},
 '51+': {'female': 2, 'male': 4}}

In [20]:


# Target margins
row_targets = {"18-30": 300, "31-50": 500, "51+": 200}
col_targets = {"male": 600, "female": 400}

In [21]:
result = ipf(seed, row_targets, col_targets)
print("Final IPF result:", result)

Final IPF result: {'18-30': {'female': 135.54528484949748, 'male': 164.4547150905441}, '31-50': {'female': 214.86637423771896, 'male': 285.1336257049412}, '51+': {'female': 49.58834091278355, 'male': 150.41165920451465}}


In [25]:
def income_distribution(age_categories,survey_data):


    # Initialize the income distribution structure
    income_distribution = {}

    # Initialize counts for each age group and gender
    for age_group in age_categories:
        income_distribution[age_group] = {
            'male': {'low': 0, 'medium': 0, 'high': 0},
            'female': {'low': 0, 'medium': 0, 'high': 0}
        }

    # Count income occurrences
    for person in survey_data:
        age = person['age']
        gender = person['gender']
        income = person['income']
        
        # Find which age category this person belongs to
        age_group = None
        for group, (min_age, max_age) in age_categories.items():
            if min_age <= age <= max_age:
                age_group = group
                break
        
        if age_group:
            income_distribution[age_group][gender][income] += 1

    # Convert counts to probabilities
    for age_group in income_distribution:
        for gender in income_distribution[age_group]:
            total = sum(income_distribution[age_group][gender].values())
            if total > 0:  # Avoid division by zero
                for income in income_distribution[age_group][gender]:
                    income_distribution[age_group][gender][income] /= total

    return income_distribution

In [29]:
age_categories = {
    '18-30': (18, 30),
    '31-50': (31, 50),
    '51+': (51, 200)
}

a_income_distribution = income_distribution(age_categories,survey_data)

In [41]:
def create_synthetic_population(ipf_counts, income_dist):
    synthetic_pop = []
    
    # First verify the IPF counts match the total population
    ipf_total = sum(sum(gender_counts.values()) for gender_counts in ipf_counts.values())
    
    
    for age_group in ipf_counts:
        for gender in ipf_counts[age_group]:
            count = ipf_counts[age_group][gender]
            
            # Get income probabilities for this group
            income_probs = income_dist[age_group][gender]
            income_levels = list(income_probs.keys())
            probabilities = list(income_probs.values())
            
            # Generate individuals for this group
            for _ in range(int(count)):
                # Select income based on probabilities
                rand = random.random()
                cumulative_prob = 0
                income = income_levels[-1]  # default to last category
                
                for i, prob in enumerate(probabilities):
                    cumulative_prob += prob
                    if rand <= cumulative_prob:
                        income = income_levels[i]
                        break
                
                # Generate random age within the age group range
                if age_group == '18-30':
                    age = random.randint(18, 30)
                elif age_group == '31-50':
                    age = random.randint(31, 50)
                else:  # 51+
                    age = random.randint(51, 70)
                
                synthetic_pop.append({
                    'age': age,
                    'gender': gender,
                    'income': income,
                    'age_group': age_group  # Adding for easier analysis
                })
    
    # Verify we created the correct number

    
    return synthetic_pop

In [44]:
import random
population = create_synthetic_population(result, a_income_distribution)

# Print results
print("Synthetic Population of 1000 people:")
selected_individuals = random.sample(population, 10)
for i, person in enumerate(selected_individuals):  # Print first 10 as example
    print(f"{i+1}. Age {person['age']} {person['gender']} - Income: {person['income']}")

# Print summary statistics
print("\nSummary Statistics:")
age_groups = {}
genders = {}
incomes = {}

for person in population:
    # Categorize age
    age = person['age']
    if 18 <= age <= 30:
        age_group = '18-30'
    elif 31 <= age <= 50:
        age_group = '31-50'
    else:
        age_group = '51+'
    
    age_groups[age_group] = age_groups.get(age_group, 0) + 1
    genders[person['gender']] = genders.get(person['gender'], 0) + 1
    incomes[person['income']] = incomes.get(person['income'], 0) + 1

print("\nAge Groups:")
for group, count in age_groups.items():
    print(f"{group}: {count} people ({count/50*100:.1f}%)")

print("\nGenders:")
for gender, count in genders.items():
    print(f"{gender}: {count} people ({count/50*100:.1f}%)")

print("\nIncome Levels:")
for income, count in incomes.items():
    print(f"{income}: {count} people ({count/50*100:.1f}%)")

Synthetic Population of 1000 people:
1. Age 46 female - Income: medium
2. Age 19 male - Income: medium
3. Age 36 male - Income: low
4. Age 23 male - Income: low
5. Age 37 male - Income: high
6. Age 35 female - Income: low
7. Age 32 male - Income: high
8. Age 23 female - Income: low
9. Age 69 male - Income: high
10. Age 29 male - Income: high

Summary Statistics:

Age Groups:
18-30: 299 people (598.0%)
31-50: 499 people (998.0%)
51+: 199 people (398.0%)

Genders:
female: 398 people (796.0%)
male: 599 people (1198.0%)

Income Levels:
low: 332 people (664.0%)
high: 310 people (620.0%)
medium: 355 people (710.0%)
