In [1]:
using DelimitedFiles

In [2]:
"""
    load_csv_with_headers_matrix(filename::String)

Read a comma-separated file whose first line holds the column names and whose
remaining lines hold integer data.

# Returns
A tuple `(headers, data_matrix)`:
- `headers`: the first line of the file split on `","`.
- `data_matrix`: every line after the first, parsed by `readdlm` as a matrix of `Int`.
"""
function load_csv_with_headers_matrix(filename::String)
    # The header row is text, so it is read on its own rather than through readdlm.
    header_line = readline(filename)
    column_names = split(header_line, ",")

    # `skipstart=1` makes readdlm ignore the header row that was just consumed above.
    values = readdlm(filename, ',', Int; skipstart=1)

    return column_names, values
end;

In [3]:
# Load the census table and the survey table (header row + integer matrix each).
cencus_headers, cencus_data = load_csv_with_headers_matrix("Data/toy_cencus.csv")
survay_headers, survay_data = load_csv_with_headers_matrix("Data/toy_survay.csv")

# Sanity check: later cells compare survey columns against census columns
# position by position, so both files must expose the same header row.
println(survay_headers == cencus_headers)

true


In [4]:
# Row of `cencus_data` that the synthetic population is built for.
area = 1

# Number of individuals to draw: the sum of the first two census columns for this area.
# NOTE(review): presumably those two columns partition the population — confirm.
total = sum(cencus_data[area, 1:2])

# Initial population: `total` survey row indices drawn uniformly at random
# (the same row may be drawn more than once).
survey_data_length = size(survay_data, 1)
random_indices = rand(1:survey_data_length, total);

In [5]:
# One total per header column.
cols = length(survay_headers)

# Column-wise sum over the sampled survey rows, kept as a 1×cols integer row
# matrix. A row drawn several times in `random_indices` is counted each time.
synth_pop_totals = sum(view(survay_data, random_indices, 1:cols); dims=1);

In [6]:
"""
    mutate(random_indices, survay_data, synth_pop_totals; target_totals=cencus_data[area:area, :])

Propose a single-swap mutation of a synthetic population and score it.

One slot of `random_indices` is chosen at random, the survey row currently held
in that slot is subtracted from the totals, and a randomly chosen survey row is
added in its place. None of the arguments is modified; the caller decides
whether to apply the proposal.

# Arguments
- `random_indices`: vector of row indices into `survay_data`, one per individual
  in the synthetic population.
- `survay_data`: matrix with one row per surveyed individual.
- `synth_pop_totals`: current column totals of the synthetic population as a
  1×k row matrix, where k is the number of columns of `survay_data`.

# Keywords
- `target_totals`: 1×k row matrix the proposal is scored against. Defaults to
  row `area` of the global `cencus_data`, looked up when the function is called.

# Returns
A tuple `(remove_person_pos, add_person_idx, temp_synth_pop_totals, temp_manhattan_dist)`:
- `remove_person_pos`: position in `random_indices` of the individual to replace.
- `add_person_idx`: row of `survay_data` proposed as the replacement.
- `temp_synth_pop_totals`: totals after the swap (new 1×k matrix).
- `temp_manhattan_dist`: sum of absolute differences between
  `temp_synth_pop_totals` and `target_totals`.

# Throws
- `ArgumentError` if `random_indices` is empty.
- `DimensionMismatch` if `synth_pop_totals` is not a single row or its column
  count differs from that of `survay_data`.
"""
function mutate(random_indices, survay_data, synth_pop_totals;
                target_totals=cencus_data[area:area, :])
    # Validate inputs with real exceptions (`@assert` may be compiled out).
    isempty(random_indices) &&
        throw(ArgumentError("random_indices must contain at least one individual"))
    size(synth_pop_totals, 1) == 1 ||
        throw(DimensionMismatch("synth_pop_totals should be a row vector"))
    size(survay_data, 2) == size(synth_pop_totals, 2) ||
        throw(DimensionMismatch(
            "Dimension mismatch between survey data and population totals"))

    # Pick the slot of the synthetic population to replace ...
    remove_person_pos = rand(eachindex(random_indices))
    # ... and look up which survey row currently occupies it. The row must be
    # addressed by this survey index, not by the slot position, otherwise the
    # wrong individual is subtracted and the totals drift from the population.
    remove_person_idx = random_indices[remove_person_pos]
    remove_person = survay_data[remove_person_idx:remove_person_idx, :]

    # Pick the replacement uniformly from the whole survey.
    add_person_idx = rand(axes(survay_data, 1))
    add_person = survay_data[add_person_idx:add_person_idx, :]

    # Range indexing (i:i) keeps both individuals as 1×k matrices, so they line
    # up with `synth_pop_totals` and mismatched shapes raise instead of broadcasting.
    temp_synth_pop_totals = synth_pop_totals - remove_person + add_person

    # Fitness: Manhattan (L1) distance to the target totals.
    temp_manhattan_dist = sum(abs, temp_synth_pop_totals - target_totals)

    return remove_person_pos, add_person_idx, temp_synth_pop_totals, temp_manhattan_dist
end;

In [7]:
"""
    annealing(random_indices, survay_data, synth_pop_totals)

Fit a synthetic population to the census totals by simulated annealing.

Each iteration asks `mutate` for a single-swap proposal and accepts it when it
lowers the Manhattan distance to the census row, or otherwise with probability
`exp(-delta / temperature)`. The temperature decays geometrically and is
clamped at a small positive floor. The loop stops when the distance reaches 0
or after 200 consecutive rejected proposals.

The target is row `area` of the global `cencus_data`; both globals must be
defined before calling. The current distance is printed after every iteration.

# Arguments
- `random_indices`: vector of row indices into `survay_data` describing the
  starting population. **Modified in place** as proposals are accepted.
- `survay_data`: matrix with one row per surveyed individual.
- `synth_pop_totals`: 1×k column totals of the starting population. Not
  modified; the function tracks updated totals internally.

# Returns
- `random_indices`: the same vector that was passed in, holding the final population.
"""
function annealing(random_indices, survay_data, synth_pop_totals)
    # Cooling schedule.
    temperature = 1.0        # initial temperature (high = more exploration)
    cooling_rate = 0.95      # geometric decay applied every iteration
    min_temperature = 1e-10  # floor; keeps `delta / temperature` away from 0/0 = NaN
    max_rejections = 200     # consecutive rejections tolerated before giving up

    # The census target is loop-invariant, so slice it once.
    target_totals = cencus_data[area:area, :]

    # Starting fitness: Manhattan distance between current totals and the target.
    manhattan_dist = sum(abs, synth_pop_totals - target_totals)
    counter = 0  # consecutive rejected proposals

    while counter < max_rejections && manhattan_dist > 0
        # Propose swapping one individual of the population.
        remove_person_pos, add_person_idx, temp_synth_pop_totals, temp_manhattan_dist =
            mutate(random_indices, survay_data, synth_pop_totals)

        # Negative delta means the proposal is an improvement.
        delta = temp_manhattan_dist - manhattan_dist

        # Improvements are always taken; otherwise accept with probability
        # exp(-delta / temperature). `rand()` is only drawn for non-improving
        # proposals because `||` short-circuits.
        if delta < 0 || exp(-delta / temperature) > rand()
            random_indices[remove_person_pos] = add_person_idx
            synth_pop_totals = temp_synth_pop_totals  # rebinds the local only
            manhattan_dist = temp_manhattan_dist
            counter = 0
        else
            counter += 1
        end

        # Cool down, but never below the floor: without the clamp the
        # temperature eventually underflows to 0.0 on long runs.
        temperature = max(temperature * cooling_rate, min_temperature)

        # Progress trace (no newline, so the whole run stays on one line).
        print("$manhattan_dist ")
    end

    return random_indices
end;

annealing (generic function with 1 method)

In [8]:
# Show the starting population. This must happen before the call below, because
# `annealing` overwrites entries of `random_indices` in place.
println(random_indices)

# Run the optimisation. The returned vector is the same object as
# `random_indices`, now holding the fitted population.
synthpop = annealing(random_indices, survay_data, synth_pop_totals)

# `annealing` prints its distance trace without a trailing newline; end that
# line so the final population is not glued onto the end of the trace.
println()
println(synthpop)

[42, 178, 150, 78, 94, 62, 144, 32, 154, 46, 157, 88, 96, 69, 22, 176, 12, 154, 2, 116, 128, 51, 41, 45, 68, 26, 125, 81, 58, 118, 39, 85, 137, 129, 128, 112, 86, 64, 58, 164, 22, 92, 25, 134, 175, 175, 79, 34, 17, 76, 199, 135, 144, 127, 154, 19, 176, 36, 143, 118, 172, 80, 166, 189, 74, 198, 190]
60 60 60 60 60 58 58 58 58 56 56 56 56 58 58 56 56 56 56 56 56 56 56 54 54 54 54 54 52 52 50 50 50 50 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 46 46 46 46 46 46 46 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 44 42 42 40 38 38 38 38 38 38 38 38 38 38 36 34 34 34 34 32 32 32 32 30 28 28 28 24 24 24 24 24 22 22 20 20 20 20 20 18 18 18 18 18 18 16 16 16 14 14 14 14 14 14 14 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 10 8 8 8 8 6 6 6 6 6 6 6 6 6 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 [42, 178, 63, 78, 78, 29, 144, 82, 39, 40, 10, 88, 170, 69, 22