In [1]:
%load_ext lab_black

In [2]:
import numpy as np
import pandas as pd

import gc
import timeit
import sys
import os

from IPython.core.interactiveshell import InteractiveShell

# Notebook display setting: with "all", IPython echoes the value of every
# top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Residual Sum of Squares calculations: synthetic data against spring census data

In [4]:
# Restoring the reported (empirical) data variables previously saved with %store
# https://ipython.org/ipython-doc/rel-0.12/config/extensions/storemagic.html
# These data_<municipality> variables are read as globals by run_the_show().

%store -r data_Tingvoll
%store -r data_Surnadal
%store -r data_Sunndal
%store -r data_Vestnes
%store -r data_Laerdal

In [5]:
def sum_squares_spring_census(
    number_of_years,
    municipality_file,
    empirical_observations,
    save_dir="synthetic_data/",
):
    """
    Compute the minimum residual sum of squares (RSS) between synthetic data
    and reported data for every simulation stored in a municipality file.

    The pickled frame is treated as consecutive blocks of ``number_of_years``
    rows, one block per simulation. For each block the predicted
    ``spring_pop`` series ``p`` is multiplied by the least-squares scaling
    factor ``s = sum(p * e) / sum(p * p)``, where ``e`` is the reported
    series, and ``RSS = sum((s * p - e) ** 2)`` is evaluated.

    Parameters
    ----------
    number_of_years : int
        Number of rows (observation years) in each simulation block.
    municipality_file : str
        File name of the pickled synthetic-population frame in ``save_dir``.
    empirical_observations : sequence of numbers
        Reported values, exactly one per observation year.
    save_dir : str, optional
        Prefix prepended verbatim to ``municipality_file`` (so it must end
        with a path separator). Defaults to ``"synthetic_data/"``.

    Returns
    -------
    list of [float, float]
        One ``[scaling_factor, RSS_min]`` pair per complete simulation block,
        in file order. Trailing rows that do not fill a whole block are
        ignored. A block whose predictions are all zero gets scaling factor
        0.0 and ``RSS = sum(e ** 2)`` (every scaling gives that same RSS).

    Raises
    ------
    ValueError
        If ``number_of_years`` is not positive, or if the number of
        ``empirical_observations`` differs from ``number_of_years``.
    """
    if number_of_years <= 0:
        raise ValueError("number_of_years must be a positive integer")

    # Loop-invariant: convert the reported series once
    spring_population_e = np.asarray(empirical_observations)
    if spring_population_e.shape != (number_of_years,):
        # Without this check a length-1 series would be silently broadcast
        raise ValueError(
            "empirical_observations must contain exactly number_of_years "
            "values (got shape {} for number_of_years={})".format(
                spring_population_e.shape, number_of_years
            )
        )

    # Loading the targeted municipality data frame created by the synthetic
    # population generator.
    # NOTE(review): read_pickle can execute arbitrary code; only load files
    # produced by the trusted generator.
    data_frame = pd.read_pickle(save_dir + municipality_file)

    # One row per simulation, one column per observation year
    number_of_simulations = len(data_frame) // number_of_years
    spring_population_p = (
        data_frame["spring_pop"]
        .to_numpy()[: number_of_simulations * number_of_years]
        .reshape(number_of_simulations, number_of_years)
    )

    # Find the scaling factor that minimises RSS for every simulation at once
    cross_products = np.sum(spring_population_p * spring_population_e, axis=1)
    squared_norms = np.sum(spring_population_p * spring_population_p, axis=1)
    scaling_factors = np.divide(
        cross_products,
        squared_norms,
        out=np.zeros(number_of_simulations, dtype=float),
        where=squared_norms != 0,  # all-zero predictions: keep factor 0.0
    )

    # Minimum Residual Sum of Squares of the scaled predictions
    residuals = scaling_factors[:, np.newaxis] * spring_population_p - spring_population_e
    RSS_min = np.sum(residuals**2, axis=1)

    return np.column_stack((scaling_factors, RSS_min)).tolist()

In [6]:
def make_RSS_frame(
    municipality_file, RSS_list, obs_year=2021, save_dir="synthetic_data/"
):
    """
    Build a frame with one row per simulation, holding the simulation's
    ``obs_year`` row plus its scaling factor and RSS, sorted by RSS.

    Parameters
    ----------
    municipality_file : str
        File name of the pickled synthetic-population frame in ``save_dir``.
    RSS_list : list of [scaling_factor, RSS]
        One pair per simulation. Pairs are attached positionally, so this
        assumes the k-th ``obs_year`` row in the file belongs to the k-th
        entry of ``RSS_list``.
    obs_year : int, optional
        Value of the ``obs_year`` column selecting the row kept for each
        simulation. Defaults to 2021.
    save_dir : str, optional
        Prefix prepended verbatim to ``municipality_file``. Defaults to
        ``"synthetic_data/"``.

    Returns
    -------
    pandas.DataFrame
        The kept rows without the per-class columns, with added ``scaling``
        and ``RSS`` columns, sorted ascending by ``RSS`` with a fresh index.

    Raises
    ------
    ValueError
        If the number of ``obs_year`` rows differs from ``len(RSS_list)``;
        a column-wise concat would otherwise silently pad with NaN.
    """

    # Loading the targeted municipality data frame created by the synthetic
    # population generator.
    # NOTE(review): read_pickle can execute arbitrary code; only load files
    # produced by the trusted generator.
    data_frame = pd.read_pickle(save_dir + municipality_file)

    # Columns not used in the RSS frame
    unused_columns = [
        "f_calves",
        "y_hinds",
        "a_hinds",
        "m_calves",
        "y_stags",
        "a_stags",
        "ws_fc",
        "ws_yh",
        "ws_ah",
        "ws_mc",
        "ws_ys",
        "ws_as",
        "c_yh",
        "c_ah",
    ]

    # Keep only the obs_year rows, drop unused columns and reset the index so
    # that the positional concat below lines up with RSS_list
    selected_rows = (
        data_frame[data_frame["obs_year"] == obs_year]
        .drop(columns=unused_columns)
        .reset_index(drop=True)
    )

    if len(selected_rows) != len(RSS_list):
        raise ValueError(
            "Found {} rows with obs_year == {} but RSS_list has {} entries".format(
                len(selected_rows), obs_year, len(RSS_list)
            )
        )

    # Add the two columns from RSS_list
    rss_columns = pd.DataFrame(RSS_list, columns=["scaling", "RSS"])
    df_RSS = pd.concat([selected_rows, rss_columns], axis=1)

    # Sort the frame on the RSS value
    return df_RSS.sort_values(by=["RSS"]).reset_index(drop=True)

In [1]:
def extract_top_hits(
    i,
    sorted_sum_squares_frame,
    filtering_strategy,
    frac_init_pop,
    number_of_top_hits,
):
    """
    Optionally filter the RSS-sorted frame and deliver only the
    ``number_of_top_hits`` best fits (the first rows of the frame).

    Parameters
    ----------
    i : int
        Index into ``frac_init_pop`` selecting the threshold to use.
    sorted_sum_squares_frame : pandas.DataFrame
        Frame already sorted by RSS; the "informed" strategy reads its
        ``tot_pop_ah`` and ``init_pop`` columns.
    filtering_strategy : {"uninformed", "informed"}
        "uninformed" keeps every row. "informed" keeps only rows where
        ``tot_pop_ah > frac_init_pop[i] * init_pop`` (presumably the final
        population relative to the initial one - confirm with the generator).
    frac_init_pop : sequence of float
        Minimum ``tot_pop_ah / init_pop`` ratios, indexed by ``i``.
    number_of_top_hits : int
        Number of leading rows to return.

    Returns
    -------
    pandas.DataFrame
        The first ``number_of_top_hits`` rows that pass the filter.

    Raises
    ------
    ValueError
        If ``filtering_strategy`` is not one of the supported values.
    """

    if filtering_strategy == "uninformed":
        candidates = sorted_sum_squares_frame
    elif filtering_strategy == "informed":
        # Educated guess filtering - can play with these criteria
        passes_filter = (
            sorted_sum_squares_frame["tot_pop_ah"]
            > frac_init_pop[i] * sorted_sum_squares_frame["init_pop"]
        )
        candidates = sorted_sum_squares_frame[passes_filter].reset_index(drop=True)
    else:
        # Previously this fell through to an UnboundLocalError at the return
        raise ValueError(
            "filtering_strategy must be 'uninformed' or 'informed', got {!r}".format(
                filtering_strategy
            )
        )

    return candidates.iloc[0:number_of_top_hits]

In [2]:
def run_the_show():
    """
    For each municipality with spring census data: compute the RSS of every
    synthetic simulation, rank them, keep the best fits, store those as a
    pickle and print/display a summary.

    Reads the notebook-level variables ``data_Tingvoll``, ``data_Surnadal``,
    ``data_Sunndal``, ``data_Vestnes`` and ``data_Laerdal`` and uses the
    notebook's ``display`` function. Returns nothing.
    """
    # The 5 municipalities that have spring census data
    municipalities = ["Tingvoll", "Surnadal", "Sunndal", "Vestnes", "Laerdal"]
    data_municipality = [
        data_Tingvoll,
        data_Surnadal,
        data_Sunndal,
        data_Vestnes,
        data_Laerdal,
    ]
    municipality_frame = [
        "df_original_sorted_Tingvoll.pkl",
        "df_original_sorted_Surnadal.pkl",
        "df_original_sorted_Sunndal.pkl",
        "df_original_sorted_Vestnes.pkl",
        "df_original_sorted_Laerdal.pkl",
    ]

    # Settings shared by all municipalities
    filtering_strategy = "informed"
    frac_init_pop = [0.5] * len(municipalities)  # minimum tot_pop 2021/2006 ratio
    number_of_top_hits = 20
    save_dir = "synthetic_data/"

    for q, (reported_data, frame_file) in enumerate(
        zip(data_municipality, municipality_frame)
    ):
        # The stored record has 17 fields; only the name, the number of
        # observation years and the spring counts are needed here
        (
            municipality,
            _first_year,
            _last_year,
            number_of_years,
            _years,
            _seen_deer_obs,
            _seen_deer_obs_outfield,
            _seen_deer_obs_infield,
            _hinds_per_stag_obs,
            _total_harvest,
            _fraction_female_calves_harvested,
            _fraction_young_hinds_harvested,
            _fraction_adult_hinds_harvested,
            _fraction_male_calves_harvested,
            _fraction_young_stags_harvested,
            _fraction_adult_stags_harvested,
            spring_counts,
        ) = reported_data

        # Score every simulation, then rank and filter
        rss_pairs = sum_squares_spring_census(
            number_of_years, frame_file, spring_counts
        )
        ranked_frame = make_RSS_frame(frame_file, rss_pairs)
        top_hits = extract_top_hits(
            q, ranked_frame, filtering_strategy, frac_init_pop, number_of_top_hits
        )

        # Storing top hits frames
        target_file = (
            save_dir + "top_hits_norm_" + municipality + "_spring_census" + ".pkl"
        )
        top_hits.to_pickle(target_file)

        print("Municipality = ", municipality)
        display(top_hits)
        print()

        # Relative spread of RSS within the top hits, in percent of the minimum
        rss_high = top_hits["RSS"].max()
        rss_low = top_hits["RSS"].min()
        spread_percent = round(((rss_high - rss_low) / rss_low) * 100, 2)
        print("Percentage difference between min and max RSS value: ", spread_percent)
        print()

        print("Mean column values of top_hits_frame:")
        display(top_hits.mean())
        print()
        print()

In [9]:
%%time
# Run the whole analysis for all municipalities by calling run_the_show().
# The work is wrapped in a function so that intermediate frames are local
# variables rather than notebook globals (done to get rid of memory leaks).
run_the_show()

Municipality =  Tingvoll


Unnamed: 0,init_pop,init_hps,sh_threshold,i_fcalves,i_yhinds,i_ahinds,i_mcalves,i_ystags,i_astags,obs_year,spring_pop,tot_pop_bh,tot_pop_ah,hps_bh,hps_ah,mig%,num_migs,scaling,RSS
0,2850,1.3,0.6,285,270,1018,285,218,773,2021,5096,6867,6172,1.4599,1.4558,12.1995,187,0.264309,105041.371256
1,2600,2.0,0.5,416,106,1072,416,318,271,2021,4577,6291,5596,1.6482,1.6664,10.2133,119,0.299876,107793.110566
2,2600,2.0,0.5,351,215,1050,351,253,379,2021,4658,6397,5702,1.5991,1.6109,12.6065,155,0.290088,108854.824402
3,3100,1.3,0.7,403,168,1128,403,299,698,2021,5070,6832,6137,1.3068,1.2888,5.7693,92,0.258273,111179.350623
4,2950,1.7,0.6,501,183,1042,501,288,432,2021,5086,6901,6206,1.4758,1.4734,9.6083,145,0.259192,111460.189228
5,2400,1.7,0.5,264,165,1013,264,194,499,2021,4744,6312,5617,1.6197,1.6342,20.5243,272,0.290285,113317.806384
6,2700,1.5,0.6,378,268,898,378,388,388,2021,4338,5842,5147,1.4731,1.4698,12.0786,159,0.303459,113992.277959
7,2700,1.9,0.5,337,145,1180,337,139,558,2021,5366,7251,6556,1.6519,1.6679,16.9734,253,0.259055,114543.419396
8,2650,1.7,0.5,410,207,944,410,203,474,2021,4621,6459,5764,1.6642,1.6843,8.9762,107,0.292835,114571.094927
9,2650,1.9,0.5,384,147,1084,384,311,337,2021,4756,6488,5793,1.6068,1.6192,11.9508,148,0.285513,114747.774859



Percentage difference between min and max RSS value:  10.61

Mean column values of top_hits_frame:


init_pop          2747.500000
init_hps             1.660000
sh_threshold         0.590000
i_fcalves          366.850000
i_yhinds           216.150000
i_ahinds          1028.300000
i_mcalves          366.850000
i_ystags           260.200000
i_astags           506.650000
obs_year          2021.000000
spring_pop        4843.250000
tot_pop_bh        6549.900000
tot_pop_ah        5854.900000
hps_bh               1.488855
hps_ah               1.489145
mig%                11.270805
num_migs           158.900000
scaling              0.278551
RSS             113554.374973
dtype: float64



Municipality =  Surnadal


Unnamed: 0,init_pop,init_hps,sh_threshold,i_fcalves,i_yhinds,i_ahinds,i_mcalves,i_ystags,i_astags,obs_year,spring_pop,tot_pop_bh,tot_pop_ah,hps_bh,hps_ah,mig%,num_migs,scaling,RSS
0,2000,1.5,0.5,350,210,569,350,155,363,2021,2049,2897,2407,1.6501,1.8954,0.0,0,0.419195,50531.294845
1,1900,1.8,0.5,294,151,691,294,177,290,2021,1881,2679,2189,1.6355,1.9035,0.2436,1,0.44117,54008.423956
2,1800,1.8,0.5,270,153,656,270,188,261,2021,1819,2598,2108,1.6758,1.977,0.0,0,0.452762,55770.888088
3,1850,1.9,0.5,296,222,601,296,104,329,2021,1943,2764,2274,1.6392,1.8998,2.8532,13,0.434136,56193.970631
4,1900,1.6,0.5,275,107,722,275,176,342,2021,1810,2581,2091,1.6359,1.9183,0.0,0,0.447952,56412.630876
5,2000,1.3,0.5,310,202,577,310,311,288,2021,1834,2566,2076,1.6026,1.867,3.9711,17,0.452696,56944.63005
6,1900,1.7,0.5,323,205,584,323,167,297,2021,1882,2699,2209,1.7144,2.0216,0.0,0,0.430673,57042.110937
7,1950,1.4,1.0,282,137,670,282,230,346,2021,1944,2701,2211,1.2576,1.3645,0.0,0,0.425907,57503.340303
8,1800,1.7,0.5,315,147,589,315,129,303,2021,1791,2493,2003,1.6505,1.9525,7.3744,33,0.460246,57744.496664
9,2000,1.5,0.6,290,195,656,290,295,272,2021,1907,2671,2181,1.4898,1.6885,1.2871,6,0.426357,57915.448256



Percentage difference between min and max RSS value:  17.35

Mean column values of top_hits_frame:


init_pop         1910.000000
init_hps            1.555000
sh_threshold        0.545000
i_fcalves         300.100000
i_yhinds          156.350000
i_ahinds          635.000000
i_mcalves         300.100000
i_ystags          179.250000
i_astags          337.000000
obs_year         2021.000000
spring_pop       1861.850000
tot_pop_bh       2629.000000
tot_pop_ah       2139.000000
hps_bh              1.603755
hps_ah              1.865260
mig%                2.104670
num_migs            9.600000
scaling             0.444753
RSS             57294.010078
dtype: float64



Municipality =  Sunndal


Unnamed: 0,init_pop,init_hps,sh_threshold,i_fcalves,i_yhinds,i_ahinds,i_mcalves,i_ystags,i_astags,obs_year,spring_pop,tot_pop_bh,tot_pop_ah,hps_bh,hps_ah,mig%,num_migs,scaling,RSS
0,2550,1.3,0.5,433,209,741,433,263,468,2021,4243,5804,5260,1.6984,1.7625,13.8663,159,0.210312,68595.251478
1,2550,1.3,0.5,382,242,766,382,419,357,2021,4109,5547,5003,1.6441,1.7018,13.5105,149,0.213133,68727.571835
2,2400,1.4,0.5,360,186,793,360,307,392,2021,4334,5780,5236,1.6555,1.7123,17.6445,210,0.206541,68955.37816
3,2500,1.3,0.5,375,168,820,375,304,456,2021,3722,5122,4578,1.6905,1.7622,8.8618,85,0.228667,68975.324751
4,2600,1.5,0.5,455,233,780,455,283,392,2021,4416,6150,5606,1.6756,1.7321,6.5566,74,0.199214,69086.785669
5,2200,1.3,0.5,374,98,722,374,113,517,2021,3636,4930,4386,1.6001,1.6583,11.9092,110,0.242102,69107.087868
6,2550,1.3,0.5,395,238,755,395,397,367,2021,3943,5393,4849,1.6956,1.7648,13.6146,147,0.217195,69640.260122
7,2450,1.3,0.5,428,225,675,428,276,415,2021,4217,5625,5081,1.5946,1.6436,15.1872,174,0.205658,70317.452811
8,2450,1.3,0.5,404,157,770,404,314,399,2021,4170,5575,5031,1.6348,1.6908,17.2716,197,0.21082,70374.019695
9,2550,1.3,0.5,446,206,730,446,172,547,2021,4236,5598,5054,1.6447,1.7019,19.953,240,0.209267,70728.474766



Percentage difference between min and max RSS value:  5.99

Mean column values of top_hits_frame:


init_pop         2447.500000
init_hps            1.360000
sh_threshold        0.520000
i_fcalves         397.350000
i_yhinds          198.250000
i_ahinds          751.800000
i_mcalves         397.350000
i_ystags          247.100000
i_astags          453.250000
obs_year         2021.000000
spring_pop       4183.400000
tot_pop_bh       5676.450000
tot_pop_ah       5132.450000
hps_bh              1.612420
hps_ah              1.665495
mig%               12.354965
num_migs          142.150000
scaling             0.211914
RSS             70662.358789
dtype: float64



Municipality =  Vestnes


Unnamed: 0,init_pop,init_hps,sh_threshold,i_fcalves,i_yhinds,i_ahinds,i_mcalves,i_ystags,i_astags,obs_year,spring_pop,tot_pop_bh,tot_pop_ah,hps_bh,hps_ah,mig%,num_migs,scaling,RSS
0,1950,2.0,0.5,243,146,828,243,126,360,2021,2071,2891,2273,1.6749,1.8101,7.592,41,0.384352,105358.440507
1,1900,1.9,1.0,190,179,816,190,136,387,2021,2087,2808,2190,1.0012,0.9454,0.0,0,0.380001,106567.095077
2,1800,2.0,1.0,180,191,768,180,163,316,2021,1899,2598,1980,1.167,1.1424,0.0,0,0.408037,106727.702211
3,1900,2.0,0.7,266,136,775,266,191,264,2021,2167,2957,2339,1.3265,1.3438,3.5876,24,0.375428,107689.000446
4,1750,2.0,0.7,183,239,682,183,184,276,2021,1932,2640,2022,1.2878,1.2963,4.7086,29,0.414139,108071.483871
5,1950,1.8,0.8,273,126,776,273,160,340,2021,2194,3016,2398,1.1992,1.1858,0.0,0,0.363246,108414.442144
6,1850,1.9,0.8,203,85,860,203,99,398,2021,2117,2905,2287,1.2651,1.2668,0.0,0,0.384244,108462.251519
7,2000,1.5,0.9,200,153,806,200,217,422,2021,2309,3099,2481,1.1205,1.0918,2.8116,24,0.342205,108628.805823
8,2150,1.6,0.9,344,107,791,344,213,348,2021,2331,3149,2531,1.157,1.136,0.0,0,0.348628,108701.656086
9,1800,1.9,0.7,189,83,847,189,225,264,2021,2051,2814,2196,1.3596,1.3875,2.53,16,0.386897,108847.77519



Percentage difference between min and max RSS value:  5.03

Mean column values of top_hits_frame:


init_pop          1935.000000
init_hps             1.780000
sh_threshold         0.755000
i_fcalves          250.750000
i_yhinds           148.550000
i_ahinds           764.700000
i_mcalves          250.750000
i_ystags           167.050000
i_astags           350.700000
obs_year          2021.000000
spring_pop        2101.800000
tot_pop_bh        2880.350000
tot_pop_ah        2262.350000
hps_bh               1.302870
hps_ah               1.318795
mig%                 2.082735
num_migs            13.350000
scaling              0.380168
RSS             108780.639809
dtype: float64



Municipality =  Laerdal


Unnamed: 0,init_pop,init_hps,sh_threshold,i_fcalves,i_yhinds,i_ahinds,i_mcalves,i_ystags,i_astags,obs_year,spring_pop,tot_pop_bh,tot_pop_ah,hps_bh,hps_ah,mig%,num_migs,scaling,RSS
0,1700,1.4,0.5,289,137,517,289,84,383,2021,1709,2317,1785,1.6629,1.7348,13.7975,65,0.381135,152839.427265
1,1750,1.3,0.5,262,69,623,262,244,287,2021,1669,2309,1777,1.6408,1.7067,10.0526,44,0.365117,157190.086235
2,1900,1.4,0.6,313,148,594,313,201,328,2021,2005,2768,2236,1.4536,1.462,6.0641,35,0.327453,157499.452555
3,1900,1.3,0.5,332,188,509,332,204,332,2021,1859,2589,2057,1.6646,1.7285,9.1287,44,0.349807,157912.017559
4,1950,1.4,0.6,282,185,621,282,196,380,2021,1962,2711,2179,1.5022,1.5219,8.8083,52,0.32789,159133.239479
5,1900,1.3,0.6,313,187,532,313,132,420,2021,1950,2661,2129,1.4976,1.5164,10.1314,60,0.340262,159276.376068
6,1900,1.4,0.5,332,144,576,332,267,247,2021,1741,2359,1827,1.6123,1.6674,15.6265,75,0.369647,159569.431491
7,1850,1.4,0.5,286,96,647,286,148,382,2021,1889,2516,1984,1.6696,1.7368,21.3052,116,0.343344,159700.121781
8,1850,1.4,0.5,268,183,582,268,207,339,2021,1990,2750,2218,1.6955,1.7634,12.767,70,0.343476,159724.906329
9,2000,1.3,0.5,330,189,568,330,256,326,2021,1884,2611,2079,1.6944,1.7665,11.9375,61,0.345871,161215.610946



Percentage difference between min and max RSS value:  7.01

Mean column values of top_hits_frame:


init_pop          1885.000000
init_hps             1.355000
sh_threshold         0.520000
i_fcalves          306.100000
i_yhinds           147.250000
i_ahinds           583.400000
i_mcalves          306.100000
i_ystags           201.200000
i_astags           338.250000
obs_year          2021.000000
spring_pop        1895.800000
tot_pop_bh        2594.250000
tot_pop_ah        2062.250000
hps_bh               1.629910
hps_ah               1.685410
mig%                12.824305
num_migs            67.600000
scaling              0.345848
RSS             160603.822525
dtype: float64



CPU times: user 2h 59min 57s, sys: 9min 9s, total: 3h 9min 6s
Wall time: 3h 9min 9s
