In [211]:
import pandas as pd
import os
import json

# Load the property labels. Context managers close the files deterministically
# and the explicit encoding keeps non-ASCII labels intact on every platform.
with open(os.path.join("..", "data", "pid_labels_persons.json"), encoding="utf-8") as label_file:
    pid_labels = json.load(label_file)
with open(os.path.join("..", "data", "pid_labels.json"), encoding="utf-8") as label_file:
    reference_labels = json.load(label_file)

# Entries from pid_labels.json are merged last, so they win on key collisions.
pid_labels = {**pid_labels, **reference_labels}
pid_labels["P887"] = "based on heuristic"
# Append the property id to every label, e.g. "based on heuristic (P887)".
pid_labels = {key: f"{value} ({key})" for key, value in pid_labels.items()}


# Properties whose label contains "ID", "ISNI" or (case-insensitively)
# "category"; they are excluded from the popularity ranking further down.
filter_list = {
    key
    for key, label in pid_labels.items()
    if "ID" in label or "ISNI" in label or "category" in label.lower()
}

# CAUTION: unpickling executes arbitrary code; only load pickle files that were
# produced by a trusted run of this project.
df_pid_counter = pd.read_pickle(os.path.join("..", "data", "person_statements", "pid_counter.pkl"))
df_claim_counter = pd.read_pickle(os.path.join("..", "data", "person_statements", "claim_counter.pkl"))
df_claim_with_ref_counter = pd.read_pickle(os.path.join("..", "data", "person_statements", "claim_with_ref_counter.pkl"))
df_num_refs = pd.read_pickle(os.path.join("..", "data", "person_statements", "num_refs.pkl"))
df_num_refs_detailed = pd.read_pickle(os.path.join("..", "data", "person_statements", "num_refs_detailed.pkl"))
df_num_refs_with_url_detailed = pd.read_pickle(os.path.join("..", "data", "person_statements", "num_refs_with_url_detailed.pkl"))
df_num_refs_per_pid = pd.read_pickle(os.path.join("..", "data", "person_statements", "num_refs_per_pid.pkl"))

def remove_wrong_decades(df, max_year=2020):
    """Return the rows of ``df`` whose decade label is not later than ``max_year``.

    The decade is read from the ``'decade'`` column. When ``df['decade']`` is
    itself a frame (two-level columns), the ``('decade', None)`` column is used
    instead. The first four-digit number in each label (e.g. ``"1990s"``) is
    taken as the year.

    Args:
        df: Frame with a ``'decade'`` (or ``('decade', None)``) column of strings.
        max_year: Largest year that is kept (inclusive). Defaults to 2020.

    Returns:
        A filtered frame. Rows whose decade label contains no four-digit year
        are dropped as well, instead of aborting the whole filter.
    """
    decade = df['decade']
    if isinstance(decade, pd.DataFrame):
        # Two-level columns: plain 'decade' selects a sub-frame, which has no
        # .str accessor, so address the column by its full key.
        decade = df[('decade', None)]
    # errors="coerce" maps unparsable labels to NaN; NaN <= max_year is False,
    # which removes those rows.
    years = pd.to_numeric(decade.str.extract(r'(\d{4})', expand=False), errors='coerce')
    return df[years <= max_year]
# Run the decade filter over every loaded counter table and rebind each name
# to its filtered frame.
(
    df_pid_counter,
    df_claim_counter,
    df_claim_with_ref_counter,
    df_num_refs,
    df_num_refs_detailed,
    df_num_refs_with_url_detailed,
    df_num_refs_per_pid,
) = (
    remove_wrong_decades(frame)
    for frame in (
        df_pid_counter,
        df_claim_counter,
        df_claim_with_ref_counter,
        df_num_refs,
        df_num_refs_detailed,
        df_num_refs_with_url_detailed,
        df_num_refs_per_pid,
    )
)




In [212]:

num_elements_to_consider = 50
# Size of the popularity-ranked candidate pool from which the
# "more than one reference" ranking below is drawn.
num_candidate_pids = 250

# Rank the labelled, non-filtered properties by their summed pid counter.
p_cols = df_pid_counter.columns[df_pid_counter.columns.str.startswith('P')]
most_popular_pids = df_pid_counter[p_cols].sum(axis=0)
most_popular_pids = most_popular_pids[most_popular_pids.index.isin(pid_labels)]
most_popular_pids = most_popular_pids[~most_popular_pids.index.isin(filter_list)]
most_popular_pids_filter = most_popular_pids.nlargest(num_candidate_pids).index.tolist()
most_popular_pids = most_popular_pids.nlargest(num_elements_to_consider).index.tolist()


# Step 1: keep the columns whose second column level (presumably the number of
# references per claim - confirm against the producer of the pickle) is > 1.
filtered = df_num_refs_detailed.loc[:, df_num_refs_detailed.columns.get_level_values(1) > 1]
# Step 2: collapse the remaining columns per first-level key (the PID) by
# summing them. The transpose round trip replaces the deprecated
# DataFrame.groupby(axis=1).
result = filtered.T.groupby(level=0).sum().T

# Total number of claims per PID over all rows.
df_claim_counter_summed = df_claim_counter.loc[:, df_claim_counter.columns.str.startswith('P')].sum()
p_cols = result.columns[result.columns.str.startswith('P')]
# Cast to float up front so that storing ratios never writes floats into an
# integer Series.
summed = result[p_cols].sum(axis=0).astype(float)
# Turn the counts into a share of all claims of that PID. PIDs that are missing
# from the claim counter keep their raw count, as before.
shared_pids = summed.index.intersection(df_claim_counter_summed.index)
summed.loc[shared_pids] = summed.loc[shared_pids] / df_claim_counter_summed.loc[shared_pids]

most_popular_pids_with_more_than_one_ref = summed
most_popular_pids_with_more_than_one_ref = most_popular_pids_with_more_than_one_ref[most_popular_pids_with_more_than_one_ref.index.isin(pid_labels)]
most_popular_pids_with_more_than_one_ref = most_popular_pids_with_more_than_one_ref[most_popular_pids_with_more_than_one_ref.index.isin(most_popular_pids_filter)]
most_popular_pids_with_more_than_one_ref = most_popular_pids_with_more_than_one_ref.nlargest(num_elements_to_consider).index.tolist()
# Hand-picked reference properties that the reference statistics report on.
most_popular_ref_pids = ["P854", "P4656", "P248", "P3452", "P813", "P887", "P143"]
print("Most popular PIDs: ", [pid_labels.get(pid) for pid in most_popular_pids])
print("Most popular PIDs with more than one ref: ", [pid_labels.get(pid) for pid in most_popular_pids_with_more_than_one_ref])
print("Most popular reference PIDs: ", [pid_labels.get(pid, pid) for pid in most_popular_ref_pids])


  result = filtered.groupby(axis=1, level=0).sum()
  most_popular_ref_pids = df_num_refs_per_pid.groupby(level=1, axis=1).sum()


Most popular PIDs:  ['sex or gender (P21)', 'instance of (P31)', 'date of birth (P569)', 'country of citizenship (P27)', 'occupation (P106)', 'given name (P735)', 'place of birth (P19)', 'family name (P734)', 'languages spoken, written or signed (P1412)', 'sport (P641)', 'educated at (P69)', 'image (P18)', 'date of death (P570)', 'member of sports team (P54)', 'name in native language (P1559)', 'position played on team / speciality (P413)', 'position held (P39)', 'employer (P108)', 'award received (P166)', 'member of political party (P102)', 'place of death (P20)', 'height (P2048)', 'participant in (P1344)', 'native language (P103)', 'work period (start) (P2031)', 'country for sport (P1532)', 'X (Twitter) username (P2002)', 'official website (P856)', 'mass (P2067)', 'Instagram username (P2003)', 'field of work (P101)', 'National Library of Poland Descriptor (P7293)', 'name in kana (P1814)', 'social media followers (P8687)', 'member of (P463)', 'instrument (P1303)', 'religion or worldvi

In [213]:


from IPython.core.display import Markdown


def calculate_all_stats(df_pid_counter, df_claim_counter, df_claim_with_ref_counter, df_num_refs, df_num_refs_per_pid, group_by: list):
    """Aggregate the counter tables by ``group_by`` and display the summary tables.

    Nothing is returned; every table is rendered with ``display``. Besides its
    arguments the function reads the notebook-level names ``pid_labels``,
    ``most_popular_pids``, ``most_popular_pids_with_more_than_one_ref`` and
    ``most_popular_ref_pids``.

    Args:
        df_pid_counter: Frame with a ``person_counter`` column and one column per
            property id (names starting with ``"P"``).
        df_claim_counter: Frame with one column per property id.
        df_claim_with_ref_counter: Frame with one column per property id.
        df_num_refs: Frame with a ``person_counter`` column.
        df_num_refs_per_pid: Frame with two-level columns; level 0 is compared
            with the statement property ids and level 1 with
            ``most_popular_ref_pids``. Its grouping columns are addressed as
            ``(name, None)``.
        group_by: Column names to group by. With more than one entry, extra
            tables averaged per ``income_class`` / first index level are shown.
    """
    # Sum every counter over all rows that share the same group_by values.
    # NOTE(review): label columns that are not part of group_by (the recorded
    # output shows 'decade' and 'gender') are string-concatenated by sum() and
    # end up as noise in the first displayed tables - consider dropping them
    # before grouping once the column dtypes of the pickles are confirmed.
    df_pid_counter = df_pid_counter.groupby(group_by).sum()
    df_claim_counter = df_claim_counter.groupby(group_by).sum()
    df_claim_with_ref_counter = df_claim_with_ref_counter.groupby(group_by).sum()
    df_num_refs = df_num_refs.groupby(group_by).sum()
    df_num_refs_per_pid = df_num_refs_per_pid.groupby([(x, None) for x in group_by]).sum()

    # Turn the per-property counts into a share of the persons in each group.
    p_cols = df_pid_counter.columns[df_pid_counter.columns.str.startswith('P')]
    df_pid_counter[p_cols] = df_pid_counter[p_cols].div(df_pid_counter['person_counter'], axis=0)

    def aggregate_columns(frame, level, how):
        """Aggregate the columns of ``frame`` per column level with ``how`` ('sum'/'mean').

        The transpose round trip replaces the deprecated ``groupby(axis=1)``.
        """
        return getattr(frame.T.groupby(level=level), how)().T

    def create_pid_stats(filter_pids):
        """Return the per-person shares of ``filter_pids``, ordered by the first group."""
        selected = df_pid_counter.loc[:, df_pid_counter.columns.isin(filter_pids)]
        # fillna returns a new frame; the result has to be kept.
        selected = selected.fillna(0)
        column_order = selected.iloc[0].sort_values(ascending=False).index
        return selected[column_order].rename(columns=pid_labels)

    def create_ref_stats(filter_pids, normalize_by):
        """Return the mean share of each popular reference property over ``filter_pids``.

        Every (statement pid, reference pid) column is divided by the
        ``normalize_by`` column of its statement pid. A statement pid is only
        kept if at least one group exceeds ``min_group_total`` in ``normalize_by``.
        """
        min_group_total = 100
        normalized = df_num_refs_per_pid.copy()
        well_populated = set()
        for column in df_num_refs_per_pid.columns:
            pid = column[0]
            if not pid.startswith("P") or pid not in normalize_by.columns:
                continue
            if (normalize_by[pid] > min_group_total).any():
                well_populated.add(pid)
            normalized[column] = normalized[column].div(normalize_by[pid], axis=0)

        statement_pids = normalized.columns.get_level_values(0)
        keep = statement_pids.isin(well_populated) & statement_pids.isin(filter_pids)
        normalized = normalized.loc[:, keep]
        # Average over the statement properties for every reference property.
        normalized = aggregate_columns(normalized, level=1, how="mean")
        normalized = normalized.loc[:, normalized.columns.isin(most_popular_ref_pids)]
        normalized = normalized[normalized.iloc[0].sort_values(ascending=False).index]
        return normalized.rename(columns=pid_labels)

    display(Markdown(f"# Group by {group_by}"))
    display(Markdown("## Calculate percentage of persons with specific relation having references (there might be multiple claims per person)"))
    refs_per_person = df_claim_with_ref_counter.copy()
    p_cols = refs_per_person.columns[refs_per_person.columns.str.startswith('P')]
    refs_per_person[p_cols] = refs_per_person[p_cols].div(df_num_refs['person_counter'], axis=0)
    display(refs_per_person.rename(columns=pid_labels))

    display(Markdown("## Calculate percentage of claims with specific relation having references"))
    refs_per_claim = df_claim_with_ref_counter.copy()
    # Only properties present in both counters can be normalised.
    shared_pids = [
        pid for pid in refs_per_claim.columns
        if pid.startswith("P") and pid in df_claim_counter.columns
    ]
    refs_per_claim[shared_pids] = refs_per_claim[shared_pids].div(df_claim_counter[shared_pids])
    display(refs_per_claim.rename(columns=pid_labels))

    display(Markdown("## Calculate percentage of claims having references"))
    copy_df_claim_with_ref_counter = df_claim_with_ref_counter.copy()
    p_columns = [col for col in copy_df_claim_with_ref_counter.columns if col.startswith('P')]
    # Replace the per-property columns by their row-wise total.
    copy_df_claim_with_ref_counter['P_sum'] = copy_df_claim_with_ref_counter[p_columns].sum(axis=1)
    df_claim_with_ref_summed = copy_df_claim_with_ref_counter.drop(columns=p_columns)

    copy_df_claim_counter = df_claim_counter.copy()
    p_columns = [col for col in copy_df_claim_counter.columns if col.startswith('P')]
    copy_df_claim_counter['P_sum'] = copy_df_claim_counter[p_columns].sum(axis=1)
    df_claim_summed = copy_df_claim_counter.drop(columns=p_columns)

    # After dropping the property columns, 'P_sum' is the only column left to normalise.
    df_normalized_claim_with_ref = df_claim_with_ref_summed.copy()
    df_normalized_claim_with_ref['P_sum'] = df_claim_with_ref_summed['P_sum'].div(df_claim_summed['P_sum'])
    display(df_normalized_claim_with_ref)

    display(Markdown("## Calculate percentage of claims having a specific reference type"))
    # Restrict to the popular reference properties before aggregating so that
    # no label columns take part in the transpose.
    reference_pids = df_num_refs_per_pid.columns.get_level_values(1)
    popular_ref_counts = df_num_refs_per_pid.loc[:, reference_pids.isin(most_popular_ref_pids)]
    df_summed = aggregate_columns(popular_ref_counts, level=1, how="sum")
    # Normalise by the total number of claims of the corresponding group.
    df_summed = df_summed.div(copy_df_claim_counter["P_sum"], axis=0)
    df_summed = df_summed.rename(columns=pid_labels)
    display(df_summed)

    display(Markdown("## Calculate the average occurrence of each relation"))
    display(Markdown("### For each decade and the top-50 most popular relations (if multiple claims occur, it is still counted as one - replacing with df_claim_counter would consider this)"))
    df_top_pop = create_pid_stats(most_popular_pids)
    display(df_top_pop)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 most popular relations"))
        df_top_pop_averaged = df_top_pop.groupby(["income_class"]).mean()
        display(df_top_pop_averaged)
    display(Markdown("### For each decade and the top-50 relations that are equipped with the most references (not just counting the claims with references but the ratio of claims with references to all claims)"))
    df_top_ref_pop = create_pid_stats(most_popular_pids_with_more_than_one_ref)
    display(df_top_ref_pop)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 relations that are equipped with the most references"))
        df_top_ref_pop_averaged = df_top_ref_pop.groupby(["income_class"]).mean()
        display(df_top_ref_pop_averaged)

    # The reference statistics below are only shown for most_popular_pids; call
    # create_ref_stats with most_popular_pids_with_more_than_one_ref to obtain
    # the same tables for the most-referenced relations.
    display(Markdown("## Calculate average use of references"))
    display(Markdown("### For each decade and the top-50 most popular relations - normalized by claims"))
    df_ref_stats_top_pop_claim = create_ref_stats(most_popular_pids, df_claim_counter)
    display(df_ref_stats_top_pop_claim)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 most popular relations - normalized by claims"))
        df_ref_stats_top_pop_claim_averaged = df_ref_stats_top_pop_claim.groupby(level=0).mean()
        display(df_ref_stats_top_pop_claim_averaged)
    display(Markdown("### For each decade and the top-50 relations - normalized by claims with references"))
    df_ref_stats_top_pop_claim_ref = create_ref_stats(most_popular_pids, df_claim_with_ref_counter)
    display(df_ref_stats_top_pop_claim_ref)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 relations - normalized by claims with references"))
        df_ref_stats_top_pop_claim_ref_averaged = df_ref_stats_top_pop_claim_ref.groupby(level=0).mean()
        display(df_ref_stats_top_pop_claim_ref_averaged)





In [214]:
# Render all statistics with income class as the only grouping key.
calculate_all_stats(
    df_pid_counter,
    df_claim_counter,
    df_claim_with_ref_counter,
    df_num_refs,
    df_num_refs_per_pid,
    group_by=['income_class'],
)

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,imported from Wikimedia project (P143),retrieved (P813),stated in (P248),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.497937,0.35528,0.285389,0.152233,0.10622,0.094414,0.022598
low_income,0.481928,0.279426,0.218016,0.195093,0.150689,0.102924,0.020605
lower-middle_income,0.521353,0.263023,0.19164,0.180917,0.16346,0.098937,0.014866
upper-middle_income,0.537009,0.274637,0.235423,0.152988,0.162519,0.110497,0.016034


# Group by ['income_class']

## Calculate percentage of persons with specific relation having references (there might be multiple claims per person)

Unnamed: 0_level_0,person_counter,decade,gender,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1847883,1930s1930s1920s1980s1990s1970s1950s1980s1960s1...,malefemalemalemalemalemalemalefemalemalemalefe...,0.214847,0.044028,0.269457,0.469116,0.528624,0.072912,0.221348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
low_income,21533,1980s1960s1970s2000s1940s1950s1990s1970s1980s1...,malemalemalefemalemalemalemalefemalefemalefema...,0.085311,0.090652,0.185715,0.432545,0.488924,0.067339,0.219709,...,0.0,0.0,0.001347,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05
lower-middle_income,128003,1940s1960s1930s2000s1970s1980s1980s1940s2000s1...,malemalemalemalemalemalefemalefemalefemalemale...,0.092771,0.100037,0.234502,0.434857,0.515183,0.071795,0.153379,...,8e-06,8e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
upper-middle_income,408665,2000s1940s1960s1950s1970s1970s1940s1980s1990s1...,femalemalemalemalefemalemalefemalemalemalemale...,0.066404,0.223366,0.160151,0.381868,0.545158,0.060113,0.079219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate percentage of claims with specific relation having references

Unnamed: 0_level_0,person_counter,decade,gender,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1847883,1930s1930s1920s1980s1990s1970s1950s1980s1960s1...,malefemalemalemalemalemalemalefemalemalemalefe...,0.555894,0.513562,0.571728,0.46899,0.77424,0.746814,0.749961,...,0.0,0.0,,,0.0,,0.0,,,0.0
low_income,21533,1980s1960s1970s2000s1940s1950s1990s1970s1980s1...,malemalemalefemalemalemalemalefemalefemalefema...,0.522172,0.69914,0.511446,0.432345,0.731213,0.682353,0.660201,...,0.0,,0.508772,1.0,0.5,1.0,1.0,1.0,1.0,1.0
lower-middle_income,128003,1940s1960s1930s2000s1970s1980s1980s1940s2000s1...,malemalemalemalemalemalefemalefemalefemalemale...,0.521085,0.640154,0.546151,0.434752,0.759656,0.666763,0.483179,...,0.003831,0.5,,,0.0,,,,,
upper-middle_income,408665,2000s1940s1960s1950s1970s1970s1940s1980s1990s1...,femalemalemalemalefemalemalefemalemalemalemale...,0.490156,0.886655,0.568238,0.381806,0.787372,0.670524,0.510639,...,0.0,0.0,,,0.0,,,,,


## Calculate percentage of claims having references

Unnamed: 0_level_0,person_counter,decade,gender,P_sum
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high_income,1847883,1930s1930s1920s1980s1990s1970s1950s1980s1960s1...,malefemalemalemalemalemalemalefemalemalemalefe...,0.476889
low_income,21533,1980s1960s1970s2000s1940s1950s1990s1970s1980s1...,malemalemalefemalemalemalemalefemalefemalefema...,0.437844
lower-middle_income,128003,1940s1960s1930s2000s1970s1980s1980s1940s2000s1...,malemalemalemalemalemalefemalefemalefemalemale...,0.424836
upper-middle_income,408665,2000s1940s1960s1950s1970s1970s1940s1980s1990s1...,femalemalemalemalefemalemalefemalemalemalemale...,0.479136


## Calculate percentage of claims having a specific reference type

  df_summed = copy_df_claim_with_ref_counter.groupby(level=1, axis=1).sum()


Unnamed: 0_level_0,imported from Wikimedia project (P143),stated in (P248),inferred from (P3452),Wikimedia import URL (P4656),retrieved (P813),reference URL (P854),based on heuristic (P887)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.191152,0.18763,0.005652,0.02596,0.173303,0.049607,0.04771
low_income,0.245043,0.091955,0.004496,0.037605,0.097466,0.059252,0.032656
lower-middle_income,0.228914,0.103729,0.004469,0.042983,0.094487,0.050706,0.030452
upper-middle_income,0.226505,0.151344,0.003579,0.039314,0.113349,0.069402,0.039874


## Calculate the average occurrence of each relation

### For each decade and the top-50 most popular relations (if multiple claims occur, it is still counted as one - replacing with df_claim_counter would consider this)

Unnamed: 0_level_0,sex or gender (P21),instance of (P31),date of birth (P569),country of citizenship (P27),occupation (P106),given name (P735),family name (P734),place of birth (P19),"languages spoken, written or signed (P1412)",educated at (P69),...,Europeana entity (P7704),spouse (P26),genre (P136),writing language (P6886),father (P22),residence (P551),on focus list of Wikimedia project (P5008),academic degree (P512),religion or worldview (P140),candidacy in election (P3602)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,1.0,1.0,1.0,0.962922,0.867333,0.688481,0.677751,0.570859,0.317,...,0.041228,0.038683,0.037086,0.037035,0.035915,0.035405,0.029856,0.029434,0.017417,0.015587
low_income,1.0,1.0,1.0,1.0,0.97887,0.555845,0.26067,0.661078,0.443877,0.226211,...,0.020062,0.019876,0.016579,0.007802,0.016022,0.041146,0.032276,0.00743,0.082153,0.008499
lower-middle_income,1.0,1.0,1.0,1.0,0.974024,0.496316,0.27058,0.668711,0.425373,0.276462,...,0.036687,0.036687,0.01989,0.028843,0.027413,0.043374,0.016882,0.016804,0.081451,0.014093
upper-middle_income,1.0,1.0,1.0,1.0,0.942361,0.532598,0.324549,0.687911,0.376015,0.210542,...,0.019593,0.022126,0.026432,0.006783,0.017491,0.046432,0.042835,0.025432,0.15442,0.134002


### For each decade and the top-50 relations that are equipped with the most references (not just counting the claims with references but the ratio of claims with references to all claims)

Unnamed: 0_level_0,sex or gender (P21),date of birth (P569),date of death (P570),position held (P39),height (P2048),X (Twitter) username (P2002),country for sport (P1532),member of political party (P102),mass (P2067),Instagram username (P2003),...,significant person (P3342),political ideology (P1142),Wikimedia username (P4174),subject has role (P2868),contributed to creative work (P3919),lifestyle (P1576),academic appointment (P8413),killed by (P157),date of disappearance (P746),NIP (P11429)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,1.0,0.171577,0.121845,0.093259,0.075807,0.075632,0.073735,0.067388,0.056189,...,0.000693,0.000621,0.000586,0.00054,0.000513,0.000341,0.000333,0.00027,0.000238,5.411598e-07
low_income,1.0,1.0,0.161148,0.19347,0.118284,0.044443,0.083686,0.122324,0.061951,0.037152,...,0.000511,0.001161,0.000372,0.000697,9.3e-05,0.000139,0.0,0.000789,0.000882,0.0
lower-middle_income,1.0,1.0,0.187144,0.177066,0.083154,0.055428,0.084053,0.141708,0.043976,0.061772,...,0.000586,0.00125,0.000656,0.000234,0.000125,0.000117,0.000102,0.001742,0.000211,0.0
upper-middle_income,1.0,1.0,0.140585,0.095193,0.088158,0.043236,0.092176,0.238202,0.054135,0.062526,...,0.000279,0.000651,0.000303,0.000223,9.1e-05,0.000139,7.1e-05,0.000947,0.000837,0.02893323


## Calculate average use of references

### For each decade and the top-50 most popular relations - normalized by claims

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,imported from Wikimedia project (P143),retrieved (P813),stated in (P248),reference URL (P854),based on heuristic (P887),Wikimedia import URL (P4656),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.280972,0.193196,0.143032,0.080586,0.060451,0.058861,0.00628
low_income,0.255829,0.128389,0.084501,0.086667,0.054466,0.074755,0.01334
lower-middle_income,0.276739,0.121419,0.071812,0.079959,0.054941,0.080152,0.007011
upper-middle_income,0.285113,0.139219,0.122592,0.073045,0.065198,0.084068,0.003671


### For each decade and the top-50 relations - normalized by claims with references

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,imported from Wikimedia project (P143),retrieved (P813),stated in (P248),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.497937,0.35528,0.285389,0.152233,0.10622,0.094414,0.022598
low_income,0.481928,0.279426,0.218016,0.195093,0.150689,0.102924,0.020605
lower-middle_income,0.521353,0.263023,0.19164,0.180917,0.16346,0.098937,0.014866
upper-middle_income,0.537009,0.274637,0.235423,0.152988,0.162519,0.110497,0.016034


In [215]:
#calculate_all_stats(df_pid_counter, df_claim_counter, df_claim_with_ref_counter, df_num_refs, df_num_refs_per_pid, group_by=['income_class', 'decade'])

In [216]:
#calculate_all_stats(df_pid_counter, df_claim_counter, df_claim_with_ref_counter, df_num_refs, df_num_refs_per_pid, group_by=['income_class', 'gender'])