In [13]:
import pandas as pd
import os
import json

pid_labels = json.load(open(os.path.join("..", "data", "pid_labels_persons.json")))
reference_labels = json.load(open(os.path.join("..", "data", "pid_labels.json")))
pid_labels = {**pid_labels, **reference_labels}
pid_labels["P887"] = "based on heuristic"
pid_labels = {key: f"{value} ({key})" for key, value in pid_labels.items()}


filter_list = {key for key, label in pid_labels.items() if "ID" in label or "ISNI" in label or "category" in label.lower()}

df_pid_counter = pd.read_pickle(os.path.join("..", "data", "pid_counter.pkl"))
df_claim_counter = pd.read_pickle(os.path.join("..", "data", "claim_counter.pkl"))
df_claim_with_ref_counter = pd.read_pickle(os.path.join("..", "data", "claim_with_ref_counter.pkl"))
df_num_refs = pd.read_pickle(os.path.join("..", "data", "num_refs.pkl"))
df_num_refs_detailed = pd.read_pickle(os.path.join("..", "data", "num_refs_detailed.pkl"))
df_num_refs_with_url_detailed = pd.read_pickle(os.path.join("..", "data", "num_refs_with_url_detailed.pkl"))
df_num_refs_per_pid = pd.read_pickle(os.path.join("..", "data", "num_refs_per_pid.pkl"))

def remove_wrong_decades(df):
    try:
        return df[df['decade'].str.extract(r'(\d{4})').astype(int)[0] <= 2020]
    except AttributeError:
        return df[df[('decade', None)].str.extract(r'(\d{4})').astype(int)[0] <= 2020]
df_pid_counter = remove_wrong_decades(df_pid_counter)
df_claim_counter = remove_wrong_decades(df_claim_counter)
df_claim_with_ref_counter = remove_wrong_decades(df_claim_with_ref_counter)
df_num_refs = remove_wrong_decades(df_num_refs)
df_num_refs_detailed = remove_wrong_decades(df_num_refs_detailed)
df_num_refs_with_url_detailed = remove_wrong_decades(df_num_refs_with_url_detailed)
df_num_refs_per_pid = remove_wrong_decades(df_num_refs_per_pid)


In [14]:

num_elements_to_consider = 50
p_cols = df_pid_counter.columns[df_pid_counter.columns.str.startswith('P')]
most_popular_pids = df_pid_counter[p_cols].sum(axis=0).nlargest(num_elements_to_consider).index.tolist()
most_popular_pids = [pid for pid in most_popular_pids if pid in pid_labels]
most_popular_pids = [pid for pid in most_popular_pids if pid not in filter_list]

# Step 1: Select columns where COUNT > 1 (second level of column MultiIndex)
filtered = df_num_refs_detailed.loc[:, df_num_refs_detailed.columns.get_level_values(1) > 1]
# Step 2: Sum across rows (axis=0) for each PID — group by first level of column MultiIndex
result = filtered.groupby(axis=1, level=0).sum()

df_claim_counter_summed = df_claim_counter.loc[:, df_claim_counter.columns.str.startswith('P')]
df_claim_counter_summed = df_claim_counter_summed.sum()
p_cols = result.columns[result.columns.str.startswith('P')]
summed  = result[p_cols].sum(axis=0)
for pid in summed.index:
    if not pid.startswith("P"):
        continue
    if pid in df_claim_counter_summed.index:
        summed[pid] = summed[pid] / df_claim_counter_summed.loc[pid]
most_popular_pids_with_more_than_one_ref = summed.nlargest(num_elements_to_consider).index.tolist()
most_popular_pids_with_more_than_one_ref = [pid for pid in most_popular_pids_with_more_than_one_ref if pid in pid_labels]
most_popular_pids_with_more_than_one_ref = [pid for pid in most_popular_pids_with_more_than_one_ref if pid not in filter_list]
most_popular_ref_pids = df_num_refs_per_pid.groupby(level=1, axis=1).sum()
most_popular_ref_pids = most_popular_ref_pids.sum(axis=0).nlargest(50).index.tolist()
most_popular_ref_pids = ["P854", "P4656", "P248", "P3452", "P813", "P887"]
print("Most popular PIDs: ", [pid_labels.get(pid) for pid in most_popular_pids])
print("Most popular PIDs with more than one ref: ", [pid_labels.get(pid) for pid in most_popular_pids_with_more_than_one_ref])
print("Most popular reference PIDs: ", [pid_labels.get(pid, pid) for pid in most_popular_ref_pids])


  result = filtered.groupby(axis=1, level=0).sum()
  most_popular_ref_pids = df_num_refs_per_pid.groupby(level=1, axis=1).sum()


Most popular PIDs:  ['sex or gender (P21)', 'instance of (P31)', 'date of birth (P569)', 'country of citizenship (P27)', 'occupation (P106)', 'given name (P735)', 'place of birth (P19)', 'family name (P734)', 'languages spoken, written or signed (P1412)', 'sport (P641)', 'educated at (P69)', 'image (P18)', 'date of death (P570)', 'member of sports team (P54)', 'name in native language (P1559)', 'position played on team / speciality (P413)', 'position held (P39)', 'employer (P108)', 'award received (P166)', 'member of political party (P102)', 'place of death (P20)', 'height (P2048)', 'participant in (P1344)', 'native language (P103)', 'work period (start) (P2031)', 'country for sport (P1532)', 'X (Twitter) username (P2002)', 'official website (P856)', 'mass (P2067)']
Most popular PIDs with more than one ref:  ['location of discovery (P189)', 'manufacturer (P176)', 'autonomous system number (P3797)', 'e-Rad researcher number (P9776)', 'post office box (P2918)', 'Wikidata item of this pro

In [15]:


from IPython.core.display import Markdown


def calculate_all_stats(df_pid_counter, df_claim_counter , df_claim_with_ref_counter, df_num_refs, df_num_refs_per_pid, group_by: list):
    # Accumulate over genders
    df_pid_counter = df_pid_counter.groupby(group_by).sum()
    df_claim_counter = df_claim_counter.groupby(group_by).sum()
    df_claim_with_ref_counter = df_claim_with_ref_counter.groupby(group_by).sum()
    df_num_refs = df_num_refs.groupby(group_by).sum()
    df_num_refs_per_pid = df_num_refs_per_pid.groupby([(x, None) for x in group_by]).sum()

    p_cols = df_pid_counter.columns[df_pid_counter.columns.str.startswith('P')]
    df_pid_counter[p_cols] = df_pid_counter[p_cols].div(df_pid_counter['person_counter'], axis=0)

    def create_pid_stats(filter_pids):
        # Filter all PID columns out that are not in the most_popular_pids
        df_pid_counter_most_pop = df_pid_counter.loc[:, df_pid_counter.columns.isin(filter_pids)]
        df_pid_counter_most_pop.fillna(0)
        df_pid_counter_most_pop = df_pid_counter_most_pop[df_pid_counter_most_pop.iloc[0].sort_values(ascending=False).index]
        df_pid_counter_most_pop = df_pid_counter_most_pop.rename(columns=pid_labels)
        return df_pid_counter_most_pop

    def create_ref_stats(filter_pids, normalize_by):
        normalized_num_refs_per_pid = df_num_refs_per_pid.copy()
        for pid in df_num_refs_per_pid.columns:
            if not pid[0].startswith("P"):
                continue
            if pid[0] in normalize_by.columns:
                normalized_num_refs_per_pid[pid] = normalized_num_refs_per_pid[pid].div(normalize_by[pid[0]], axis=0)

        normalized_num_refs_per_pid = normalized_num_refs_per_pid.fillna(0)
        normalized_num_refs_per_pid = normalized_num_refs_per_pid.loc[:, normalized_num_refs_per_pid.columns.get_level_values(0).isin(filter_pids)]
        normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()
        normalized_num_refs_per_pid = normalized_num_refs_per_pid.loc[:, normalized_num_refs_per_pid.columns.isin(most_popular_ref_pids)]
        normalized_num_refs_per_pid = normalized_num_refs_per_pid[normalized_num_refs_per_pid.iloc[0].sort_values(ascending=False).index]
        normalized_num_refs_per_pid = normalized_num_refs_per_pid.rename(columns=pid_labels)
        return normalized_num_refs_per_pid

    display(Markdown(f"# Group by {group_by}"))
    display(Markdown("## Calculate percentage of persons with specific relation having references (there might be multiple claims per person)"))
    copy_df_num_refs = df_claim_with_ref_counter.copy()
    p_cols = copy_df_num_refs.columns[copy_df_num_refs.columns.str.startswith('P')]
    copy_df_num_refs[p_cols] = copy_df_num_refs[p_cols].div(df_num_refs['person_counter'], axis=0)
    copy_df_num_refs = copy_df_num_refs.rename(columns=pid_labels)
    display(copy_df_num_refs)

    display(Markdown("## Calculate percentage of claims with specific relation having references"))
    copy_df_num_refs = df_claim_with_ref_counter.copy()
    for pid in copy_df_num_refs.columns:
            if not pid.startswith("P"):
                continue
            if pid in df_claim_counter.columns:
                copy_df_num_refs[pid] = copy_df_num_refs[pid].div(df_claim_counter[pid], axis=0)
    copy_df_num_refs = copy_df_num_refs.rename(columns=pid_labels)
    display(copy_df_num_refs)

    display(Markdown("## Calculate percentage of claims having references"))
    copy_df_claim_with_ref_counter = df_claim_with_ref_counter.copy()
    p_columns = [col for col in copy_df_claim_with_ref_counter.columns if col.startswith('P')]
    # Sum those columns row-wise
    copy_df_claim_with_ref_counter['P_sum'] = copy_df_claim_with_ref_counter[p_columns].sum(axis=1)
    # Drop the original 'P' columns
    df_claim_with_ref_summed = copy_df_claim_with_ref_counter.drop(columns=p_columns)

    copy_df_claim_counter = df_claim_counter.copy()
    p_columns = [col for col in copy_df_claim_counter.columns if col.startswith('P')]
    # Sum those columns row-wise
    copy_df_claim_counter['P_sum'] = copy_df_claim_counter[p_columns].sum(axis=1)
    # Drop the original 'P' columns
    df_claim_summed = copy_df_claim_counter.drop(columns=p_columns)
    df_normalized_claim_with_ref = df_claim_with_ref_summed.copy()
    for pid in df_claim_with_ref_summed.columns:
        if not pid.startswith("P"):
            continue
        if pid in df_claim_summed.columns:
            df_normalized_claim_with_ref[pid] = df_claim_with_ref_summed[pid].div(df_claim_summed[pid], axis=0)

    display(df_normalized_claim_with_ref)


    display(Markdown("## Calculate percentage of claims having a specific reference type"))
    copy_df_claim_with_ref_counter = df_num_refs_per_pid.copy()
    df_summed = copy_df_claim_with_ref_counter.groupby(level=1, axis=1).sum()
    for pid in copy_df_num_refs.columns:
            if not pid.startswith("P"):
                continue
            if pid in df_claim_counter.columns:
                copy_df_num_refs[pid] = copy_df_num_refs[pid].div(df_claim_counter[pid], axis=0)
    df_summed = df_summed.loc[:, df_summed.columns.isin(most_popular_ref_pids)]
    # Normalize by df_claim_summed P_sum value for each corresponding row
    for pid in df_summed.columns:
        if not pid.startswith("P"):
            continue
        df_summed[pid] = df_summed[pid].div(copy_df_claim_counter["P_sum"], axis=0)
    df_summed = df_summed.rename(columns=pid_labels)
    display(df_summed)

    return

    display(Markdown(f"## Calculate the average occurrence of each relation"))
    display(Markdown("### For each decade and the top-50 most popular relations (if multiple claims occur, it is still counted as one - replacing with df_claim_counter would consider this)"))
    df_top_pop = create_pid_stats(most_popular_pids)
    display(df_top_pop)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 most popular relations"))
        df_top_pop_averaged = df_top_pop.groupby(["income_class"]).mean()
        display(df_top_pop_averaged)
    display(Markdown("### For each decade and the top-50 relations that are equipped with the most references (not just counting the claims with references but the ratio of claims with references to all claims)"))
    df_top_ref_pop = create_pid_stats(most_popular_pids_with_more_than_one_ref)
    display(df_top_ref_pop)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 relations that are equipped with the most references"))
        df_top_ref_pop_averaged = df_top_ref_pop.groupby(["income_class"]).mean()
        display(df_top_ref_pop_averaged)
    display(Markdown("## Calculate average use of references"))
    display(Markdown("### For each decade and the top-50 most popular relations - normalized by claims"))
    df_ref_stats_top_pop_claim = create_ref_stats(most_popular_pids, df_claim_counter)
    display(df_ref_stats_top_pop_claim)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 most popular relations - normalized by claims"))
        df_ref_stats_top_pop_claim_averaged = df_ref_stats_top_pop_claim.groupby(level=0).mean()
        display(df_ref_stats_top_pop_claim_averaged)
    display(Markdown("### For each decade and the top-50 relations that are equipped with the most references - normalized by claims"))
    df_ref_stats_top_ref_pop_claim = create_ref_stats(most_popular_pids_with_more_than_one_ref, df_claim_counter)
    display(df_ref_stats_top_ref_pop_claim)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 relations that are equipped with the most references - normalized by claims"))
        df_ref_stats_top_ref_pop_claim_averaged = df_ref_stats_top_ref_pop_claim.groupby(level=0).mean()
        display(df_ref_stats_top_ref_pop_claim_averaged)
    display(Markdown("### For each decade and the top-50 relations - normalized by claims with references"))
    df_ref_stats_top_pop_claim_ref = create_ref_stats(most_popular_pids, df_claim_with_ref_counter)
    display(df_ref_stats_top_pop_claim_ref)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 relations - normalized by claims with references"))
        df_ref_stats_top_pop_claim_ref_averaged = df_ref_stats_top_pop_claim_ref.groupby(level=0).mean()
        display(df_ref_stats_top_pop_claim_ref_averaged)
    display(Markdown("### For each decade and the top-50 relations that are equipped with the most references - normalized by claims with references"))
    df_ref_stats_top_ref_pop_claim_ref = create_ref_stats(most_popular_pids_with_more_than_one_ref, df_claim_with_ref_counter)
    display(df_ref_stats_top_ref_pop_claim_ref)
    if len(group_by) > 1:
        display(Markdown("### Average over all decades for the top-50 relations that are equipped with the most references - normalized by claims with references"))
        df_ref_stats_top_ref_pop_claim_ref_averaged = df_ref_stats_top_ref_pop_claim_ref.groupby(level=0).mean()
        display(df_ref_stats_top_ref_pop_claim_ref_averaged)





In [16]:
calculate_all_stats(df_pid_counter, df_claim_counter, df_claim_with_ref_counter, df_num_refs, df_num_refs_per_pid, group_by=['income_class'])

# Group by ['income_class']

## Calculate percentage of persons with specific relation having references (there might be multiple claims per person)

Unnamed: 0_level_0,person_counter,decade,gender,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1847883,1930s1930s1920s1980s1990s1970s1950s1980s1960s1...,malefemalemalemalemalemalemalefemalemalemalefe...,0.214847,0.044028,0.269457,0.469116,0.528624,0.072912,0.221348,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
low_income,21533,1980s1960s1970s2000s1940s1950s1990s1970s1980s1...,malemalemalefemalemalemalemalefemalefemalefema...,0.085311,0.090652,0.185715,0.432545,0.488924,0.067339,0.219709,...,0.0,0.0,0.001347,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05
lower-middle_income,128003,1940s1960s1930s2000s1970s1980s1980s1940s2000s1...,malemalemalemalemalemalefemalefemalefemalemale...,0.092771,0.100037,0.234502,0.434857,0.515183,0.071795,0.153379,...,8e-06,8e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
upper-middle_income,408665,2000s1940s1960s1950s1970s1970s1940s1980s1990s1...,femalemalemalemalefemalemalefemalemalemalemale...,0.066404,0.223366,0.160151,0.381868,0.545158,0.060113,0.079219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate percentage of claims with specific relation having references

Unnamed: 0_level_0,person_counter,decade,gender,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1847883,1930s1930s1920s1980s1990s1970s1950s1980s1960s1...,malefemalemalemalemalemalemalefemalemalemalefe...,0.555894,0.513562,0.571728,0.46899,0.77424,0.746814,0.749961,...,0.0,0.0,,,0.0,,0.0,,,0.0
low_income,21533,1980s1960s1970s2000s1940s1950s1990s1970s1980s1...,malemalemalefemalemalemalemalefemalefemalefema...,0.522172,0.69914,0.511446,0.432345,0.731213,0.682353,0.660201,...,0.0,,0.508772,1.0,0.5,1.0,1.0,1.0,1.0,1.0
lower-middle_income,128003,1940s1960s1930s2000s1970s1980s1980s1940s2000s1...,malemalemalemalemalemalefemalefemalefemalemale...,0.521085,0.640154,0.546151,0.434752,0.759656,0.666763,0.483179,...,0.003831,0.5,,,0.0,,,,,
upper-middle_income,408665,2000s1940s1960s1950s1970s1970s1940s1980s1990s1...,femalemalemalemalefemalemalefemalemalemalemale...,0.490156,0.886655,0.568238,0.381806,0.787372,0.670524,0.510639,...,0.0,0.0,,,0.0,,,,,


## Calculate percentage of claims having references

Unnamed: 0_level_0,person_counter,decade,gender,P_sum
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high_income,1847883,1930s1930s1920s1980s1990s1970s1950s1980s1960s1...,malefemalemalemalemalemalemalefemalemalemalefe...,0.476889
low_income,21533,1980s1960s1970s2000s1940s1950s1990s1970s1980s1...,malemalemalefemalemalemalemalefemalefemalefema...,0.437844
lower-middle_income,128003,1940s1960s1930s2000s1970s1980s1980s1940s2000s1...,malemalemalemalemalemalefemalefemalefemalemale...,0.424836
upper-middle_income,408665,2000s1940s1960s1950s1970s1970s1940s1980s1990s1...,femalemalemalemalefemalemalefemalemalemalemale...,0.479136


## Calculate percentage of claims having a specific reference type

  df_summed = copy_df_claim_with_ref_counter.groupby(level=1, axis=1).sum()


Unnamed: 0_level_0,imported from Wikimedia project (P143),stated in (P248),inferred from (P3452),Wikimedia import URL (P4656),retrieved (P813),reference URL (P854),based on heuristic (P887)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.191152,0.18763,0.005652,0.02596,0.173303,0.049607,0.04771
low_income,0.245043,0.091955,0.004496,0.037605,0.097466,0.059252,0.032656
lower-middle_income,0.228914,0.103729,0.004469,0.042983,0.094487,0.050706,0.030452
upper-middle_income,0.226505,0.151344,0.003579,0.039314,0.113349,0.069402,0.039874


## Calculate the average occurrence of each relation

### For each decade and the top-50 most popular relations (if multiple claims occur, it is still counted as one - replacing with df_claim_counter would consider this)

Unnamed: 0_level_0,sex or gender (P21),instance of (P31),date of birth (P569),country of citizenship (P27),occupation (P106),given name (P735),family name (P734),place of birth (P19),"languages spoken, written or signed (P1412)",educated at (P69),...,place of death (P20),height (P2048),participant in (P1344),native language (P103),work period (start) (P2031),official website (P856),X (Twitter) username (P2002),country for sport (P1532),member of political party (P102),mass (P2067)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,1.0,1.0,1.0,0.962922,0.867333,0.688481,0.677751,0.570859,0.317,...,0.096602,0.093259,0.081223,0.081216,0.076615,0.076351,0.075807,0.075632,0.073735,0.067388
low_income,1.0,1.0,1.0,1.0,0.97887,0.555845,0.26067,0.661078,0.443877,0.226211,...,0.097107,0.118284,0.157944,0.029536,0.067199,0.024753,0.044443,0.083686,0.122324,0.061951
lower-middle_income,1.0,1.0,1.0,1.0,0.974024,0.496316,0.27058,0.668711,0.425373,0.276462,...,0.105466,0.083154,0.114443,0.162871,0.106333,0.039351,0.055428,0.084053,0.141708,0.043976
upper-middle_income,1.0,1.0,1.0,1.0,0.942361,0.532598,0.324549,0.687911,0.376015,0.210542,...,0.089085,0.088158,0.094236,0.079212,0.100686,0.029454,0.043236,0.092176,0.238202,0.054135


### For each decade and the top-50 relations that are equipped with the most references (not just counting the claims with references but the ratio of claims with references to all claims)

Unnamed: 0_level_0,date of birth (P569),date of death (P570),X (Twitter) username (P2002),copyright representative (P6275),doctoral advisor (P184),candidacy in election (P3602),significant event (P793),sexual orientation (P91),e-Rad researcher number (P9776),affiliation string (P6424),...,talk show guest (P5030),ISBN-10 (P957),post office box (P2918),location of discovery (P189),manufacturer (P176),identifiers.org prefix (P4793),Wikidata item of this property (P1629),highest note (P1897),platform (P400),autonomous system number (P3797)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,0.171577,0.075807,0.021927,0.017928,0.015587,0.00476,0.002288,0.002228,0.001937,...,3e-06,2e-06,2e-06,1e-06,1e-06,1e-06,1e-06,1e-06,5.411598e-07,5.411598e-07
low_income,1.0,0.161148,0.044443,0.001486,0.005898,0.008499,0.006084,0.000604,0.0,0.002786,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,1.0,0.187144,0.055428,0.002031,0.007844,0.014093,0.006406,0.001203,1.6e-05,0.004453,...,1.6e-05,8e-06,8e-06,8e-06,0.0,8e-06,0.0,0.0,7.812317e-06,0.0
upper-middle_income,1.0,0.140585,0.043236,0.003078,0.005493,0.134002,0.003,0.001737,1.7e-05,0.002065,...,0.0,0.0,2e-06,4.6e-05,0.0,5e-06,0.0,2e-06,2.446992e-06,0.0


## Calculate average use of references

### For each decade and the top-50 most popular relations - normalized by claims

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,imported from Wikimedia project (P143),stated in (P248),retrieved (P813),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.330953,0.135966,0.128472,0.073158,0.067518,0.046746,0.003681
low_income,0.314447,0.073087,0.083864,0.090439,0.080215,0.041556,0.00337
lower-middle_income,0.333407,0.066106,0.073688,0.070145,0.088006,0.042083,0.003895
upper-middle_income,0.338372,0.096918,0.082625,0.074866,0.088943,0.044031,0.002533


### For each decade and the top-50 relations that are equipped with the most references - normalized by claims

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,retrieved (P813),stated in (P248),reference URL (P854),imported from Wikimedia project (P143),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.398316,0.375588,0.328076,0.269509,0.177075,0.107831,0.01083
low_income,0.171375,0.242243,0.110251,0.0861,0.01589,0.134734,0.070785
lower-middle_income,0.197849,0.291665,0.160692,0.204552,0.074717,0.085878,0.032592
upper-middle_income,0.300872,0.335724,0.280934,0.173382,0.135051,0.080383,0.015211


### For each decade and the top-50 relations - normalized by claims with references

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,imported from Wikimedia project (P143),stated in (P248),retrieved (P813),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.585438,0.24086,0.236967,0.123169,0.099689,0.092081,0.007734
low_income,0.571616,0.149127,0.198621,0.189065,0.144467,0.104477,0.007739
lower-middle_income,0.610352,0.1355,0.176706,0.156438,0.15905,0.105546,0.008967
upper-middle_income,0.612899,0.17398,0.172979,0.139106,0.144731,0.106749,0.005556


### For each decade and the top-50 relations that are equipped with the most references - normalized by claims with references

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,reference URL (P854),retrieved (P813),stated in (P248),imported from Wikimedia project (P143),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.557503,0.531397,0.478336,0.346698,0.217143,0.168944,0.012466
low_income,0.172194,0.255763,0.33932,0.151834,0.028027,0.282639,0.079084
lower-middle_income,0.230777,0.246901,0.361863,0.290152,0.086002,0.170892,0.038155
upper-middle_income,0.326315,0.360448,0.404399,0.240818,0.166242,0.168568,0.017746


In [17]:
calculate_all_stats(df_pid_counter, df_claim_counter, df_claim_with_ref_counter, df_num_refs, df_num_refs_per_pid, group_by=['income_class', 'decade'])

# Group by ['income_class', 'decade']

## Calculate percentage of persons with specific relation having references (there might be multiple claims per person)

Unnamed: 0_level_0,Unnamed: 1_level_0,person_counter,gender,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),Freebase ID (P646),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,1920s,89113,malefemalenon-binary,0.359274,0.055682,0.277165,0.479436,0.62056,0.372291,0.186785,0.089403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1930s,186759,malefemalenon-binary,0.342725,0.05445,0.296816,0.487211,0.590633,0.26048,0.196949,0.09115,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1940s,251904,malefemalenon-binary,0.31693,0.063072,0.296041,0.514462,0.532445,0.112884,0.342357,0.09774,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1950s,292443,malefemalenon-binary,0.261921,0.063749,0.289243,0.52483,0.48974,0.044901,0.418916,0.091484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1960s,279123,malefemalenon-binary,0.23528,0.053579,0.285218,0.506357,0.513899,0.021908,0.302085,0.099017,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1970s,261290,malefemalenon-binary,0.178288,0.039408,0.271254,0.486375,0.52883,0.010881,0.171916,0.115733,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1980s,240406,malefemalenon-binary,0.090081,0.020686,0.263479,0.479897,0.571254,0.006227,0.061937,0.134984,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1990s,179023,malefemalenon-binary,0.033867,0.006446,0.210839,0.294202,0.519732,0.003374,0.012339,0.045536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,2000s,66537,femalemalenon-binary,0.068473,0.004749,0.105595,0.202293,0.325007,0.003923,0.007635,0.014383,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,2010s,1181,femalemale,0.017782,0.000847,0.024555,0.121931,0.18967,0.016088,0.00254,0.005927,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate percentage of claims with specific relation having references

Unnamed: 0_level_0,Unnamed: 1_level_0,person_counter,gender,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),Freebase ID (P646),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,1920s,89113,malefemalenon-binary,0.619888,0.453897,0.541419,0.479339,0.792877,0.724035,0.627096,0.275847,...,0.0,,,,,,,,,
high_income,1930s,186759,malefemalenon-binary,0.595541,0.465891,0.544545,0.487135,0.782656,0.76049,0.645525,0.279809,...,0.0,,,,,,,,,
high_income,1940s,251904,malefemalenon-binary,0.582791,0.489902,0.541286,0.514343,0.780682,0.757223,0.747167,0.285925,...,0.0,,,,0.0,,,,,
high_income,1950s,292443,malefemalenon-binary,0.554247,0.525318,0.552986,0.524732,0.776552,0.745021,0.793437,0.279956,...,0.0,,,,0.0,,,,,
high_income,1960s,279123,malefemalenon-binary,0.552836,0.54463,0.563881,0.50622,0.771831,0.756901,0.776962,0.296002,...,0.0,,,,,,0.0,,,
high_income,1970s,261290,malefemalenon-binary,0.506194,0.555783,0.576551,0.486241,0.769687,0.73921,0.759092,0.318779,...,0.0,,,,,,,,,
high_income,1980s,240406,malefemalenon-binary,0.448773,0.550232,0.633097,0.479745,0.787604,0.702487,0.7445,0.331973,...,0.0,0.0,,,0.0,,,,,0.0
high_income,1990s,179023,malefemalenon-binary,0.463178,0.551098,0.679674,0.294076,0.76053,0.660832,0.688376,0.214109,...,0.0,,,,0.0,,,,,
high_income,2000s,66537,femalemalenon-binary,0.595556,0.509677,0.540046,0.202193,0.676331,0.597254,0.452763,0.157014,...,,,,,0.0,,,,,
high_income,2010s,1181,femalemale,0.42,0.5,0.644444,0.121724,0.363636,0.612903,0.2,0.411765,...,,,,,,,,,,


## Calculate percentage of claims having references

Unnamed: 0_level_0,Unnamed: 1_level_0,person_counter,gender,P_sum
income_class,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high_income,1920s,89113,malefemalenon-binary,0.450993
high_income,1930s,186759,malefemalenon-binary,0.453774
high_income,1940s,251904,malefemalenon-binary,0.458135
high_income,1950s,292443,malefemalenon-binary,0.506679
high_income,1960s,279123,malefemalenon-binary,0.500727
high_income,1970s,261290,malefemalenon-binary,0.490605
high_income,1980s,240406,malefemalenon-binary,0.493594
high_income,1990s,179023,malefemalenon-binary,0.446644
high_income,2000s,66537,femalemalenon-binary,0.379144
high_income,2010s,1181,femalemale,0.257683


## Calculate percentage of claims having a specific reference type

  df_summed = copy_df_claim_with_ref_counter.groupby(level=1, axis=1).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,imported from Wikimedia project (P143),stated in (P248),inferred from (P3452),Wikimedia import URL (P4656),retrieved (P813),reference URL (P854),based on heuristic (P887)
"(income_class, nan)","(decade, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,1920s,0.173372,0.207169,0.006515,0.021296,0.213283,0.059233,0.012797
high_income,1930s,0.173524,0.210343,0.006772,0.021406,0.214313,0.057617,0.012439
high_income,1940s,0.16811,0.222084,0.006885,0.019885,0.208181,0.053349,0.012004
high_income,1950s,0.168737,0.224251,0.006254,0.020866,0.194407,0.050326,0.060574
high_income,1960s,0.182034,0.203322,0.005918,0.023044,0.18028,0.049091,0.064511
high_income,1970s,0.208605,0.171793,0.005256,0.025955,0.157534,0.045449,0.066484
high_income,1980s,0.259237,0.134561,0.00389,0.031683,0.118914,0.037868,0.064939
high_income,1990s,0.218102,0.118431,0.003615,0.046478,0.105418,0.04362,0.064739
high_income,2000s,0.121639,0.112018,0.004129,0.04185,0.114816,0.067654,0.067414
high_income,2010s,0.051959,0.062906,0.002223,0.020061,0.054404,0.08808,0.06085


## Calculate the average occurrence of each relation

### For each decade and the top-50 most popular relations (if multiple claims occur, it is still counted as one - replacing with df_claim_counter would consider this)

Unnamed: 0_level_0,Unnamed: 1_level_0,sex or gender (P21),instance of (P31),date of birth (P569),country of citizenship (P27),occupation (P106),given name (P735),date of death (P570),place of birth (P19),family name (P734),"languages spoken, written or signed (P1412)",...,native language (P103),member of sports team (P54),work period (start) (P2031),position played on team / speciality (P413),participant in (P1344),country for sport (P1532),official website (P856),height (P2048),mass (P2067),X (Twitter) username (P2002)
income_class,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,1920s,1.0,1.0,1.0,1.0,0.943914,0.888928,0.866596,0.77358,0.687711,0.596097,...,0.075713,0.068946,0.054661,0.048691,0.044831,0.02728,0.019335,0.018516,0.016126,0.001403
high_income,1930s,1.0,1.0,1.0,1.0,0.95582,0.887995,0.614396,0.74752,0.696116,0.600308,...,0.077549,0.076891,0.046134,0.052356,0.055542,0.036373,0.03255,0.031961,0.026714,0.004225
high_income,1940s,1.0,1.0,1.0,1.0,0.959147,0.89816,0.267074,0.676516,0.710306,0.611181,...,0.07776,0.071579,0.051389,0.05021,0.056117,0.036184,0.059562,0.037776,0.032139,0.017003
high_income,1950s,1.0,1.0,1.0,1.0,0.958949,0.894985,0.107737,0.626457,0.711359,0.616404,...,0.078207,0.074654,0.05082,0.054503,0.056561,0.03796,0.080258,0.040398,0.034755,0.042778
high_income,1960s,1.0,1.0,1.0,1.0,0.958986,0.884424,0.050222,0.661547,0.700272,0.604554,...,0.080975,0.104599,0.068027,0.078589,0.075683,0.054431,0.098745,0.065344,0.05302,0.077532
high_income,1970s,1.0,1.0,1.0,1.0,0.96554,0.863179,0.024678,0.682678,0.683069,0.561885,...,0.085579,0.156298,0.096112,0.124903,0.088369,0.075353,0.1131,0.106368,0.081144,0.111256
high_income,1980s,1.0,1.0,1.0,1.0,0.972114,0.841073,0.013893,0.720939,0.66713,0.518348,...,0.091649,0.283874,0.133653,0.252822,0.110051,0.126349,0.103429,0.201659,0.146835,0.150454
high_income,1990s,1.0,1.0,1.0,1.0,0.978081,0.794792,0.007407,0.678427,0.646414,0.478486,...,0.083073,0.373349,0.11503,0.370924,0.141256,0.182787,0.056205,0.226848,0.145484,0.166275
high_income,2000s,1.0,1.0,1.0,1.0,0.976088,0.778349,0.016968,0.477223,0.645596,0.434721,...,0.065227,0.199648,0.051235,0.250462,0.13361,0.181343,0.04109,0.123691,0.037318,0.085216
high_income,2010s,1.0,1.0,1.0,1.0,0.809483,0.785775,0.030483,0.516511,0.757832,0.527519,...,0.15072,0.129551,0.040644,0.00254,0.165961,0.232007,0.028789,0.024555,0.007621,0.047417


### Average over all decades for the top-50 most popular relations

Unnamed: 0_level_0,sex or gender (P21),instance of (P31),date of birth (P569),country of citizenship (P27),occupation (P106),given name (P735),date of death (P570),place of birth (P19),family name (P734),"languages spoken, written or signed (P1412)",...,native language (P103),member of sports team (P54),work period (start) (P2031),position played on team / speciality (P413),participant in (P1344),country for sport (P1532),official website (P856),height (P2048),mass (P2067),X (Twitter) username (P2002)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,1.0,1.0,1.0,0.887871,0.849507,0.185265,0.664672,0.701227,0.542088,...,0.080517,0.139944,0.064337,0.118657,0.084362,0.090006,0.0593,0.080612,0.053706,0.064834
low_income,1.0,1.0,1.0,1.0,0.936677,0.549362,0.260341,0.638979,0.278982,0.422719,...,0.047263,0.130615,0.057464,0.133899,0.127466,0.064061,0.025464,0.084876,0.042357,0.031913
lower-middle_income,1.0,1.0,1.0,1.0,0.876768,0.495519,0.2353,0.662564,0.292963,0.412167,...,0.152777,0.096964,0.084969,0.096136,0.088529,0.073522,0.029974,0.061745,0.029101,0.039472
upper-middle_income,1.0,1.0,1.0,1.0,0.84664,0.541752,0.19872,0.691072,0.346992,0.375908,...,0.085695,0.11328,0.080006,0.121533,0.081038,0.088098,0.021672,0.064111,0.037913,0.033148


### For each decade and the top-50 relations that are equipped with the most references (not just counting the claims with references but the ratio of claims with references to all claims)

Unnamed: 0_level_0,Unnamed: 1_level_0,date of birth (P569),date of death (P570),doctoral advisor (P184),copyright representative (P6275),significant event (P793),candidacy in election (P3602),X (Twitter) username (P2002),Swedish Royal Theater Archive (P7574),e-Rad researcher number (P9776),sexual orientation (P91),...,highest note (P1897),post office box (P2918),Wikidata item of this property (P1629),multi-channel network (P6540),manufacturer (P176),EU VAT number (P3608),location of discovery (P189),motto (P1546),number of deaths (P1120),autonomous system number (P3797)
income_class,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,1920s,1.0,0.866596,0.019952,0.019919,0.009494,0.004646,0.001403,0.001391,0.000931,0.000898,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,1930s,1.0,0.614396,0.023715,0.022923,0.007909,0.006163,0.004225,0.001317,0.001649,0.000835,...,0.0,0.0,0.0,0.0,0.0,0.0,5e-06,5e-06,5e-06,0.0
high_income,1940s,1.0,0.267074,0.024319,0.025827,0.005669,0.011048,0.017003,0.00102,0.003196,0.001151,...,0.0,0.0,0.0,0.0,0.0,4e-06,4e-06,4e-06,4e-06,0.0
high_income,1950s,1.0,0.107737,0.023075,0.029103,0.004705,0.018609,0.042778,0.000663,0.0045,0.001392,...,0.0,0.0,0.0,0.0,3e-06,1e-05,0.0,7e-06,3e-06,0.0
high_income,1960s,1.0,0.050222,0.02155,0.031911,0.004235,0.02429,0.077532,0.000788,0.003414,0.002171,...,4e-06,0.0,0.0,0.0,0.0,1.1e-05,0.0,0.0,0.0,0.0
high_income,1970s,1.0,0.024678,0.01914,0.025895,0.003632,0.023698,0.111256,0.000804,0.001875,0.00261,...,4e-06,4e-06,0.0,0.0,0.0,1.1e-05,0.0,0.0,4e-06,0.0
high_income,1980s,1.0,0.013893,0.009409,0.013839,0.003494,0.01621,0.150454,0.000304,0.000387,0.004243,...,0.0,4e-06,0.0,2.5e-05,4e-06,4e-06,0.0,4e-06,1.2e-05,0.0
high_income,1990s,1.0,0.007407,0.001497,0.002184,0.002899,0.009624,0.166275,7.8e-05,2.8e-05,0.004888,...,0.0,6e-06,6e-06,0.00014,0.0,1.7e-05,0.0,0.0,0.0,6e-06
high_income,2000s,1.0,0.016968,0.007515,0.000782,0.002254,0.006327,0.085216,7.5e-05,0.000962,0.001683,...,0.0,0.0,1.5e-05,4.5e-05,0.0,0.0,0.0,0.0,0.0,0.0
high_income,2010s,1.0,0.030483,0.000847,0.001693,0.022015,0.0,0.047417,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Average over all decades for the top-50 relations that are equipped with the most references

Unnamed: 0_level_0,date of birth (P569),date of death (P570),doctoral advisor (P184),copyright representative (P6275),significant event (P793),candidacy in election (P3602),X (Twitter) username (P2002),Swedish Royal Theater Archive (P7574),e-Rad researcher number (P9776),sexual orientation (P91),...,highest note (P1897),post office box (P2918),Wikidata item of this property (P1629),multi-channel network (P6540),manufacturer (P176),EU VAT number (P3608),location of discovery (P189),motto (P1546),number of deaths (P1120),autonomous system number (P3797)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,0.185265,0.013729,0.015825,0.007776,0.010965,0.064834,0.000586,0.00154,0.001806,...,6.736196e-07,1e-06,2e-06,1.9e-05,6.890091e-07,5e-06,8.476601e-07,2e-06,3e-06,5.078068e-07
low_income,1.0,0.260341,0.005474,0.001235,0.011266,0.006677,0.031913,3.1e-05,0.0,0.000405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,1.0,0.2353,0.006535,0.001743,0.009993,0.01037,0.039472,1.4e-05,1.2e-05,0.000842,...,0.0,5e-06,0.0,0.0,0.0,5e-06,5.376373e-06,0.0,0.0,0.0
upper-middle_income,1.0,0.19872,0.005376,0.002982,0.00362,0.085941,0.033148,2.9e-05,1.5e-05,0.001262,...,1.522051e-06,2e-06,0.0,1.3e-05,0.0,0.0,2.618252e-05,0.0,3e-06,0.0


## Calculate average use of references

### For each decade and the top-50 most popular relations - normalized by claims

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,imported from Wikimedia project (P143),retrieved (P813),stated in (P248),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(decade, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,1920s,0.332768,0.139486,0.138795,0.064934,0.062943,0.006088,0.004132
high_income,1930s,0.335461,0.136765,0.136393,0.068905,0.06251,0.007118,0.00391
high_income,1940s,0.334333,0.131436,0.150213,0.068572,0.062902,0.00613,0.004116
high_income,1950s,0.333963,0.127518,0.155455,0.068206,0.065934,0.058612,0.003728
high_income,1960s,0.336351,0.128266,0.142775,0.075386,0.065625,0.063397,0.003548
high_income,1970s,0.344648,0.12859,0.123793,0.082456,0.065929,0.065787,0.00322
high_income,1980s,0.351759,0.124753,0.105338,0.089857,0.069162,0.067533,0.002395
high_income,1990s,0.2822,0.127582,0.105573,0.108379,0.081078,0.067334,0.002065
high_income,2000s,0.183488,0.131115,0.098162,0.13634,0.073358,0.059412,0.003239
high_income,2010s,0.110709,0.10516,0.064463,0.150157,0.029523,0.043335,0.000811


### Average over all decades for the top-50 most popular relations - normalized by claims

Unnamed: 0_level_0,imported from Wikimedia project (P143),retrieved (P813),stated in (P248),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.277125,0.126305,0.119506,0.099517,0.067065,0.042516,0.002833
low_income,0.280014,0.072607,0.072331,0.102788,0.077576,0.033704,0.002818
lower-middle_income,0.272319,0.064144,0.062009,0.076293,0.078717,0.036181,0.002897
upper-middle_income,0.285046,0.078315,0.085965,0.079814,0.082611,0.033564,0.002048


### For each decade and the top-50 relations that are equipped with the most references - normalized by claims

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,stated in (P248),retrieved (P813),imported from Wikimedia project (P143),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(decade, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,1920s,0.408845,0.246015,0.161252,0.154226,0.086311,0.013191,0.013072
high_income,1930s,0.408991,0.242218,0.171058,0.228558,0.092657,0.01818,0.014118
high_income,1940s,0.371081,0.295894,0.139681,0.223233,0.067313,0.091131,0.013966
high_income,1950s,0.367161,0.278748,0.148698,0.179456,0.063954,0.075221,0.013473
high_income,1960s,0.339435,0.245621,0.155191,0.218518,0.064019,0.105468,0.010395
high_income,1970s,0.336189,0.251364,0.161,0.17497,0.057226,0.126994,0.009477
high_income,1980s,0.343443,0.370737,0.23958,0.242614,0.15253,0.10392,0.004811
high_income,1990s,0.316832,0.317595,0.155887,0.243807,0.08677,0.109839,0.004066
high_income,2000s,0.291092,0.276564,0.214166,0.150209,0.133632,0.083874,0.005845
high_income,2010s,0.167742,0.0719,0.061878,0.071141,0.03433,0.000209,0.000478


### Average over all decades for the top-50 relations that are equipped with the most references - normalized by claims

Unnamed: 0_level_0,stated in (P248),retrieved (P813),imported from Wikimedia project (P143),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.307609,0.241962,0.146576,0.180585,0.076346,0.066184,0.008155
low_income,0.148661,0.076536,0.085374,0.0745,0.016551,0.031498,0.044388
lower-middle_income,0.179027,0.099755,0.129904,0.091489,0.021267,0.058715,0.028259
upper-middle_income,0.235308,0.180782,0.113276,0.12924,0.05831,0.065787,0.02443


### For each decade and the top-50 relations - normalized by claims with references

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,imported from Wikimedia project (P143),retrieved (P813),stated in (P248),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(decade, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,1920s,0.614469,0.2874,0.284254,0.118359,0.093101,0.012758,0.009456
high_income,1930s,0.621914,0.279312,0.276469,0.120342,0.093951,0.015062,0.009015
high_income,1940s,0.604037,0.258804,0.296363,0.11588,0.09081,0.012565,0.009226
high_income,1950s,0.576417,0.231011,0.262897,0.110121,0.094453,0.091707,0.007979
high_income,1960s,0.581994,0.230337,0.242061,0.123229,0.095414,0.096449,0.007489
high_income,1970s,0.590683,0.233522,0.217211,0.136119,0.095804,0.101845,0.006956
high_income,1980s,0.59544,0.226576,0.190862,0.153524,0.102157,0.10774,0.005461
high_income,1990s,0.523847,0.243654,0.20592,0.198423,0.133327,0.113866,0.004467
high_income,2000s,0.42453,0.290572,0.213687,0.28417,0.149311,0.107327,0.007398
high_income,2010s,0.370711,0.286285,0.154148,0.386619,0.140684,0.099177,0.030448


### Average over all decades for the top-50 relations - normalized by claims with references

Unnamed: 0_level_0,imported from Wikimedia project (P143),retrieved (P813),stated in (P248),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.51161,0.256526,0.233844,0.192474,0.109371,0.074407,0.0089
low_income,0.52279,0.176583,0.143426,0.21288,0.144143,0.077342,0.006601
lower-middle_income,0.522589,0.159008,0.129069,0.167606,0.146947,0.088899,0.006981
upper-middle_income,0.538814,0.186014,0.170701,0.169826,0.145042,0.078566,0.004811


### For each decade and the top-50 relations that are equipped with the most references - normalized by claims with references

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,stated in (P248),retrieved (P813),imported from Wikimedia project (P143),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(decade, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,1920s,0.459036,0.294534,0.20937,0.19328,0.110888,0.022768,0.015336
high_income,1930s,0.455398,0.299898,0.25264,0.269952,0.139483,0.032002,0.015666
high_income,1940s,0.460601,0.35655,0.186606,0.302942,0.087231,0.164068,0.015511
high_income,1950s,0.419783,0.357051,0.198995,0.229542,0.081033,0.129441,0.015898
high_income,1960s,0.396055,0.29658,0.207549,0.265196,0.082163,0.165289,0.01193
high_income,1970s,0.389109,0.3021,0.223023,0.224366,0.073775,0.177139,0.011022
high_income,1980s,0.406988,0.426125,0.314922,0.306651,0.200066,0.160957,0.005743
high_income,1990s,0.443098,0.408264,0.219598,0.325398,0.115621,0.185581,0.005338
high_income,2000s,0.365924,0.343547,0.274907,0.217758,0.163325,0.174786,0.007212
high_income,2010s,0.205714,0.103504,0.10641,0.123174,0.067754,0.000336,0.000768


### Average over all decades for the top-50 relations that are equipped with the most references - normalized by claims with references

Unnamed: 0_level_0,stated in (P248),retrieved (P813),imported from Wikimedia project (P143),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.368409,0.299431,0.200088,0.236077,0.10211,0.110215,0.009493
low_income,0.188012,0.116039,0.130584,0.113554,0.024611,0.045806,0.052949
lower-middle_income,0.230586,0.137483,0.195559,0.136423,0.029692,0.101724,0.036745
upper-middle_income,0.289004,0.2262,0.178556,0.173584,0.078548,0.110193,0.032663


In [18]:
calculate_all_stats(df_pid_counter, df_claim_counter, df_claim_with_ref_counter, df_num_refs, df_num_refs_per_pid, group_by=['income_class', 'gender'])

# Group by ['income_class', 'gender']

## Calculate percentage of persons with specific relation having references (there might be multiple claims per person)

Unnamed: 0_level_0,Unnamed: 1_level_0,person_counter,decade,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),Freebase ID (P646),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,female,502510,1930s1980s1970s1940s1950s1960s1920s2000s1990s2...,0.191606,0.036099,0.231792,0.395636,0.456873,0.042455,0.16644,0.080617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,male,1342652,1930s1920s1980s1990s1970s1950s1960s1940s2000s2...,0.223554,0.047012,0.28337,0.49688,0.555507,0.084352,0.242287,0.10055,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
high_income,non-binary,2721,1950s1970s1990s1980s1960s2000s1940s1930s1920s,0.210952,0.035649,0.360162,0.339214,0.514149,0.052187,0.029768,0.108416,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
low_income,female,5175,2000s1970s1980s1950s1960s1940s1990s1920s1930s2...,0.074396,0.067053,0.227826,0.318647,0.404831,0.037101,0.162899,0.063382,...,0.0,0.0,0.000773,0.000193,0.0,0.0,0.0,0.0,0.000193,0.0
low_income,male,16337,1980s1960s1970s1940s1950s1990s1930s2000s1920s2...,0.088694,0.098182,0.172431,0.468691,0.515762,0.076942,0.237987,0.127992,...,0.0,0.0,0.00153,0.0,6.1e-05,6.1e-05,6.1e-05,6.1e-05,0.0,6.1e-05
low_income,non-binary,21,1930s1950s1980s1990s1970s1960s,0.142857,0.047619,0.142857,0.380952,0.333333,0.047619,0.0,0.095238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,female,30195,1980s1940s2000s1960s1990s1970s1950s1920s1930s2...,0.08561,0.061434,0.238715,0.379699,0.470906,0.043981,0.092697,0.068753,...,3.3e-05,3.3e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,male,97653,1940s1960s1930s2000s1970s1980s1990s1950s1920s2...,0.095041,0.11207,0.233224,0.45205,0.528832,0.080428,0.172263,0.083193,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,non-binary,155,1990s1980s1960s1970s1950s2000s1940s1920s,0.058065,0.03871,0.219355,0.348387,0.541935,0.051613,0.077419,0.064516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
upper-middle_income,female,106807,2000s1970s1940s1960s1980s1990s1950s1930s1920s2...,0.059369,0.23976,0.146826,0.315335,0.457489,0.030691,0.04863,0.051111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Calculate percentage of claims with specific relation having references

Unnamed: 0_level_0,Unnamed: 1_level_0,person_counter,decade,VIAF cluster ID (P214),member of political party (P102),educated at (P69),instance of (P31),place of birth (P19),place of death (P20),position held (P39),Freebase ID (P646),...,Encyclopedia of China (Third Edition) ID (P10565),member of cabinet (P5054),Ministry of Unification politician ID (P11150),has goal (P3712),parent club (P831),time played (P9140),The Counted person ID (P11638),measured physical quantity (P111),Netherlands Olympic Committee athlete ID (P10957),IMDb keyword (P11924)
income_class,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,female,502510,1930s1980s1970s1940s1950s1960s1920s2000s1990s2...,0.506097,0.519547,0.526564,0.395504,0.741552,0.733228,0.763155,0.312488,...,0.0,0.0,,,0.0,,,,,
high_income,male,1342652,1930s1920s1980s1990s1970s1950s1960s1940s2000s2...,0.574208,0.511752,0.587104,0.496759,0.785034,0.749494,0.746703,0.28654,...,0.0,,,,0.0,,0.0,,,0.0
high_income,non-binary,2721,1950s1970s1990s1980s1960s2000s1940s1930s1920s,0.473988,0.602484,0.58753,0.339089,0.705853,0.699507,0.566434,0.381137,...,0.0,,,,,,,,,
low_income,female,5175,2000s1970s1980s1950s1960s1940s1990s1920s1930s2...,0.472973,0.601386,0.466377,0.318463,0.662137,0.676056,0.618943,0.271299,...,0.0,,0.666667,1.0,,,,,1.0,
low_income,male,16337,1980s1960s1970s1940s1950s1990s1930s2000s1920s2...,0.537264,0.724481,0.533422,0.46849,0.750913,0.683152,0.669883,0.318362,...,0.0,,0.490196,,0.5,1.0,1.0,1.0,,1.0
low_income,non-binary,21,1930s1950s1980s1990s1970s1960s,0.428571,1.0,0.3,0.380952,0.538462,1.0,,0.4,...,,,,,,,,,,
lower-middle_income,female,30195,1980s1940s2000s1960s1990s1970s1950s1920s1930s2...,0.483811,0.634405,0.539238,0.379535,0.728507,0.669018,0.514995,0.205138,...,0.025,1.0,,,0.0,,,,,
lower-middle_income,male,97653,1940s1960s1930s2000s1970s1980s1990s1950s1920s2...,0.532717,0.641162,0.548531,0.451971,0.768802,0.666271,0.478183,0.193249,...,0.0,0.0,,,0.0,,,,,
lower-middle_income,non-binary,155,1990s1980s1960s1970s1950s2000s1940s1920s,0.375,0.6,0.459459,0.346154,0.705882,0.8,0.631579,0.25641,...,,,,,,,,,,
upper-middle_income,female,106807,2000s1970s1940s1960s1980s1990s1950s1930s1920s2...,0.456318,0.927289,0.523868,0.315276,0.749283,0.636258,0.521748,0.285036,...,0.0,0.0,,,0.0,,,,,


## Calculate percentage of claims having references

Unnamed: 0_level_0,Unnamed: 1_level_0,person_counter,decade,P_sum
income_class,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high_income,female,502510,1930s1980s1970s1940s1950s1960s1920s2000s1990s2...,0.455877
high_income,male,1342652,1930s1920s1980s1990s1970s1950s1960s1940s2000s2...,0.484413
high_income,non-binary,2721,1950s1970s1990s1980s1960s2000s1940s1930s1920s,0.410915
low_income,female,5175,2000s1970s1980s1950s1960s1940s1990s1920s1930s2...,0.397498
low_income,male,16337,1980s1960s1970s1940s1950s1990s1930s2000s1920s2...,0.448729
low_income,non-binary,21,1930s1950s1980s1990s1970s1960s,0.445415
lower-middle_income,female,30195,1980s1940s2000s1960s1990s1970s1950s1920s1930s2...,0.408805
lower-middle_income,male,97653,1940s1960s1930s2000s1970s1980s1990s1950s1920s2...,0.42973
lower-middle_income,non-binary,155,1990s1980s1960s1970s1950s2000s1940s1920s,0.352805
upper-middle_income,female,106807,2000s1970s1940s1960s1980s1990s1950s1930s1920s2...,0.466443


## Calculate percentage of claims having a specific reference type

  df_summed = copy_df_claim_with_ref_counter.groupby(level=1, axis=1).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,imported from Wikimedia project (P143),stated in (P248),inferred from (P3452),Wikimedia import URL (P4656),retrieved (P813),reference URL (P854),based on heuristic (P887)
"(income_class, nan)","(gender, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,female,0.160303,0.183494,0.005861,0.024686,0.172723,0.054412,0.053259
high_income,male,0.202141,0.189154,0.005576,0.0264,0.17345,0.047783,0.045792
high_income,non-binary,0.117269,0.157712,0.006469,0.028248,0.195439,0.105509,0.038048
low_income,female,0.172136,0.088714,0.004658,0.045003,0.094291,0.08023,0.037326
low_income,male,0.264939,0.092629,0.004452,0.035652,0.098108,0.053536,0.031404
low_income,non-binary,0.111354,0.222707,0.004367,0.008734,0.237991,0.091703,0.026201
lower-middle_income,female,0.18452,0.110491,0.004456,0.052018,0.102671,0.060379,0.034361
lower-middle_income,male,0.242282,0.10177,0.004475,0.040279,0.092067,0.047746,0.029283
lower-middle_income,non-binary,0.140264,0.071287,0.0033,0.050825,0.084158,0.09868,0.033003
upper-middle_income,female,0.162736,0.186178,0.003747,0.033345,0.134031,0.08781,0.040625


## Calculate the average occurrence of each relation

### For each decade and the top-50 most popular relations (if multiple claims occur, it is still counted as one - replacing with df_claim_counter would consider this)

Unnamed: 0_level_0,Unnamed: 1_level_0,sex or gender (P21),instance of (P31),date of birth (P569),country of citizenship (P27),occupation (P106),given name (P735),family name (P734),place of birth (P19),"languages spoken, written or signed (P1412)",educated at (P69),...,position held (P39),official website (P856),native language (P103),position played on team / speciality (P413),work period (start) (P2031),country for sport (P1532),member of political party (P102),place of death (P20),member of sports team (P54),mass (P2067)
income_class,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,female,1.0,1.0,1.0,1.0,0.965535,0.843494,0.66929,0.610943,0.557625,0.292245,...,0.095433,0.094169,0.093023,0.080824,0.07934,0.067692,0.061424,0.057129,0.051269,0.041305
high_income,male,1.0,1.0,1.0,1.0,0.961919,0.876494,0.695718,0.702674,0.575878,0.326131,...,0.131905,0.069248,0.076718,0.149333,0.075398,0.078736,0.078395,0.111421,0.18853,0.077249
high_income,non-binary,1.0,1.0,1.0,1.0,0.975377,0.749357,0.661521,0.717751,0.538037,0.382947,...,0.035281,0.290702,0.120176,0.004778,0.174201,0.010658,0.048144,0.074237,0.010658,0.018376
low_income,female,1.0,1.0,1.0,1.0,0.989758,0.525024,0.30029,0.603865,0.397101,0.288309,...,0.162126,0.033043,0.035362,0.076329,0.062802,0.065121,0.105314,0.052947,0.038841,0.041739
low_income,male,1.0,1.0,1.0,1.0,0.975393,0.565404,0.247965,0.679256,0.458407,0.206341,...,0.203648,0.022097,0.027606,0.23052,0.068556,0.089551,0.127808,0.111159,0.238355,0.068372
low_income,non-binary,1.0,1.0,1.0,1.0,1.0,0.714286,0.380952,0.619048,0.666667,0.380952,...,0.0,0.047619,0.095238,0.0,0.095238,0.095238,0.047619,0.047619,0.0,0.047619
lower-middle_income,female,1.0,1.0,1.0,1.0,0.980361,0.505978,0.32926,0.639146,0.427091,0.285676,...,0.113496,0.049578,0.151913,0.070442,0.14105,0.079086,0.089949,0.064713,0.040437,0.0308
lower-middle_income,male,1.0,1.0,1.0,1.0,0.972064,0.49341,0.252302,0.677716,0.424821,0.273571,...,0.196922,0.036077,0.166365,0.150338,0.095553,0.085681,0.157865,0.118133,0.171976,0.048089
lower-middle_income,non-binary,1.0,1.0,1.0,1.0,0.974194,0.445161,0.354839,0.754839,0.43871,0.303226,...,0.051613,0.109677,0.096774,0.0,0.135484,0.025806,0.045161,0.064516,0.0,0.019355
upper-middle_income,female,1.0,1.0,1.0,1.0,0.933497,0.477815,0.329192,0.607207,0.342187,0.206831,...,0.061578,0.036524,0.080341,0.074471,0.090996,0.066419,0.249656,0.047899,0.041149,0.037441


### Average over all decades for the top-50 most popular relations

Unnamed: 0_level_0,sex or gender (P21),instance of (P31),date of birth (P569),country of citizenship (P27),occupation (P106),given name (P735),family name (P734),place of birth (P19),"languages spoken, written or signed (P1412)",educated at (P69),...,position held (P39),official website (P856),native language (P103),position played on team / speciality (P413),work period (start) (P2031),country for sport (P1532),member of political party (P102),place of death (P20),member of sports team (P54),mass (P2067)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,1.0,1.0,1.0,0.96761,0.823115,0.67551,0.677123,0.55718,0.333774,...,0.08754,0.151373,0.096639,0.078312,0.109646,0.052362,0.062654,0.080929,0.083485,0.045643
low_income,1.0,1.0,1.0,1.0,0.988384,0.601571,0.309736,0.634056,0.507392,0.291868,...,0.121925,0.034253,0.052735,0.102283,0.075532,0.083303,0.09358,0.070575,0.092398,0.052577
lower-middle_income,1.0,1.0,1.0,1.0,0.97554,0.481516,0.312133,0.690567,0.430207,0.287491,...,0.120677,0.065111,0.13835,0.073594,0.124029,0.063524,0.097658,0.082454,0.070804,0.032748
upper-middle_income,1.0,1.0,1.0,1.0,0.953247,0.541577,0.380418,0.713719,0.422447,0.209301,...,0.074805,0.058857,0.091418,0.093808,0.118108,0.062348,0.188228,0.086984,0.085443,0.038277


### For each decade and the top-50 relations that are equipped with the most references (not just counting the claims with references but the ratio of claims with references to all claims)

Unnamed: 0_level_0,Unnamed: 1_level_0,date of birth (P569),X (Twitter) username (P2002),date of death (P570),copyright representative (P6275),candidacy in election (P3602),doctoral advisor (P184),significant event (P793),sexual orientation (P91),affiliation string (P6424),Tumblr username (P3943),...,motto (P1546),location of discovery (P189),investigated by (P1840),highest note (P1897),number of deaths (P1120),manufacturer (P176),identifiers.org prefix (P4793),Wikidata item of this property (P1629),platform (P400),autonomous system number (P3797)
income_class,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
high_income,female,1.0,0.101807,0.097626,0.02793,0.018274,0.014591,0.004466,0.003813,0.001586,0.001401,...,2e-06,1.99001e-06,2e-06,1.99001e-06,0.0,0.0,0.0,0.0,0.0,0.0
high_income,male,1.0,0.065482,0.199367,0.019685,0.014573,0.019179,0.004827,0.001541,0.002067,0.000623,...,3e-06,7.447946e-07,1e-05,7.447946e-07,5e-06,1e-06,1e-06,1e-06,7.447946e-07,0.0
high_income,non-binary,1.0,0.368614,0.116134,0.019478,0.019478,0.016538,0.025726,0.089305,0.002573,0.031974,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000368
low_income,female,1.0,0.065121,0.082705,0.00058,0.003865,0.003285,0.005604,0.000966,0.004831,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
low_income,male,1.0,0.037828,0.186142,0.001775,0.009977,0.006733,0.006243,0.000428,0.002142,0.000122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
low_income,non-binary,1.0,0.095238,0.047619,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,female,1.0,0.086405,0.106971,0.002252,0.008081,0.005696,0.007584,0.001987,0.00467,0.000431,...,0.0,3.311807e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,male,1.0,0.04561,0.212057,0.001966,0.015954,0.00852,0.006021,0.00087,0.004373,0.000215,...,0.0,0.0,0.0,0.0,0.0,0.0,1e-05,0.0,1.024034e-05,0.0
lower-middle_income,non-binary,1.0,0.206452,0.109677,0.0,0.012903,0.0,0.019355,0.058065,0.012903,0.006452,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
upper-middle_income,female,1.0,0.06155,0.071269,0.003492,0.18746,0.005355,0.002509,0.00309,0.001741,0.000328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Average over all decades for the top-50 relations that are equipped with the most references

Unnamed: 0_level_0,date of birth (P569),X (Twitter) username (P2002),date of death (P570),copyright representative (P6275),candidacy in election (P3602),doctoral advisor (P184),significant event (P793),sexual orientation (P91),affiliation string (P6424),Tumblr username (P3943),...,motto (P1546),location of discovery (P189),investigated by (P1840),highest note (P1897),number of deaths (P1120),manufacturer (P176),identifiers.org prefix (P4793),Wikidata item of this property (P1629),platform (P400),autonomous system number (P3797)
income_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
high_income,1.0,0.178635,0.137709,0.022364,0.017442,0.016769,0.011673,0.031553,0.002075,0.011332,...,2e-06,9.116016e-07,4e-06,9.116016e-07,2e-06,4.965298e-07,4.965298e-07,4.965298e-07,2.482649e-07,0.000123
low_income,1.0,0.066062,0.105489,0.000785,0.004614,0.003339,0.003949,0.016338,0.002324,4.1e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lower-middle_income,1.0,0.112822,0.142902,0.001406,0.012313,0.004739,0.010987,0.020307,0.007315,0.002366,...,0.0,1.103936e-05,0.0,0.0,0.0,0.0,3.413447e-06,0.0,3.413447e-06,0.0
upper-middle_income,1.0,0.108155,0.1319,0.00342,0.109857,0.004913,0.007644,0.025082,0.002586,0.000792,...,0.0,2.101744e-05,0.0,1.106181e-06,1e-06,0.0,0.0006409014,0.0,1.106181e-06,0.0


## Calculate average use of references

### For each decade and the top-50 most popular relations - normalized by claims

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,imported from Wikimedia project (P143),stated in (P248),retrieved (P813),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(gender, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,female,0.279473,0.168503,0.13738,0.086548,0.062771,0.050552,0.003782
high_income,male,0.342915,0.12876,0.126661,0.069475,0.069296,0.045478,0.003635
high_income,non-binary,0.220145,0.108533,0.149418,0.155369,0.071859,0.035285,0.003788
low_income,female,0.224485,0.106976,0.084449,0.11769,0.086154,0.042421,0.003001
low_income,male,0.33407,0.068123,0.084142,0.082299,0.079764,0.041441,0.003492
low_income,non-binary,0.216951,0.154946,0.183884,0.095346,0.024138,0.044686,0.007781
lower-middle_income,female,0.271195,0.093854,0.091003,0.093966,0.092151,0.044703,0.003297
lower-middle_income,male,0.344858,0.061486,0.070185,0.064868,0.087063,0.041221,0.00408
lower-middle_income,non-binary,0.178769,0.026054,0.088707,0.139558,0.096671,0.042135,0.001852
upper-middle_income,female,0.260537,0.148356,0.102732,0.095715,0.079927,0.043506,0.002686


### Average over all decades for the top-50 most popular relations - normalized by claims

Unnamed: 0_level_0,imported from Wikimedia project (P143),stated in (P248),retrieved (P813),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.280844,0.135265,0.137819,0.103797,0.067976,0.043772,0.003735
low_income,0.258502,0.110015,0.117492,0.098445,0.063352,0.042849,0.004758
lower-middle_income,0.264941,0.060465,0.083298,0.099464,0.091962,0.042687,0.003076
upper-middle_income,0.264249,0.109491,0.103571,0.115441,0.080793,0.039999,0.002474


### For each decade and the top-50 relations that are equipped with the most references - normalized by claims

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,stated in (P248),retrieved (P813),reference URL (P854),imported from Wikimedia project (P143),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(gender, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,female,0.336699,0.319276,0.290464,0.239957,0.155084,0.097706,0.009648
high_income,male,0.382423,0.342652,0.225561,0.262668,0.168519,0.118644,0.011447
high_income,non-binary,0.299995,0.323623,0.183732,0.176106,0.118822,0.079004,0.004572
low_income,female,0.165873,0.159403,0.135615,0.05467,0.011597,0.007123,0.089428
low_income,male,0.257242,0.127011,0.105255,0.092241,0.016534,0.136166,0.06539
low_income,non-binary,0.088889,0.055901,0.049451,0.014881,0.0,0.0,0.0
lower-middle_income,female,0.224896,0.122487,0.117304,0.143479,0.016912,0.108602,0.042145
lower-middle_income,male,0.293403,0.194101,0.172748,0.203214,0.073759,0.075113,0.026791
lower-middle_income,non-binary,0.045604,0.042425,0.107923,0.079116,0.007196,0.015152,0.0
upper-middle_income,female,0.265137,0.217122,0.158703,0.174104,0.132712,0.088469,0.015373


### Average over all decades for the top-50 relations that are equipped with the most references - normalized by claims

Unnamed: 0_level_0,stated in (P248),retrieved (P813),reference URL (P854),imported from Wikimedia project (P143),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.339706,0.328517,0.233253,0.226244,0.147475,0.098451,0.008556
low_income,0.170668,0.114105,0.096774,0.053931,0.009377,0.047763,0.051606
lower-middle_income,0.187968,0.119671,0.132658,0.141937,0.032622,0.066289,0.022979
upper-middle_income,0.252671,0.217928,0.176504,0.146346,0.107569,0.056372,0.01035


### For each decade and the top-50 relations - normalized by claims with references

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,imported from Wikimedia project (P143),stated in (P248),retrieved (P813),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(gender, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,female,0.523276,0.27687,0.249545,0.146112,0.095132,0.093685,0.007908
high_income,male,0.601345,0.232405,0.234936,0.117019,0.101466,0.091584,0.007675
high_income,non-binary,0.451743,0.225121,0.327713,0.293111,0.131302,0.082626,0.009477
low_income,female,0.458151,0.189509,0.200934,0.240547,0.177702,0.100901,0.0067
low_income,male,0.599548,0.146464,0.201062,0.174731,0.13263,0.105935,0.00809
low_income,non-binary,0.317016,0.239076,0.312808,0.202734,0.051724,0.070652,0.019397
lower-middle_income,female,0.525071,0.169641,0.205702,0.191851,0.179587,0.100692,0.007292
lower-middle_income,male,0.629965,0.130936,0.169803,0.149419,0.150692,0.107562,0.009507
lower-middle_income,non-binary,0.401954,0.079361,0.201626,0.282918,0.232663,0.093406,0.004436
upper-middle_income,female,0.518025,0.238796,0.205274,0.178434,0.138142,0.10055,0.005817


### Average over all decades for the top-50 relations - normalized by claims with references

Unnamed: 0_level_0,imported from Wikimedia project (P143),stated in (P248),retrieved (P813),reference URL (P854),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.525455,0.244799,0.270731,0.185414,0.1093,0.089298,0.008353
low_income,0.458238,0.191683,0.238268,0.206004,0.120686,0.092496,0.011395
lower-middle_income,0.518997,0.126646,0.192377,0.208063,0.187648,0.100553,0.007078
upper-middle_income,0.508859,0.196191,0.21474,0.225925,0.135337,0.10028,0.005557


### For each decade and the top-50 relations that are equipped with the most references - normalized by claims with references

  normalized_num_refs_per_pid = normalized_num_refs_per_pid.groupby(level=1, axis=1).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,stated in (P248),reference URL (P854),retrieved (P813),imported from Wikimedia project (P143),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)","(gender, nan)",Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high_income,female,0.395909,0.375837,0.375421,0.31605,0.193898,0.163965,0.01122
high_income,male,0.479606,0.376557,0.466179,0.327158,0.197674,0.17446,0.013165
high_income,non-binary,0.360156,0.237791,0.39707,0.212628,0.130461,0.14639,0.006415
low_income,female,0.214248,0.190481,0.21816,0.09404,0.023287,0.031646,0.113246
low_income,male,0.352922,0.162434,0.206495,0.15685,0.028457,0.282994,0.070741
low_income,non-binary,0.095833,0.052885,0.059783,0.019531,0.0,0.0,0.0
lower-middle_income,female,0.290259,0.161133,0.164126,0.223024,0.028688,0.183789,0.046273
lower-middle_income,male,0.367973,0.227793,0.247166,0.292446,0.084443,0.15951,0.032272
lower-middle_income,non-binary,0.090598,0.138856,0.056689,0.116506,0.01546,0.041667,0.0
upper-middle_income,female,0.334612,0.20116,0.27521,0.233754,0.158731,0.179799,0.01849


### Average over all decades for the top-50 relations that are equipped with the most references - normalized by claims with references

Unnamed: 0_level_0,stated in (P248),reference URL (P854),retrieved (P813),imported from Wikimedia project (P143),Wikimedia import URL (P4656),based on heuristic (P887),inferred from (P3452)
"(income_class, nan)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
high_income,0.41189,0.330062,0.41289,0.285279,0.174011,0.161605,0.010267
low_income,0.221001,0.135267,0.161479,0.09014,0.017248,0.10488,0.061329
lower-middle_income,0.24961,0.175927,0.155994,0.210659,0.042864,0.128322,0.026182
upper-middle_income,0.316751,0.218973,0.271384,0.202787,0.129377,0.122672,0.012217
