In [None]:
import re
from decimal import Decimal

import numpy as np
import pandas as pd
from scipy.stats import chisquare


In [2]:
# Expected first-digit frequencies under Benford's law,
# P(d) = log10(1 + 1/d) for d = 1..9, rounded to four decimal places.
benford_first_digit = {
    digit: round(float(np.log10(1 + 1 / digit)), 4)
    for digit in range(1, 10)
}

# Load the dataset catalogue (filename, title, description, source per row)
# and preview its first rows.
metadata = pd.read_csv('../data/metadata.csv')
display(metadata.head())

Unnamed: 0,filename,title,description,source
0,population-since-10000bc.csv,"Population, 10,000 BCE to 2023","Population by country, available from 10,000 B...",HYDE (2023); Gapminder (2022); UN WPP (2024)
1,yearly-number-of-objects-launched-into-outer-s...,Annual number of objects launched into space,"Annual number of satellites, probes, landers, ...",UNOOSA
2,fish-seafood-production.csv,Fish and Seafood Production,Fish and seafood production is measured as the...,Food and Agriculture Organization of the Unite...
3,number-of-people-living-in-extreme-poverty.csv,"Number of people living in extreme poverty, 2024",Extreme poverty is defined as living below the...,World Bank Poverty and Inequality Platform (2025)
4,annual-co2-emissions-per-country.csv,Annual CO? emissions by country,Annual total emissions of carbon dioxide (CO?)...,Global Carbon Budget (2024)


In [3]:
# Load the historical population dataset and preview its first rows.
population_data = pd.read_csv("../data/population-since-10000bc.csv")
population_data.head()

Unnamed: 0,Entity,Code,Year,Population (historical)
0,Afghanistan,AFG,-10000,14737
1,Afghanistan,AFG,-9000,20405
2,Afghanistan,AFG,-8000,28253
3,Afghanistan,AFG,-7000,39120
4,Afghanistan,AFG,-6000,54166


In [4]:
# Column whose digit distribution is examined in this and the following cells.
column = 'Population (historical)'

# Take the first character of each value's string form, count occurrences per
# character, and display the counts as proportions of the total.
# NOTE(review): str[0] yields '-' for negative values and '0' for values below 1;
# this assumes the column holds positive integers — confirm before reusing elsewhere.
first_digits = population_data[column].astype(str).str[0].value_counts().sort_index()
first_digits / first_digits.sum()

Population (historical)
1    0.283580
2    0.186296
3    0.133062
4    0.101953
5    0.080914
6    0.065427
7    0.054933
8    0.048769
9    0.045067
Name: count, dtype: float64

In [5]:
# Same tally as for the first digit, but on the last character of each value's
# string form, displayed as proportions of the total.
# NOTE(review): for float columns the last character would be a decimal digit
# (e.g. the '0' of '12.0'); this assumes an integer column — confirm.
last_digits = population_data[column].astype(str).str[-1].value_counts().sort_index()
last_digits / last_digits.sum()

Population (historical)
0    0.138597
1    0.094855
2    0.094583
3    0.097300
4    0.097521
5    0.094176
6    0.096060
7    0.093870
8    0.096485
9    0.096553
Name: count, dtype: float64

In [6]:
# Threshold, in base-10 orders of magnitude, that get_dataset_stats compares a
# column's magnitude range against when judging suitability for Benford analysis.
MIN_ORDERS_OF_MAGNITUDE = 2

def check_magnitude_range(series):
    """Return how many base-10 orders of magnitude the values in ``series`` span.

    The range is ``log10(max |x|) - log10(min |x|)`` computed over the non-zero,
    non-missing entries only. Zeros are excluded because ``log10(0)`` is ``-inf``,
    which would otherwise make the range infinite for any column containing a zero.

    Parameters
    ----------
    series : pandas.Series
        Numeric values; the sign is ignored.

    Returns
    -------
    float
        The magnitude range, or ``nan`` when ``series`` has no non-zero,
        non-missing values.
    """
    magnitudes = series.abs()
    # The comparison is False for both 0 and NaN, so this drops them in one step.
    magnitudes = magnitudes[magnitudes > 0]

    if magnitudes.empty:
        return np.nan

    return np.log10(magnitudes.max()) - np.log10(magnitudes.min())

# Magnitude range of the column selected above in the population dataset.
print(check_magnitude_range(population_data[column]))

9.908041643529613


In [7]:

def get_dataset_stats(dataset, data_dir='../data'):
    """Print a per-column Benford screening report for one CSV dataset.

    For every numeric column of ``{data_dir}/{dataset}`` the function:

    1. takes absolute values and drops zeros and missing entries,
    2. reports the column's magnitude range (via ``check_magnitude_range``)
       relative to ``MIN_ORDERS_OF_MAGNITUDE``,
    3. displays a table with the observed first-digit ratios, the expected
       Benford first-digit ratios, and the observed last-digit ratios.

    Parameters
    ----------
    dataset : str
        File name of the CSV to analyse.
    data_dir : str, optional
        Directory that contains ``dataset``. Defaults to ``'../data'``.

    Returns
    -------
    None
        Results are printed / displayed, not returned.
    """
    df = pd.read_csv(f'{data_dir}/{dataset}')

    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    print(f'Found {len(numerical_cols)} numerical columns in {dataset}: {numerical_cols} \n\n')

    # The expected distribution is the same for every column, so build it once.
    benford_first_digit_series = pd.Series(benford_first_digit)

    print('COLUMN-WISE ANALYSIS:')
    for col in numerical_cols:
        print('\n----------------------------------\n')

        # Sign is irrelevant for digit analysis; zeros have no leading digit
        # and would make the log-based magnitude range undefined.
        col_data = df[col].abs()
        col_data = col_data.replace(0, np.nan).dropna()

        if col_data.empty:
            # Without this guard the NaN range would fall through to the "< threshold" message.
            print(f'Column "{col}" has no non-zero values; skipping.')
            print('\n----------------------------------\n')
            continue

        magnitude_range = np.round(check_magnitude_range(col_data), 2)

        print(f'Column "{col}" has a magnitude range of {magnitude_range}.')

        if magnitude_range == MIN_ORDERS_OF_MAGNITUDE:
            print(f'Since magnitude range({magnitude_range}) = {MIN_ORDERS_OF_MAGNITUDE}, the data should start to show Benford-like properties.')

        elif magnitude_range > MIN_ORDERS_OF_MAGNITUDE:
            print(f'Since magnitude range({magnitude_range}) > {MIN_ORDERS_OF_MAGNITUDE}, it is a good candidate for Benford analysis.')

        else:
            print(f'Since magnitude range({magnitude_range}) < {MIN_ORDERS_OF_MAGNITUDE}, "{col}" is unlikely to follow Benford\'s Law.')

        print('\n----------------------------------\n')

        first_digits = col_data.apply(extract_first_digit).value_counts().sort_index()
        first_digits_ratio = first_digits / first_digits.sum()

        last_digits = col_data.apply(extract_last_digit).value_counts().sort_index()
        last_digits_ratio = last_digits / last_digits.sum()

        # The three series are aligned on the digit index; digits absent from
        # a series become NaN here and are shown as 0 below.
        results = pd.DataFrame({
            'First Digit Ratio': first_digits_ratio,
            'Expected Benford First Digit Ratio': benford_first_digit_series,
            'Last Digit Ratio': last_digits_ratio,
        })

        # print (not display) so the caption is not rendered as a quoted string repr.
        print(f'Distribution of first and last digits for column "{col}":')
        display(results.fillna(0).round(4))



def extract_first_digit(x):
    """Return the leading non-zero digit of ``x`` as an int, or None if there is none.

    The sign is discarded, the number is rendered as text, and every non-digit
    character (decimal point, exponent marker, ...) is stripped before scanning
    from the left for the first digit that is not zero.
    """
    digits_only = re.sub(r"[^0-9]", "", str(abs(x)))
    return next((int(digit) for digit in digits_only if digit != "0"), None)

def extract_last_digit(x):
    """Extract the last digit of a number, ignoring a trailing ``.0`` if present.

    The value is rendered as text. If that text is in scientific notation
    (which Python uses for very large or very small floats, e.g. ``1e-05`` or
    ``1e+16``), it is first expanded to plain positional notation so that the
    digit returned belongs to the number itself and not to its exponent.

    Parameters
    ----------
    x : int, float or str
        Value to inspect; the sign and decimal point are ignored.

    Returns
    -------
    int or None
        The final digit, or None when no digit can be extracted
        (e.g. NaN, infinity, or non-numeric text).
    """
    try:
        s = str(x).strip()

        # Expand scientific notation: '1e-05' -> '0.00001', '1e+16' -> '10000000000000000'.
        if "e" in s.lower():
            s = format(Decimal(s), "f")

        # If the number ends with ".0", drop it
        if s.endswith(".0"):
            s = s[:-2]

        s = s.replace(".", "").replace("-", "")  # remove dot and minus
        return int(s[-1]) if s else None

    # ValueError: last character is not a digit ('nan', 'inf', text);
    # ArithmeticError covers decimal.InvalidOperation for unparseable text.
    except (ValueError, TypeError, ArithmeticError):
        return None

In [8]:
# Run the column-wise digit analysis on the historical population dataset.
get_dataset_stats('population-since-10000bc.csv')

Found 2 numerical columns in population-since-10000bc.csv: ['Year', 'Population (historical)'] 


COLUMN-WISE ANALYSIS:

----------------------------------

Column "Year" has a magnitude range of 2.0.
Since magnitude range(2.0 = 2), the data should start to show Benford-like properties.

----------------------------------



'Distribution of first and last digits for column "Year":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.2209
1,0.8417,0.301,0.0899
2,0.112,0.1761,0.0894
3,0.0066,0.1249,0.0896
4,0.0066,0.0969,0.085
5,0.0066,0.0792,0.0851
6,0.0066,0.0669,0.0851
7,0.0066,0.058,0.085
8,0.0066,0.0512,0.085
9,0.0066,0.0458,0.085



----------------------------------

Column "Population (historical)" has a magnitude range of 9.91.
Since magnitude range(9.91) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column "Population (historical)":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.1386
1,0.2836,0.301,0.0949
2,0.1863,0.1761,0.0946
3,0.1331,0.1249,0.0973
4,0.102,0.0969,0.0975
5,0.0809,0.0792,0.0942
6,0.0654,0.0669,0.0961
7,0.0549,0.058,0.0939
8,0.0488,0.0512,0.0965
9,0.0451,0.0458,0.0966


In [9]:
# Run the column-wise digit analysis on the objects-launched-into-space dataset.
get_dataset_stats('yearly-number-of-objects-launched-into-outer-space.csv')

Found 2 numerical columns in yearly-number-of-objects-launched-into-outer-space.csv: ['Year', 'Annual number of objects launched into outer space'] 


COLUMN-WISE ANALYSIS:

----------------------------------

Column "Year" has a magnitude range of 0.01.
Since magnitude range(0.01) < 2, "Year" is unlikely to follow Benford's Law.

----------------------------------



'Distribution of first and last digits for column "Year":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.1081
1,0.5403,0.301,0.11
2,0.4597,0.1761,0.1061
3,0.0,0.1249,0.0982
4,0.0,0.0969,0.11
5,0.0,0.0792,0.0923
6,0.0,0.0669,0.0806
7,0.0,0.058,0.0884
8,0.0,0.0512,0.1022
9,0.0,0.0458,0.1041



----------------------------------

Column "Annual number of objects launched into outer space" has a magnitude range of 3.46.
Since magnitude range(3.46) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column "Annual number of objects launched into outer space":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.0747
1,0.3635,0.301,0.2004
2,0.2495,0.1761,0.1788
3,0.1297,0.1249,0.1002
4,0.0668,0.0969,0.1139
5,0.0393,0.0792,0.0786
6,0.0275,0.0669,0.0668
7,0.053,0.058,0.0589
8,0.0255,0.0512,0.057
9,0.0452,0.0458,0.0707


In [10]:
# Run the column-wise digit analysis on the constituency-wise election results.
filename = 'constituency-wise-detailed-result.csv'
get_dataset_stats(filename)

Found 7 numerical columns in constituency-wise-detailed-result.csv: [' AGE ', 'Votes Secured -  GENERAL ', ' Votes Secured - POSTAL ', ' Votes Secured - TOTAL ', '% of votes secured - OVER TOTAL ELECTORS IN CONSTITUENCY', '% of votes secured - OVER TOTAL VOTES POLLED IN CONSTITUENCY', 'Total Electors'] 


COLUMN-WISE ANALYSIS:

----------------------------------

Column " AGE " has a magnitude range of 0.57.
Since magnitude range(0.57) < 2, " AGE " is unlikely to follow Benford's Law.

----------------------------------



'Distribution of first and last digits for column " AGE ":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.1026
1,0.0,0.301,0.092
2,0.0636,0.1761,0.0976
3,0.2406,0.1249,0.0944
4,0.2992,0.0969,0.1018
5,0.2131,0.0792,0.1002
6,0.1494,0.0669,0.0992
7,0.0313,0.058,0.1026
8,0.0027,0.0512,0.1083
9,0.0001,0.0458,0.1014



----------------------------------

Column "Votes Secured -  GENERAL " has a magnitude range of 4.1.
Since magnitude range(4.1) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column "Votes Secured -  GENERAL ":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.0989
1,0.2841,0.301,0.0951
2,0.1567,0.1761,0.1002
3,0.1205,0.1249,0.0983
4,0.1168,0.0969,0.1036
5,0.0935,0.0792,0.0992
6,0.0736,0.0669,0.0992
7,0.0571,0.058,0.1005
8,0.0513,0.0512,0.1003
9,0.0464,0.0458,0.1047



----------------------------------

Column " Votes Secured - POSTAL " has a magnitude range of 4.29.
Since magnitude range(4.29) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column " Votes Secured - POSTAL ":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.0589
1,0.3522,0.301,0.2136
2,0.1997,0.1761,0.1604
3,0.1267,0.1249,0.1235
4,0.0893,0.0969,0.098
5,0.0705,0.0792,0.0852
6,0.0515,0.0669,0.0707
7,0.042,0.058,0.0698
8,0.0371,0.0512,0.0615
9,0.031,0.0458,0.0583



----------------------------------

Column " Votes Secured - TOTAL " has a magnitude range of 4.1.
Since magnitude range(4.1) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column " Votes Secured - TOTAL ":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.099
1,0.2844,0.301,0.0977
2,0.1563,0.1761,0.0942
3,0.1204,0.1249,0.0993
4,0.1164,0.0969,0.0991
5,0.0938,0.0792,0.1004
6,0.0737,0.0669,0.1027
7,0.0561,0.058,0.1021
8,0.0516,0.0512,0.1012
9,0.0472,0.0458,0.1042



----------------------------------

Column "% of votes secured - OVER TOTAL ELECTORS IN CONSTITUENCY" has a magnitude range of 3.98.
Since magnitude range(3.98) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column "% of votes secured - OVER TOTAL ELECTORS IN CONSTITUENCY":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
1,0.2707,0.301,0.1162
2,0.1889,0.1761,0.1167
3,0.1532,0.1249,0.1047
4,0.1028,0.0969,0.1103
5,0.0719,0.0792,0.1093
6,0.062,0.0669,0.1048
7,0.0544,0.058,0.1169
8,0.0494,0.0512,0.1072
9,0.0466,0.0458,0.1139



----------------------------------

Column "% of votes secured - OVER TOTAL VOTES POLLED IN CONSTITUENCY" has a magnitude range of 3.97.
Since magnitude range(3.97) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column "% of votes secured - OVER TOTAL VOTES POLLED IN CONSTITUENCY":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
1,0.2774,0.301,0.1113
2,0.1567,0.1761,0.1138
3,0.1348,0.1249,0.1046
4,0.1184,0.0969,0.114
5,0.0938,0.0792,0.1157
6,0.0729,0.0669,0.1135
7,0.0521,0.058,0.1114
8,0.0529,0.0512,0.1061
9,0.0409,0.0458,0.1096



----------------------------------

Column "Total Electors" has a magnitude range of 1.76.
Since magnitude range(1.76) < 2, "Total Electors" is unlikely to follow Benford's Law.

----------------------------------



'Distribution of first and last digits for column "Total Electors":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.1309
1,0.8715,0.301,0.0997
2,0.1098,0.1761,0.0708
3,0.0041,0.1249,0.1146
4,0.0023,0.0969,0.1105
5,0.0024,0.0792,0.1096
6,0.0043,0.0669,0.1017
7,0.002,0.058,0.0678
8,0.0,0.0512,0.0815
9,0.0036,0.0458,0.1129


In [11]:
# Load the election results directly and strip the surrounding whitespace from
# the column names (the raw header contains names such as ' AGE ').
election_data = pd.read_csv('../data/constituency-wise-detailed-result.csv')
election_data.columns = election_data.columns.str.strip()
election_data.head()

Unnamed: 0,State Name,PC NAME,CANDIDATES NAME,SEX,AGE,CATEGORY,PARTY NAME,PARTY SYMBOL,Votes Secured - GENERAL,Votes Secured - POSTAL,Votes Secured - TOTAL,% of votes secured - OVER TOTAL ELECTORS IN CONSTITUENCY,% of votes secured - OVER TOTAL VOTES POLLED IN CONSTITUENCY,Total Electors
0,Andhra Pradesh,Aruku,KISHORE CHANDRA DEO,MALE,72.0,ST,TDP,Bicycle,336163.0,1938.0,338101.0,23.29453,31.356893,1451418.0
1,Andhra Pradesh,Aruku,Dr. KOSURI KASI VISWANADHA VEERA VENKATA SATYA...,MALE,54.0,ST,BJP,Lotus,17578.0,289.0,17867.0,1.231003,1.65706,1451418.0
2,Andhra Pradesh,Aruku,GODDETI. MADHAVI,FEMALE,26.0,ST,YSRCP,Ceiling Fan,557561.0,4629.0,562190.0,38.733845,52.13984,1451418.0
3,Andhra Pradesh,Aruku,SHRUTI DEVI VYRICHERLA,FEMALE,46.0,ST,INC,Hand,17656.0,74.0,17730.0,1.221564,1.644354,1451418.0
4,Andhra Pradesh,Aruku,GANGULAIAH VAMPURU.,MALE,49.0,ST,JnP,Glass Tumbler,42245.0,549.0,42794.0,2.948427,3.968894,1451418.0


In [12]:
# Collapse "Total Electors" to one value per constituency ('PC NAME') by
# averaging over that constituency's rows.
total_electors_values = election_data.groupby('PC NAME')['Total Electors'].mean()

# Share of constituencies per first character of that value's string form.
# NOTE(review): the mean only equals the true figure when every row of a
# constituency carries the same value — the next cell checks this.
total_electors_values.astype(str).str[0].value_counts().sort_index() / total_electors_values.shape[0]

Total Electors
1    0.888889
2    0.085185
3    0.005556
4    0.003704
5    0.005556
6    0.001852
7    0.005556
9    0.003704
Name: count, dtype: float64

In [13]:
# Count the distinct "Total Electors" values within each constituency, then
# tally how many constituencies have each count.
election_data.groupby('PC NAME')['Total Electors'].nunique().value_counts()

Total Electors
1    537
2      3
Name: count, dtype: int64

In [18]:
# Load the overview table of the exploration datasets.
datasets_overview = pd.read_csv('../data/exploration-datasets/datasets_overview.csv')

In [19]:
# Preview the first rows of the overview table.
datasets_overview.head()

Unnamed: 0,filename,title,description,source,num_rows,url,date_downloaded
0,share-of-population-in-extreme-poverty.csv,Share of population living in extreme poverty,Percentage of population living in households ...,World Bank Poverty and Inequality Platform (2025),2743,https://ourworldindata.org/grapher/share-of-po...,2025-09-21
1,gdp-per-capita-maddison-project-database.csv,GDP per capita,Average economic output per person in a countr...,Bolt and van Zanden – Maddison Project Databas...,21586,https://ourworldindata.org/grapher/gdp-per-cap...,2025-09-21
2,distribution-of-population-poverty-thresholds.csv,Distribution of population between different p...,Number of people living in households with an ...,World Bank Poverty and Inequality Platform (2025),2743,https://ourworldindata.org/grapher/distributio...,2025-09-21
3,human-development-index.csv,Human Development Index,The Human Development Index (HDI) is a summary...,"UNDP, Human Development Report (2025)",6683,https://ourworldindata.org/grapher/human-devel...,2025-09-21
4,prevalence-of-undernourishment.csv,Share of people who are undernourished,Share of the population whose daily food intak...,Food and Agriculture Organization of the Unite...,4683,https://ourworldindata.org/grapher/prevalence-...,2025-09-21


In [None]:

# Significance level for the goodness-of-fit decision below.
ALPHA = 0.05

# Expected Benford distribution for first digits 1-9: P(d) = log10(1 + 1/d)
benford_probs = {
    d: np.log10(1 + 1/d) for d in range(1, 10)
}

# Example observed data: digit counts from your dataset (digits 1..9, in order)
observed_counts = np.array([30, 18, 12, 10, 8, 7, 6, 5, 4])  # replace with your counts
n = observed_counts.sum()

# Expected counts under Benford's law, in the same digit order as observed_counts
expected_counts = n * np.array(list(benford_probs.values()))

# Chi-square goodness of fit test.
# H0: the observed first-digit counts are drawn from the Benford distribution.
chi2_stat, p_value = chisquare(f_obs=observed_counts, f_exp=expected_counts)

print("Chi-square statistic:", chi2_stat)
print("p-value:", p_value)

# Failing to reject H0 does not prove the data follows Benford's law; it only
# means this test found no significant deviation from it.
if p_value > ALPHA:
    print(f"Fail to reject null: data is consistent with Benford's law (at {ALPHA:.0%} significance).")
else:
    print(f"Reject null: data deviates significantly from Benford's law (at {ALPHA:.0%} significance).")

Chi-square statistic: 0.1351691497839959
p-value: 0.9999991763847688
Fail to reject null: data follows Benford's law (at 5% significance).
