In [None]:
import pandas as pd
import numpy as np

In [12]:
# Expected share of each leading digit (1 through 9) under Benford's law,
# rounded to four decimal places. Keys are the integer digits.
benford_first_digit = dict(zip(
    range(1, 10),
    (0.3010, 0.1761, 0.1249, 0.0969, 0.0792, 0.0669, 0.0580, 0.0512, 0.0458),
))

# Catalogue of the available datasets; preview the first five rows.
metadata = pd.read_csv(filepath_or_buffer='../data/metadata.csv')
display(metadata.head(n=5))

Unnamed: 0,filename,title,description,source
0,population-since-10000bc.csv,"Population, 10,000 BCE to 2023","Population by country, available from 10,000 B...",HYDE (2023); Gapminder (2022); UN WPP (2024)
1,yearly-number-of-objects-launched-into-outer-s...,Annual number of objects launched into space,"Annual number of satellites, probes, landers, ...",UNOOSA
2,fish-seafood-production.csv,Fish and Seafood Production,Fish and seafood production is measured as the...,Food and Agriculture Organization of the Unite...
3,number-of-people-living-in-extreme-poverty.csv,"Number of people living in extreme poverty, 2024",Extreme poverty is defined as living below the...,World Bank Poverty and Inequality Platform (2025)
4,annual-co2-emissions-per-country.csv,Annual CO? emissions by country,Annual total emissions of carbon dioxide (CO?)...,Global Carbon Budget (2024)


In [3]:
# Load the historical population dataset and preview its first five rows.
population_data = pd.read_csv(filepath_or_buffer="../data/population-since-10000bc.csv")
population_data.head(n=5)

Unnamed: 0,Entity,Code,Year,Population (historical)
0,Afghanistan,AFG,-10000,14737
1,Afghanistan,AFG,-9000,20405
2,Afghanistan,AFG,-8000,28253
3,Afghanistan,AFG,-7000,39120
4,Afghanistan,AFG,-6000,54166


In [7]:
column = 'Population (historical)'

# Count how often each leading character occurs in the values' string form,
# ordered by character, then show the counts as shares of the total.
first_digits = (
    population_data[column]
    .astype(str)
    .str[0]
    .value_counts()
    .sort_index()
)
first_digits.div(first_digits.sum())

Population (historical)
1    0.283580
2    0.186296
3    0.133062
4    0.101953
5    0.080914
6    0.065427
7    0.054933
8    0.048769
9    0.045067
Name: count, dtype: float64

In [8]:
# Same tally as for the leading character, but on the final character of each
# value's string form; shown as shares of the total.
last_digits = (
    population_data[column]
    .astype(str)
    .str[-1]
    .value_counts()
    .sort_index()
)
last_digits.div(last_digits.sum())

Population (historical)
0    0.138597
1    0.094855
2    0.094583
3    0.097300
4    0.097521
5    0.094176
6    0.096060
7    0.093870
8    0.096485
9    0.096553
Name: count, dtype: float64

In [59]:
# Threshold (in orders of magnitude) against which a column's magnitude range
# is compared when judging whether Benford analysis is worthwhile.
MIN_ORDERS_OF_MAGNITUDE = 2

def check_magnitude_range(series):
    """Return how many orders of magnitude the non-zero values of ``series`` span.

    The span is ``log10(largest |value|) - log10(smallest |value|)``, computed
    over non-zero magnitudes only. Zeros are excluded because ``log10(0)`` is
    ``-inf``, which would make the span infinite (and emit a RuntimeWarning)
    for any series containing a single zero. Missing values are ignored.

    Parameters
    ----------
    series : pandas.Series
        Numeric values; the sign is ignored.

    Returns
    -------
    float
        The base-10 logarithmic span, or NaN when the series holds no
        non-zero values (including when it is empty).
    """
    magnitudes = series.abs()
    # `> 0` is False for both zeros and NaN, so one mask removes both.
    magnitudes = magnitudes[magnitudes > 0]

    if magnitudes.empty:
        return np.nan

    magnitude_range = np.log10(magnitudes.max()) - np.log10(magnitudes.min())

    return magnitude_range

# Report the magnitude range of the population column selected above.
print(check_magnitude_range(series=population_data[column]))

9.908041643529613


In [97]:

def _digit_strings(values):
    """Return each value's decimal string, without a spurious ``.0`` suffix.

    A float column whose values are all whole numbers (for example an integer
    column that pandas stored as float64) is cast to int64 first, so that
    ``1957.0`` is rendered as ``'1957'`` rather than ``'1957.0'``.
    """
    if pd.api.types.is_float_dtype(values):
        all_whole = (values % 1 == 0).all()
        # Guard the cast: values at or beyond 2**63 do not fit in int64.
        if all_whole and values.max() < float(2 ** 63):
            values = values.astype('int64')
    return values.astype(str)


def get_dataset_stats(dataset):
    """Print and display first/last digit diagnostics for a CSV's numeric columns.

    For every numeric column of ``../data/<dataset>`` this prints the column's
    magnitude range (via ``check_magnitude_range``) with a verdict relative to
    ``MIN_ORDERS_OF_MAGNITUDE``, then displays a table comparing the observed
    first-digit shares with ``benford_first_digit`` alongside the observed
    last-digit shares.

    Only finite, non-zero magnitudes are analysed; the sign is ignored. For
    values that are not whole numbers the "last digit" is the final digit of
    the value's string mantissa, which is of limited meaning.

    Parameters
    ----------
    dataset : str
        File name of the CSV, relative to ``../data/``.

    Returns
    -------
    None
        All results are printed or displayed.
    """
    df = pd.read_csv(f'../data/{dataset}')

    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    print(f'Found {len(numerical_cols)} numerical columns in {dataset}: {numerical_cols} \n\n')

    # The reference distribution does not depend on the column, so build it once.
    # Its index is converted to strings to align with the string digit tallies.
    benford_first_digit_series = pd.Series(benford_first_digit)
    benford_first_digit_series.index = benford_first_digit_series.index.astype(str)

    print('COLUMN-WISE ANALYSIS:')
    for col in numerical_cols:
        print('\n----------------------------------\n')

        col_data = df[col].abs()
        # Filter with a boolean mask instead of replace(0, NaN): the mask keeps
        # the original dtype, whereas inserting NaN turns an integer column
        # into floats and makes every value's string end in ".0".
        # The mask also removes NaN and infinities, which have no digits.
        col_data = col_data[np.isfinite(col_data) & (col_data > 0)]

        if col_data.empty:
            print(f'Column "{col}" has no finite non-zero values; skipping digit analysis.')
            print('\n----------------------------------\n')
            continue

        magnitude_range = np.round(check_magnitude_range(col_data), 2)

        print(f'Column "{col}" has a magnitude range of {magnitude_range}.')

        if magnitude_range == MIN_ORDERS_OF_MAGNITUDE:
            print(f'Since magnitude range({magnitude_range}) = {MIN_ORDERS_OF_MAGNITUDE}, the data should start to show Benford-like properties.')

        elif magnitude_range > MIN_ORDERS_OF_MAGNITUDE:
            print(f'Since magnitude range({magnitude_range}) > {MIN_ORDERS_OF_MAGNITUDE}, it is a good candidate for Benford analysis.')

        else:
            print(f'Since magnitude range({magnitude_range}) < {MIN_ORDERS_OF_MAGNITUDE}, "{col}" is unlikely to follow Benford\'s Law.')

        print('\n----------------------------------\n')

        digit_strings = _digit_strings(col_data)

        # Skip leading zeros and the decimal point so 0.05 counts as a leading 5.
        first_digits = digit_strings.str.lstrip('0.').str[0].value_counts().sort_index()
        first_digits_ratio = first_digits / first_digits.sum()

        # Drop any exponent ("1e-05") so the last digit comes from the mantissa.
        mantissas = digit_strings.str.split('e').str[0]
        last_digits = mantissas.str[-1].value_counts().sort_index()
        last_digits_ratio = last_digits / last_digits.sum()

        results = pd.DataFrame({
            'First Digit Ratio': first_digits_ratio,
            'Expected Benford First Digit Ratio': benford_first_digit_series,
            'Last Digit Ratio': last_digits_ratio,
        })

        # print, not display: display() on a str renders its quoted repr.
        print(f'Distribution of first and last digits for column "{col}":')
        # Digits absent from a tally show up as NaN after alignment; report them as 0.
        display(results.fillna(0).round(4))

    
    

In [98]:
# Run the column-wise digit analysis on the space-launch dataset.
get_dataset_stats(dataset='yearly-number-of-objects-launched-into-outer-space.csv')

Found 2 numerical columns in yearly-number-of-objects-launched-into-outer-space.csv: ['Year', 'Annual number of objects launched into outer space'] 


COLUMN-WISE ANALYSIS:

----------------------------------

Column "Year" has a magnitude range of 0.01.
Since magnitude range(0.01) < 2, "Year" is unlikely to follow Benford's Law.

----------------------------------



'Distribution of first and last digits for column "Year":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.1081
1,0.5403,0.301,0.11
2,0.4597,0.1761,0.1061
3,0.0,0.1249,0.0982
4,0.0,0.0969,0.11
5,0.0,0.0792,0.0923
6,0.0,0.0669,0.0806
7,0.0,0.058,0.0884
8,0.0,0.0512,0.1022
9,0.0,0.0458,0.1041



----------------------------------

Column "Annual number of objects launched into outer space" has a magnitude range of 3.46.
Since magnitude range(3.46) > 2, it is a good candidate for Benford analysis.

----------------------------------



'Distribution of first and last digits for column "Annual number of objects launched into outer space":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.0,0.0747
1,0.3635,0.301,0.2004
2,0.2495,0.1761,0.1788
3,0.1297,0.1249,0.1002
4,0.0668,0.0969,0.1139
5,0.0393,0.0792,0.0786
6,0.0275,0.0669,0.0668
7,0.053,0.058,0.0589
8,0.0255,0.0512,0.057
9,0.0452,0.0458,0.0707
