In [20]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare
import re
    

In [21]:
# Load sample dataset
population_filename = "../data/population-since-10000bc.csv"
population_data = pd.read_csv(population_filename)
popaulation_column = 'Population (historical)' # column with numerical data

population_data.head()

Unnamed: 0,Entity,Code,Year,Population (historical)
0,Afghanistan,AFG,-10000,14737
1,Afghanistan,AFG,-9000,20405
2,Afghanistan,AFG,-8000,28253
3,Afghanistan,AFG,-7000,39120
4,Afghanistan,AFG,-6000,54166


In [22]:
MIN_ORDERS_OF_MAGNITUDE = 2

def check_magnitude_range(series):
    p1 = series.abs().quantile(0.01)
    p99 = series.abs().quantile(0.99)
    
    
    magnitude_range = np.log10(p99) - np.log10(p1)
    
    return magnitude_range

print(check_magnitude_range(population_data[popaulation_column]))

6.482972250046383


In [23]:
benford_probs = {
    d: np.log10(1 + 1/d) for d in range(1, 10)
}

benford_first_digits = list(benford_probs.values())
benford_last_digits = [0.1] * 10

def test_goodness_of_fit_chisquare(observed_counts, expected, label='first'):
    print(f"\nTesting goodness of fit for {label} digits:")

    # Example observed data: digit counts from your dataset
    observed_counts = np.array(observed_counts)  # replace with your counts
    n = observed_counts.sum()

    # Expected counts under Benford's law
    expected_counts = np.array([p * n for p in expected])
    print(f"observed counts for {label} digits:", observed_counts)
    print(f"expected counts for {label} digits:", expected_counts)

    # Chi-square goodness of fit test
    chi2_stat, p_value = chisquare(f_obs=observed_counts, f_exp=expected_counts)

    print(f"\nChi-square goodness of fit test for {label} digits:")
    print("Chi-square statistic:", chi2_stat)
    print("p-value:", p_value)

    if p_value > 0.05:
        print(f"Fail to reject null: data follows Benford's law (at 5% significance) for {label} digits.")
    else:
        print(f"Reject null: data does not follow Benford's law for {label} digits.")
        
    return chi2_stat, p_value

def test_goodness_of_fit_mad(observed_counts, expected, label='first'):
    print(f"\nTesting goodness of fit (MAD) for {label} digits:")
    observed_counts = np.array(observed_counts)

    # Convert observed counts to proportions
    observed_probs = observed_counts / observed_counts.sum()
    print(f"observed proportions for {label} digits:", observed_probs)
    print(f"expected proportions for {label} digits:", expected)

    # Mean Absolute Deviation
    mad = np.mean(np.abs(observed_probs - expected))
    
    category = ''

    # Classify conformity (per Nigrini's thresholds)
    if mad < 0.006:
        print(f"{label.capitalize()} digits show Close conformity with Benford's Law")
        category = 'Close conformity'
    elif mad < 0.012:
        print(f"{label.capitalize()} digits show Acceptable conformity with Benford's Law")
        category = 'Acceptable conformity'
    else:
        print(f"{label.capitalize()} digits show Non-conformity with Benford's Law")
        category = 'Non-conformity'
        
    return mad, category


In [24]:
def get_dataset_stats(dataset, json_data = [], dataset_metadata=None, print_output=True):
    
    df = None
    
    if len(json_data) > 0:
        df = json_data
    else:
        df = pd.read_csv(f'../data/{dataset}')
    
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    
    print(f'Found {len(numerical_cols)} numerical columns in {dataset}: {numerical_cols} \n\n')
    
    result_list = []
    
    print('COLUMN-WISE ANALYSIS:')
    for col in numerical_cols:
        print('----------------------------------\n')
        
        
        col_data = df[col].abs()
        col_data = col_data.replace(0, np.nan).dropna()
        
        magnitude_range = np.round(check_magnitude_range(col_data), 2)
        data_range_min = col_data.min()
        data_range_max = col_data.max() 
        
        print(f'Column "{col}" has a magnitude range of {magnitude_range}.')

        if magnitude_range == MIN_ORDERS_OF_MAGNITUDE:
            print(f'Since magnitude range({magnitude_range} = 2), the data should start to show Benford-like properties.')
            
        elif (magnitude_range > MIN_ORDERS_OF_MAGNITUDE):
            print(f'Since magnitude range({magnitude_range}) > 2, it is a good candidate for Benford analysis.')

        else:
            print(f'Since magnitude range({magnitude_range}) < 2, "{col}" is unlikely to follow Benford\'s Law.')
            
        benfords_ten_digits = [0] + benford_first_digits

        # Extract first and last digits
        first_digits = col_data.apply(extract_first_digit).value_counts().sort_index()
        # Ensure digits 1-9 are present (missing digits get count 0)
        first_digits = first_digits.reindex(range(1, 10), fill_value=0)
        print('first_digits:', first_digits)
        first_digits_ratio = first_digits / first_digits.sum()

        last_digits = col_data.apply(extract_last_digit).value_counts().sort_index()
        last_digits = last_digits.reindex(range(0, 10), fill_value=0)
        print('last_digits:', last_digits)
        last_digits_ratio = last_digits / last_digits.sum()
        
        benford_first_digit_series = pd.Series(benford_first_digits)
        benford_first_digit_series.index = benford_first_digit_series.index
        
        
        results = pd.DataFrame({
            'First Digit Ratio': first_digits_ratio,
            'Expected Benford First Digit Ratio': benford_first_digit_series,
            'Last Digit Ratio': last_digits_ratio,
        })


        if print_output:
            display(f'Distribution of first and last digits for column "{col}":')
            display(results.fillna(0).round(4))
        
        # chisquare_first, pvalue_first = test_goodness_of_fit_chisquare(first_digits.values.tolist(), benford_first_digits, label='first')
        # chisquare_last, pvalue_last = test_goodness_of_fit_chisquare(last_digits.values.tolist(), benford_last_digits, label='last')
        
        mad_first, mad_first_category = test_goodness_of_fit_mad(first_digits.values.tolist(), benford_first_digits, label='first')
        mad_last, mad_last_category = test_goodness_of_fit_mad(last_digits.values.tolist(), benford_last_digits, label='last')
        
        final_result = {
            'dataset': dataset,
            'column': col,
            'data_range_min': data_range_min,
            'data_range_max': data_range_max,
            'magnitude_range': magnitude_range,
            'mad_first': mad_first,
            'mad_first_category': mad_first_category,
            'mad_last': mad_last,
            'mad_last_category': mad_last_category,
        }
        
        first_digits_ratio_cols = {f"first_digit_{i}": v for i, v in first_digits_ratio.items()}
        first_digits_counts = {f"first_digit_count_{i}": v for i, v in first_digits.items()}
        last_digits_ratio_cols = {f"last_digit_{i}": v for i, v in last_digits_ratio.items()}
        
        final_result.update(first_digits_ratio_cols)
        final_result.update(first_digits_counts)
        final_result.update(last_digits_ratio_cols)
        
        
        if dataset_metadata is not None:
            final_result['title'] = dataset_metadata.get('title', '')
            final_result['source'] = dataset_metadata.get('source', '')
            final_result['description'] = dataset_metadata.get('description', '')
            final_result['url'] = dataset_metadata.get('url', '')
        
        result_list.append(final_result)
        
    return result_list


def extract_first_digit(x):
    s = str(abs(x))  # remove sign
    s = re.sub(r"[^0-9]", "", s)  # keep only digits
    for ch in s:
        if ch != "0":
            return int(ch)
    return None


def extract_last_digit(x):
    """Extract last digit from a number, ignoring trailing .0 if present."""
    try:
        s = str(x).strip()
        
        # If the number ends with ".0", drop it
        if s.endswith(".0"):
            s = s[:-2]
        
        s = s.replace(".", "").replace("-", "")  # remove dot and minus
        return int(s[-1]) if s else None
    
    except Exception:
        return None
    
temp = get_dataset_stats(population_filename, print_output=False)

Found 2 numerical columns in ../data/population-since-10000bc.csv: ['Year', 'Population (historical)'] 


COLUMN-WISE ANALYSIS:
----------------------------------

Column "Year" has a magnitude range of 1.37.
Since magnitude range(1.37) < 2, "Year" is unlikely to follow Benford's Law.
first_digits: Year
1    49402
2     6576
3      387
4      387
5      388
6      388
7      389
8      388
9      388
Name: count, dtype: int64
last_digits: Year
0    12968
1     5278
2     5249
3     5257
4     4988
5     4992
6     4997
7     4988
8     4988
9     4988
Name: count, dtype: int64

Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.84170174 0.11204062 0.00659363 0.00659363 0.00661067 0.00661067
 0.00662771 0.00661067 0.00661067]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.09691001300805642, 0.07918124604762482, 0.06694678963061322, 0.05799194697768673, 0.05115252244738129, 0.04575749056067514

In [25]:
# get_dataset_stats('population-since-10000bc.csv')

In [26]:
datasets = pd.read_csv('../data/exploration-datasets/datasets_overview.csv')
display(datasets.head())

def test_runner(datasets=datasets):
    overview_stats = []
    
    for idx, row in datasets.iterrows():
        dataset_name = row['filename']
        print(f'Processing dataset: {dataset_name}')
        dataset_stats = get_dataset_stats("exploration-datasets/"+dataset_name, dataset_metadata=row, print_output=False)
        overview_stats.extend(dataset_stats)
        
    return overview_stats

Unnamed: 0,filename,title,description,source,num_rows,url,date_downloaded
0,share-of-population-in-extreme-poverty.csv,Share of population living in extreme poverty,Percentage of population living in households ...,World Bank Poverty and Inequality Platform (2025),2743,https://ourworldindata.org/grapher/share-of-po...,2025-09-21
1,gdp-per-capita-maddison-project-database.csv,GDP per capita,Average economic output per person in a countr...,Bolt and van Zanden – Maddison Project Databas...,21586,https://ourworldindata.org/grapher/gdp-per-cap...,2025-09-21
2,distribution-of-population-poverty-thresholds.csv,Distribution of population between different p...,Number of people living in households with an ...,World Bank Poverty and Inequality Platform (2025),2743,https://ourworldindata.org/grapher/distributio...,2025-09-21
3,human-development-index.csv,Human Development Index,The Human Development Index (HDI) is a summary...,"UNDP, Human Development Report (2025)",6683,https://ourworldindata.org/grapher/human-devel...,2025-09-21
4,prevalence-of-undernourishment.csv,Share of people who are undernourished,Share of the population whose daily food intak...,Food and Agriculture Organization of the Unite...,4683,https://ourworldindata.org/grapher/prevalence-...,2025-09-21


In [27]:
stats = test_runner(datasets)
stats_df = pd.DataFrame(stats)
# stats_df.to_csv('../data/outputs/columns_overview.csv', index=False)

# prep for json export
first_digit_cols = [f"first_digit_{i}" for i in range(1, 10)]

first_digit_count_cols = [f"first_digit_count_{i}" for i in range(1, 10)]

last_digit_cols = [f"last_digit_{i}" for i in range(0, 10)]


# work on a copy so original stats_df is unchanged
stats_json_df = stats_df.copy()


stats_json_df["first_digits_proportions"] = stats_json_df[first_digit_cols].values.tolist()
stats_json_df["last_digits_proportions"] = stats_json_df[last_digit_cols].values.tolist()
stats_json_df["first_digits_counts"] = stats_json_df[first_digit_count_cols].values.tolist()

stats_json_df = stats_json_df.drop(columns=first_digit_cols + first_digit_count_cols + last_digit_cols)

# stats_json_df.to_json("../data/outputs/columns_overview.json", orient="records", indent=2)

Processing dataset: share-of-population-in-extreme-poverty.csv
Found 2 numerical columns in exploration-datasets/share-of-population-in-extreme-poverty.csv: ['Year', 'Share of population in poverty ($3 a day, 2021 prices) - Income or consumption consolidated'] 


COLUMN-WISE ANALYSIS:
----------------------------------

Column "Year" has a magnitude range of 0.01.
Since magnitude range(0.01) < 2, "Year" is unlikely to follow Benford's Law.
first_digits: Year
1     675
2    2068
3       0
4       0
5       0
6       0
7       0
8       0
9       0
Name: count, dtype: int64
last_digits: Year
0    291
1    293
2    306
3    256
4    253
5    281
6    262
7    257
8    273
9    271
Name: count, dtype: int64

Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.24608093 0.75391907 0.         0.         0.         0.
 0.         0.         0.        ]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.0

  observed_probs = observed_counts / observed_counts.sum()
  observed_probs = observed_counts / observed_counts.sum()


In [28]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_numeric_distribution(filename, column, bins=50, figsize=(10, 6), log_y=False, kde=False):
    """
    Read CSV by filename and display a distribution plot for `column` using seaborn.
    - filename: path or filename (will try given path, else ../data/{filename}).
    - column: column name to plot.
    - bins: number of bins for histogram when many unique values.
    - figsize: matplotlib figure size.
    - log_y: use log scale on y axis when True.
    - kde: if True and histogram is used, overlay a KDE.
    """
    # load dataframe
    try:
        df = pd.read_csv(filename)
    except Exception:
        df = pd.read_csv(f'../data/{filename}')

    if column not in df.columns:
        raise ValueError(f'Column "{column}" not found in dataset')

    # coerce to numeric and drop NaNs
    series = pd.to_numeric(df[column], errors='coerce').dropna()
    if series.empty:
        raise ValueError(f'Column "{column}" contains no numeric data after coercion')

    plt.figure(figsize=figsize)

    if series.nunique() <= 30:
        counts = series.value_counts().sort_index()
        # seaborn likes DataFrame/arrays; use barplot for consistent styling
        sns.barplot(x=counts.index.astype(str), y=counts.values, color='C0', edgecolor='k')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
    else:
        # histogram with optional KDE overlay
        sns.histplot(series, bins=bins, color='C0', edgecolor='k', kde=kde)
        plt.xlabel(column)
        plt.ylabel('Frequency')

    if log_y:
        plt.yscale('log')

    plt.title(f'Distribution of "{column}"')
    plt.tight_layout()
    plt.show()


In [29]:
stats_df[stats_df['magnitude_range'] > 3]

Unnamed: 0,dataset,column,data_range_min,data_range_max,magnitude_range,mad_first,mad_first_category,mad_last,mad_last_category,first_digit_1,...,last_digit_4,last_digit_5,last_digit_6,last_digit_7,last_digit_8,last_digit_9,title,source,description,url
1,exploration-datasets/share-of-population-in-ex...,"Share of population in poverty ($3 a day, 2021...",0.003731,98.95,3.6,0.016376,Non-conformity,0.030598,Non-conformity,0.245465,...,0.12582,0.149363,0.121575,0.132767,0.100347,0.074103,Share of population living in extreme poverty,World Bank Poverty and Inequality Platform (2025),Percentage of population living in households ...,https://ourworldindata.org/grapher/share-of-po...
5,exploration-datasets/distribution-of-populatio...,Number of people not in poverty (above $10 a d...,2310.0,3901470000.0,4.75,0.015554,Non-conformity,0.003425,Close conformity,0.263081,...,0.098427,0.093304,0.098427,0.104647,0.100988,0.109769,Distribution of population between different p...,World Bank Poverty and Inequality Platform (2025),Number of people living in households with an ...,https://ourworldindata.org/grapher/distributio...
6,exploration-datasets/distribution-of-populatio...,Number of people in poverty (between $8.30 and...,29.0,565308300.0,5.92,0.006225,Acceptable conformity,0.005467,Close conformity,0.300971,...,0.085885,0.100822,0.104929,0.104182,0.095967,0.0941,Distribution of population between different p...,World Bank Poverty and Inequality Platform (2025),Number of people living in households with an ...,https://ourworldindata.org/grapher/distributio...
7,exploration-datasets/distribution-of-populatio...,Number of people in poverty (between $4.20 and...,50.0,2252155000.0,6.34,0.010343,Acceptable conformity,0.00329,Close conformity,0.343785,...,0.103652,0.09443,0.100332,0.101439,0.099963,0.111029,Distribution of population between different p...,World Bank Poverty and Inequality Platform (2025),Number of people living in households with an ...,https://ourworldindata.org/grapher/distributio...
8,exploration-datasets/distribution-of-populatio...,Number of people in poverty (between $3 and $4...,3.0,1042967000.0,6.63,0.008863,Acceptable conformity,0.004037,Close conformity,0.286849,...,0.104092,0.098927,0.1029,0.096146,0.092968,0.102503,Distribution of population between different p...,World Bank Poverty and Inequality Platform (2025),Number of people living in households with an ...,https://ourworldindata.org/grapher/distributio...
9,exploration-datasets/distribution-of-populatio...,"Number of people in poverty ($3 a day, 2021 pr...",18.0,2346264000.0,6.73,0.008245,Acceptable conformity,0.004886,Close conformity,0.309533,...,0.101891,0.094172,0.089927,0.09147,0.103049,0.101891,Distribution of population between different p...,World Bank Poverty and Inequality Platform (2025),Number of people living in households with an ...,https://ourworldindata.org/grapher/distributio...
22,exploration-datasets/population-with-un-projec...,Population - Sex: all - Age: all - Variant: es...,489.0,8091735000.0,6.21,0.008735,Acceptable conformity,0.002274,Close conformity,0.270904,...,0.099609,0.095386,0.09924,0.098184,0.101035,0.102196,Population,"UN, World Population Prospects (2024)","De facto total population in a country, area o...",https://ourworldindata.org/grapher/population-...
23,exploration-datasets/population-with-un-projec...,Population - Sex: all - Age: all - Variant: me...,497.0,10289320000.0,6.45,0.015848,Non-conformity,0.002179,Close conformity,0.278105,...,0.101613,0.098366,0.100802,0.09654,0.102526,0.096794,Population,"UN, World Population Prospects (2024)","De facto total population in a country, area o...",https://ourworldindata.org/grapher/population-...
25,exploration-datasets/population.csv,Population (historical),1.0,8091735000.0,6.48,0.005578,Close conformity,0.007719,Acceptable conformity,0.28358,...,0.097521,0.094176,0.09606,0.09387,0.096485,0.096553,Population,HYDE (2023); Gapminder (2022); UN WPP (2024),"Population by country, available from 10,000 B...",https://ourworldindata.org/grapher/population
32,exploration-datasets/co-emissions-per-capita.csv,Annual COâ emissions (per capita),2e-06,796.3908,5.0,0.007335,Acceptable conformity,0.031571,Non-conformity,0.288861,...,0.134386,0.142166,0.134696,0.123549,0.100054,0.072651,CO₂ emissions per capita,Global Carbon Budget (2024); Population based ...,Carbon dioxide (CO₂) emissions from [burning f...,https://ourworldindata.org/grapher/co-emission...


In [32]:
import json
from pathlib import Path

random_data = [{
    "path": "../data/exploration-datasets/deaths-in-armed-conflicts-by-region.csv",
    "col": "Deaths in ongoing conflicts (best estimate) - Conflict type: all"
    }, {
    "path": "../data/exploration-datasets/electricity-prod-source-stacked.csv",
    "col": "Electricity from gas - TWh (adapted for visualization of chart electricity-prod-source-stacked)"
    },{
    "path": "../data/exploration-datasets/population.csv",
    "col": "Population (historical)"
    },        
]

current_dataset = random_data[1]

random_df = pd.read_csv(current_dataset["path"])

col_name = current_dataset['col']

# Filter rows where 'col' is non-zero
filtered_df = random_df[random_df[col_name] != 0 ]

# Pick N random rows (example: N = 10)
N = 20
sample_df = filtered_df.sample(n=N, random_state=42)

# Convert the sampled rows to JSON
json_output = sample_df.to_json(orient="records")

# Optionally, save the JSON to a file
# sanitize filename (keep letters, numbers, dot, underscore, hyphen)
safe_name = re.sub(r'[^A-Za-z0-9_.-]', '_', col_name).strip('_')
out_dir = Path("../data/outputs")
out_dir.mkdir(parents=True, exist_ok=True)

out_path = out_dir / f"{safe_name}.json"
with out_path.open("w", encoding="utf-8") as f:
    json.dump(sample_df.to_dict(orient="records"), f, indent=2)

print("JSON created successfully.")
print(json_output)

JSON created successfully.
[{"Entity":"Angola","Code":"AGO","Year":2021,"Other renewables excluding bioenergy - TWh (adapted for visualization of chart electricity-prod-source-stacked)":0.0,"Electricity from bioenergy - TWh (adapted for visualization of chart electricity-prod-source-stacked)":0.05,"Electricity from solar - TWh (adapted for visualization of chart electricity-prod-source-stacked)":0.02,"Electricity from wind - TWh (adapted for visualization of chart electricity-prod-source-stacked)":0.0,"Electricity from hydro - TWh (adapted for visualization of chart electricity-prod-source-stacked)":12.64,"Electricity from nuclear - TWh (adapted for visualization of chart electricity-prod-source-stacked)":0.0,"Electricity from oil - TWh (adapted for visualization of chart electricity-prod-source-stacked)":2.48,"Electricity from gas - TWh (adapted for visualization of chart electricity-prod-source-stacked)":1.66,"Electricity from coal - TWh (adapted for visualization of chart electricit

In [31]:
rects = pd.read_json('../data/site-rects.json')
rects = rects[rects['area'] != 0]
rects
dataset_stats = get_dataset_stats('../data/site-rects.json',json_data = rects, dataset_metadata=None, print_output=True)


Found 6 numerical columns in ../data/site-rects.json: ['index', 'width', 'height', 'area', 'x', 'y'] 


COLUMN-WISE ANALYSIS:
----------------------------------

Column "index" has a magnitude range of 1.85.
Since magnitude range(1.85) < 2, "index" is unlikely to follow Benford's Law.
first_digits: index
1    1095
2    1102
3    1100
4     142
5     110
6     110
7     109
8     105
9     110
Name: count, dtype: int64
last_digits: index
0    401
1    396
2    401
3    394
4    399
5    398
6    401
7    393
8    402
9    398
Name: count, dtype: int64


'Distribution of first and last digits for column "index":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.301,0.1007
1,0.2749,0.1761,0.0994
2,0.2767,0.1249,0.1007
3,0.2762,0.0969,0.0989
4,0.0357,0.0792,0.1002
5,0.0276,0.0669,0.0999
6,0.0276,0.058,0.1007
7,0.0274,0.0512,0.0987
8,0.0264,0.0458,0.1009
9,0.0276,0.0,0.0999



Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.2749184  0.27667587 0.27617374 0.03565152 0.02761737 0.02761737
 0.02736631 0.02636204 0.02761737]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.09691001300805642, 0.07918124604762482, 0.06694678963061322, 0.05799194697768673, 0.05115252244738129, 0.04575749056067514]
First digits show Non-conformity with Benford's Law

Testing goodness of fit (MAD) for last digits:
observed proportions for last digits: [0.10067788 0.09942255 0.10067788 0.09892041 0.10017575 0.09992468
 0.10067788 0.09866934 0.10092895 0.09992468]
expected proportions for last digits: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
Last digits show Close conformity with Benford's Law
----------------------------------

Column "width" has a magnitude range of 2.1.
Since magnitude range(2.1) > 2, it is a good candidate for Benford analysis.
first_digits: width
1     708
2

'Distribution of first and last digits for column "width":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.301,0.0371
1,0.1777,0.1761,0.0233
2,0.622,0.1249,0.0324
3,0.0379,0.0969,0.0457
4,0.0181,0.0792,0.0075
5,0.005,0.0669,0.8198
6,0.01,0.058,0.005
7,0.0339,0.0512,0.0093
8,0.0256,0.0458,0.0181
9,0.0698,0.0,0.0018



Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.17771084 0.62198795 0.03790161 0.01807229 0.00502008 0.01004016
 0.03388554 0.02560241 0.06977912]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.09691001300805642, 0.07918124604762482, 0.06694678963061322, 0.05799194697768673, 0.05115252244738129, 0.04575749056067514]
First digits show Non-conformity with Benford's Law

Testing goodness of fit (MAD) for last digits:
observed proportions for last digits: [0.03714859 0.02334337 0.03237952 0.04568273 0.00753012 0.81977912
 0.00502008 0.00928715 0.01807229 0.00175703]
expected proportions for last digits: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
Last digits show Non-conformity with Benford's Law
----------------------------------

Column "height" has a magnitude range of 1.66.
Since magnitude range(1.66) < 2, "height" is unlikely to follow Benford's Law.
first_digits: height
1    275

'Distribution of first and last digits for column "height":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.301,0.0555
1,0.6918,0.1761,0.0198
2,0.1057,0.1249,0.0304
3,0.0748,0.0969,0.0038
4,0.0527,0.0792,0.0128
5,0.0113,0.0669,0.2475
6,0.0211,0.058,0.4977
7,0.0238,0.0512,0.0033
8,0.0128,0.0458,0.1255
9,0.006,0.0,0.0038



Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.69176707 0.10567269 0.0747992  0.05271084 0.01129518 0.02108434
 0.02384538 0.0128012  0.0060241 ]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.09691001300805642, 0.07918124604762482, 0.06694678963061322, 0.05799194697768673, 0.05115252244738129, 0.04575749056067514]
First digits show Non-conformity with Benford's Law

Testing goodness of fit (MAD) for last digits:
observed proportions for last digits: [0.05547189 0.01982932 0.03037149 0.00376506 0.0128012  0.24748996
 0.49774096 0.00326305 0.12550201 0.00376506]
expected proportions for last digits: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
Last digits show Non-conformity with Benford's Law
----------------------------------

Column "area" has a magnitude range of 3.3.
Since magnitude range(3.3) > 2, it is a good candidate for Benford analysis.
first_digits: area
1    1204
2    

'Distribution of first and last digits for column "area":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.301,0.0771
1,0.3022,0.1761,0.0279
2,0.0969,0.1249,0.0377
3,0.1343,0.0969,0.0314
4,0.3321,0.0792,0.0321
5,0.0128,0.0669,0.6398
6,0.0469,0.058,0.0572
7,0.0387,0.0512,0.0196
8,0.012,0.0458,0.055
9,0.0241,0.0,0.0223



Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.30220884 0.09688755 0.13428715 0.33207831 0.0128012  0.04693775
 0.03865462 0.01204819 0.02409639]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.09691001300805642, 0.07918124604762482, 0.06694678963061322, 0.05799194697768673, 0.05115252244738129, 0.04575749056067514]
First digits show Non-conformity with Benford's Law

Testing goodness of fit (MAD) for last digits:
observed proportions for last digits: [0.07705823 0.02786145 0.0376506  0.0313755  0.03212851 0.63980924
 0.05722892 0.01957831 0.05496988 0.02233936]
expected proportions for last digits: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
Last digits show Non-conformity with Benford's Law
----------------------------------

Column "x" has a magnitude range of 1.0.
Since magnitude range(1.0) < 2, "x" is unlikely to follow Benford's Law.
first_digits: x
1    1728
2     218
3    

'Distribution of first and last digits for column "x":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.301,0.001
1,0.434,0.1761,0.0023
2,0.0547,0.1249,0.0208
3,0.045,0.0969,0.0023
4,0.0937,0.0792,0.0063
5,0.0673,0.0669,0.8975
6,0.0874,0.058,0.0063
7,0.0723,0.0512,0.0053
8,0.0741,0.0458,0.049
9,0.0716,0.0,0.0093



Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.43395279 0.05474636 0.04495229 0.09367152 0.06730286 0.08739327
 0.07232546 0.07408338 0.07157207]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.09691001300805642, 0.07918124604762482, 0.06694678963061322, 0.05799194697768673, 0.05115252244738129, 0.04575749056067514]
First digits show Non-conformity with Benford's Law

Testing goodness of fit (MAD) for last digits:
observed proportions for last digits: [0.00100452 0.00226017 0.0208438  0.00226017 0.00627825 0.89753893
 0.00627825 0.00527373 0.04897037 0.00929181]
expected proportions for last digits: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
Last digits show Non-conformity with Benford's Law
----------------------------------

Column "y" has a magnitude range of 2.44.
Since magnitude range(2.44) > 2, it is a good candidate for Benford analysis.
first_digits: y
1     298
2     460

'Distribution of first and last digits for column "y":'

Unnamed: 0,First Digit Ratio,Expected Benford First Digit Ratio,Last Digit Ratio
0,0.0,0.301,0.0
1,0.0748,0.1761,0.004
2,0.1155,0.1249,0.004
3,0.1082,0.0969,0.0003
4,0.06,0.0792,0.0043
5,0.363,0.0669,0.9761
6,0.2217,0.058,0.0025
7,0.0404,0.0512,0.0015
8,0.0138,0.0458,0.0025
9,0.0025,0.0,0.0048



Testing goodness of fit (MAD) for first digits:
observed proportions for first digits: [0.07481798 0.11549084 0.10820989 0.06000502 0.36304293 0.22169219
 0.04042179 0.01380869 0.00251067]
expected proportions for first digits: [0.3010299956639812, 0.17609125905568124, 0.12493873660829993, 0.09691001300805642, 0.07918124604762482, 0.06694678963061322, 0.05799194697768673, 0.05115252244738129, 0.04575749056067514]
First digits show Non-conformity with Benford's Law

Testing goodness of fit (MAD) for last digits:
observed proportions for last digits: [0.00000000e+00 4.01707256e-03 4.01707256e-03 2.51067035e-04
 4.26813959e-03 9.76148632e-01 2.51067035e-03 1.50640221e-03
 2.51067035e-03 4.77027366e-03]
expected proportions for last digits: [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
Last digits show Non-conformity with Benford's Law
