In [9]:
import pandas as pd
import numpy as np
import joblib  # For saving and loading the model
from scipy.stats import chisquare
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Load the two CSV exports that are compared below: the manipulated copy
# and the original copy of the mtcars data.
dfM = pd.read_csv("mtcars_manipulated.csv")
dfO = pd.read_csv("mtcars_original.csv")

# Keep only the relevant variables (mpg, hp, wt) from each frame; these are
# the columns handed to the classifiers further down.
X_M = dfM.loc[:, ['mpg', 'hp', 'wt']]
X_O = dfO.loc[:, ['mpg', 'hp', 'wt']]


In [11]:
# Random Forest Classifier
# -----------------------------
# Restore the pre-trained Random Forest model from disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load a myRFmodel.pkl that comes from a trusted source.
model = joblib.load("myRFmodel.pkl")

# -----------------------------
# Row-level predictions for both datasets
# -----------------------------
classifiedM = model.predict(X_M)
classifiedO = model.predict(X_O)

# -----------------------------
# Dataset-level verdict by majority vote: the mean of the row predictions is
# compared against `threshold`.
# NOTE(review): this presumes the model emits numeric 0/1 labels with
# 1 = manipulated -- confirm against the training notebook.
# -----------------------------
threshold = 0.5
print("=== Random Forest Classification ===\n")

manipulated_ratioM = np.mean(classifiedM)
dataset_classificationM = (
    "Manipulated" if manipulated_ratioM >= threshold else "Not Manipulated"
)
print(f"mtcars_manipulated.csv classified as: {dataset_classificationM}")

manipulated_ratioO = np.mean(classifiedO)
dataset_classificationO = (
    "Manipulated" if manipulated_ratioO >= threshold else "Not Manipulated"
)
print(f"mtcars_original.csv classified as: {dataset_classificationO}")



=== Random Forest Classification ===

mtcars_manipulated.csv classified as: Manipulated
mtcars_original.csv classified as: Not Manipulated


In [12]:
#Benford functions

def benford_expected_distribution():
    """Return the Benford's-law probabilities for leading digits 1 through 9.

    The probability of leading digit ``d`` is ``log10(1 + 1/d)``.

    Returns:
        numpy.ndarray: nine floats; position 0 holds the probability for
        digit 1 and position 8 the probability for digit 9.
    """
    leading_digits = np.arange(1, 10)
    return np.log10(1 + 1 / leading_digits)

def extract_first_digit(series):
    """Return the first non-zero digit of every non-missing value in ``series``.

    Each value is converted to text, decimal points are removed, and the
    first character in the range 1-9 is taken. Entries that contain no such
    character (for example zero) are dropped from the result.

    Args:
        series: pandas Series of values to inspect.

    Returns:
        pandas.Series: integer digits (1-9), carrying the index labels of the
        entries that were kept.
    """
    as_text = series.dropna().astype(str)
    without_point = as_text.str.replace(".", "", regex=False)
    # extract() yields a one-column DataFrame; column 0 is the captured digit.
    first_nonzero = without_point.str.extract(r'([1-9])')[0]
    return first_nonzero.dropna().astype(int)

def perform_benford_test(series):
    """Chi-square goodness-of-fit test of leading digits against Benford's law.

    The leading digits of ``series`` are tallied for digits 1-9 (digits that
    never occur count as 0) and compared with the Benford proportions scaled
    to the number of tallied values.

    Args:
        series: pandas Series of values to test.

    Returns:
        tuple: ``(chi2, p)`` -- the statistic and p-value reported by
        ``scipy.stats.chisquare``.
    """
    leading = extract_first_digit(series)
    tallies = leading.value_counts().sort_index()
    # Make sure all nine digits are present, in order, even if unobserved.
    observed = tallies.reindex(range(1, 10), fill_value=0).values
    n_values = observed.sum()
    expected = benford_expected_distribution() * n_values
    outcome = chisquare(f_obs=observed, f_exp=expected)
    return outcome[0], outcome[1]

In [13]:
#Benford Classifier

# Run the Benford chi-square test on a single column of each dataset.
variable = 'mpg'
chi2_statM, p_valueM = perform_benford_test(X_M[variable])
chi2_statO, p_valueO = perform_benford_test(X_O[variable])

# -----------------------------
# Classify dataset: find extreme manipulations
#   chi-square cut-off 30.58 with p ≈ 0.0001: detects only the most extreme
#   manipulations. A dataset is flagged only when BOTH conditions hold.
# NOTE(review): perform_benford_test compares 9 digit categories, i.e. 8
# degrees of freedom, where chi2 = 30.58 corresponds to p ≈ 0.00017; the
# p-value cut-off therefore looks like the binding condition -- verify that
# 30.58 is the intended critical value.
# -----------------------------
classificationM = (
    "Manipulated"
    if chi2_statM > 30.58 and p_valueM < 0.0001
    else "Not Manipulated"
)

classificationO = (
    "Manipulated"
    if chi2_statO > 30.58 and p_valueO < 0.0001
    else "Not Manipulated"
)

# -----------------------------
# Output results
# -----------------------------
print("=== Benford Classification ===")

# Report for the manipulated file
print("\nmtcars_manipulated.csv")
print(f"Variable analyzed: {variable}")
print(f"Chi-square statistic: {chi2_statM:.3f}")
print(f"P-value: {p_valueM:.15f}")
print(f"Dataset classified as: {classificationM}")

# Report for the original file
print("\nmtcars_original.csv")
print(f"Chi-square statistic: {chi2_statO:.3f}")
print(f"P-value: {p_valueO:.15f}")
print(f"Dataset classified as: {classificationO}")

=== Benford Classification ===

mtcars_manipulated.csv
Variable analyzed: mpg
Chi-square statistic: 74.302
P-value: 0.000000000000681
Dataset classified as: Manipulated

mtcars_original.csv
Chi-square statistic: 23.383
P-value: 0.002905835458132
Dataset classified as: Not Manipulated
