In [18]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency

In [19]:
# Configuration
dataset_name = "cais/hle"
hf_split = "test"
domain_col = "category"  # Adjust if needed
test_size = 0.3  # 30% for evaluation
min_count_per_domain = 50  # Minimum samples per domain for sufficiency check


In [20]:

# Load dataset
dataset = load_dataset(dataset_name, split=hf_split)
df = pd.DataFrame(dataset)


In [None]:
import numpy as np
import math

# Z-test function (as previously provided)
def z_test_p_value(sample_mean, population_mean, population_std, sample_size, alternative='two-sided'):
    z = (sample_mean - population_mean) / (population_std / math.sqrt(sample_size))
    if alternative == 'two-sided':
        p_value = 2 * (1 - norm.cdf(abs(z)))
    elif alternative == 'greater':
        p_value = 1 - norm.cdf(z)
    elif alternative == 'less':
        p_value = norm.cdf(z)
    else:
        raise ValueError("Alternative must be one of ['two-sided', 'greater', 'less']")
    
    return p_value

# Function to flag outliers using Z-test
def detect_outliers_z(data, population_mean, population_std, threshold=0.05):
    sample_size = len(data)
    sample_mean = np.mean(data)
    
    outliers = []
    for value in data:
        p_value = z_test_p_value(value, population_mean, population_std, sample_size)
        if p_value <= threshold:  # Flag as outlier if p-value is smaller than threshold
            outliers.append(value)
    
    return outliers

# T-test function (as previously provided)
def t_test_p_value(sample_mean, population_mean, sample_std, sample_size, alternative='two-sided'):
    t_stat = (sample_mean - population_mean) / (sample_std / math.sqrt(sample_size))
    degrees_of_freedom = sample_size - 1
    if alternative == 'two-sided':
        p_value = 2 * (1 - t.cdf(abs(t_stat), df=degrees_of_freedom))
    elif alternative == 'greater':
        p_value = 1 - t.cdf(t_stat, df=degrees_of_freedom)
    elif alternative == 'less':
        p_value = t.cdf(t_stat, df=degrees_of_freedom)
    else:
        raise ValueError("Alternative must be one of ['two-sided', 'greater', 'less']")
    
    return p_value

# Function to flag outliers using T-test
def detect_outliers_t(data, population_mean, sample_std, threshold=0.05):
    sample_size = len(data)
    sample_mean = np.mean(data)
    
    outliers = []
    for value in data:
        p_value = t_test_p_value(value, population_mean, sample_std, sample_size)
        if p_value <= threshold:  # Flag as outlier if p-value is smaller than threshold
            outliers.append(value)
    
    return outliers


In [23]:

# Validate expected column
if domain_col not in df.columns:
  raise ValueError(f"Expected column '{domain_col}' not found in dataset.")

# Overall domain distribution
domain_counts = df[domain_col].value_counts().reset_index()
domain_counts.columns = [domain_col, "count"]

# Create a dictionary to hold train and eval sets for each domain
df_train = pd.DataFrame()
df_eval = pd.DataFrame()

# For each domain, split it such that it appears in both the training and evaluation sets
for domain in df[domain_col].unique():
  # Select all rows for the current domain
  domain_data = df[df[domain_col] == domain]

  # Split the domain data into train and eval sets
  train_data, eval_data = train_test_split(domain_data, test_size=test_size, random_state=42)

  # Append the data to the respective train and eval sets
  df_train = pd.concat([df_train, train_data])
  df_eval = pd.concat([df_eval, eval_data])

# Check that both train and eval sets contain all domains
print("Domains in training set:", df_train[domain_col].unique())
print("Domains in evaluation set:", df_eval[domain_col].unique())

# Training set distribution
train_counts = df_train[domain_col].value_counts().reset_index()
train_counts.columns = [domain_col, "count"]

# Evaluation set distribution
eval_counts = df_eval[domain_col].value_counts().reset_index()
eval_counts.columns = [domain_col, "count"]

# Detailed stats per domain
# Step 1: Compute stats without chi-squared
stats = []

# Iterate over all domains to compute basic stats first
for dom in domain_counts[domain_col]:
    # Get counts for each domain in the train set
    train_n = train_counts[train_counts[domain_col] == dom]["count"].values[0] if dom in train_counts[domain_col].values else 0
    
    # Get counts for each domain in the eval set
    eval_n = eval_counts[eval_counts[domain_col] == dom]["count"].values[0] if dom in eval_counts[domain_col].values else 0
    
    # Ensure total counts are correct
    total_n = train_n + eval_n
    
    # Sufficient data check
    sufficiency = total_n >= min_count_per_domain
    
    # Imbalance ratio
    imbalance_ratio = train_n / total_n if total_n > 0 else 0
    
    # Eval ratio
    eval_ratio = eval_n / total_n if total_n > 0 else 0
    
    # Train/Eval ratio
    train_eval_ratio = train_n / eval_n if eval_n > 0 else float("inf")
    
    # Imbalance severity
    imbalance_severity = abs(0.8 - imbalance_ratio)  # Deviation from ideal split
    
    # Append the domain stats for now, without chi-squared results
    stats.append({
        "Domain": dom, "Total": total_n, "Train": train_n, "Eval": eval_n, 
        "Train %": imbalance_ratio, "Eval %": eval_ratio, "Train/Eval Ratio": train_eval_ratio,
        "Imbalance Severity": imbalance_severity, "Sufficient Data": sufficiency
    })

# Step 2: Perform Chi-Squared Calculation on valid domains
chi2_results = []
for stat in stats:
    train_n = stat["Train"]
    eval_n = stat["Eval"]
    
    test, p_value, dof, expected = ????????????
    
    # Add chi-squared results to the domain stats
    chi2_results.append({
        "Domain": stat["Domain"], "P-Value": p_value, "Test Score": test, "DOF": dof, "Expected": expected
    })

# Step 3: Combine the stats and chi-squared results
df_stats = pd.DataFrame(stats)
df_chi2 = pd.DataFrame(chi2_results)

# Merge both stats and chi-squared results
df_stats = df_stats.merge(df_chi2, on="Domain", how="left")


# Display the stats
df_stats

Domains in training set: ['Other' 'Humanities/Social Science' 'Math' 'Physics'
 'Computer Science/AI' 'Biology/Medicine' 'Chemistry' 'Engineering']
Domains in evaluation set: ['Other' 'Humanities/Social Science' 'Math' 'Physics'
 'Computer Science/AI' 'Biology/Medicine' 'Chemistry' 'Engineering']


Unnamed: 0,Domain,Total,Train,Eval,Train %,Eval %,Train/Eval Ratio,Imbalance Severity,Sufficient Data,P-Value,Test Score,DOF,Expected
0,Math,1106,774,332,0.699819,0.300181,2.331325,0.100181,True,1.0,0.0,1,"[[774.0, 332.0], [774.0, 332.0]]"
1,Biology/Medicine,303,212,91,0.69967,0.30033,2.32967,0.10033,True,1.0,0.0,1,"[[212.0, 91.0], [212.0, 91.0]]"
2,Other,258,180,78,0.697674,0.302326,2.307692,0.102326,True,1.0,0.0,1,"[[180.0, 78.0], [180.0, 78.0]]"
3,Computer Science/AI,258,180,78,0.697674,0.302326,2.307692,0.102326,True,1.0,0.0,1,"[[180.0, 78.0], [180.0, 78.0]]"
4,Physics,240,168,72,0.7,0.3,2.333333,0.1,True,1.0,0.0,1,"[[168.0, 72.0], [168.0, 72.0]]"
5,Humanities/Social Science,235,164,71,0.697872,0.302128,2.309859,0.102128,True,1.0,0.0,1,"[[164.0, 71.0], [164.0, 71.0]]"
6,Chemistry,170,119,51,0.7,0.3,2.333333,0.1,True,1.0,0.0,1,"[[119.0, 51.0], [119.0, 51.0]]"
7,Engineering,130,91,39,0.7,0.3,2.333333,0.1,True,1.0,0.0,1,"[[91.0, 39.0], [91.0, 39.0]]"
