<a href="https://colab.research.google.com/github/rymarinelli/Number_Of_Thoughts/blob/main/Copy_of_Number_of_Thoughts_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

import pandas as pd
from datasets import load_dataset
from statsmodels.stats.power import TTestIndPower

from scipy.stats import ttest_ind
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az



# Read Data From Hugging Face

In [None]:
tiny_rl  = pd.read_parquet("hf://datasets/zrmarine/Chain_Of_Thought_Count_TinyR1/data/train-00000-of-00001.parquet")
deepseek = pd.read_parquet("hf://datasets/zrmarine/Chain_Of_Thought_Count_Ablation_Deepseek/data/train-00000-of-00001.parquet")
random_forest = pd.read_parquet("hf://datasets/zrmarine/DIA-Number-Of-Thoughts.csv/data/train-00000-of-00001.parquet")


In [None]:
# Remove duplicate instructions so each key is unique
unique_random_forest = random_forest[['instructions', 'level']].drop_duplicates(subset='instructions')

# Now perform the left join
merged_df = deepseek.merge(
    unique_random_forest,
    left_on='question',
    right_on='instructions',
    how='left'
)

# Optionally drop the extra 'instructions' column
deepseek = merged_df.drop(columns=['instructions'])


In [None]:
# Function to perform pairwise t-tests for a given thought count column
def perform_pairwise_tests(df, col):
    groups = {
        'easy': df[df['level'] == 'easy'][col].dropna(),
        'medium': df[df['level'] == 'medium'][col].dropna(),
        'hard': df[df['level'] == 'hard'][col].dropna()
    }
    print(f"\nPairwise t-tests for {col}:")

    # Easy vs. Medium
    t_stat, p_val = ttest_ind(groups['easy'], groups['medium'], equal_var=False)
    print(f"  Easy vs Medium: t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")

    # Easy vs. Hard
    t_stat, p_val = ttest_ind(groups['easy'], groups['hard'], equal_var=False)
    print(f"  Easy vs Hard: t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")

    # Medium vs. Hard
    t_stat, p_val = ttest_ind(groups['medium'], groups['hard'], equal_var=False)
    print(f"  Medium vs Hard: t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")

In [None]:
def calculate_effect_size(mean_1, mean_2, std_1, std_2, n_1, n_2):
    """
    Calculate Cohen's d for two independent samples.
    """
    pooled_std = np.sqrt(((std_1 ** 2) / n_1) + ((std_2 ** 2) / n_2))
    return abs(mean_1 - mean_2) / pooled_std

def pairwise_power_analysis(means, stds, n):
    """
    Perform pairwise power analysis for all group comparisons.

    Parameters:
        means (dict): Mean values for each group.
        stds (dict): Standard deviations for each group.
        n (dict): Sample sizes for each group.

    Returns:
        DataFrame: Summary of effect sizes, power, and required sample sizes.
    """
    alpha = 0.05  # Significance level
    power_analysis = TTestIndPower()
    group_pairs = [(g1, g2) for i, g1 in enumerate(means.keys()) for g2 in list(means.keys())[i+1:]]

    results = []

    for group1, group2 in group_pairs:
        mean_1, mean_2 = means[group1], means[group2]
        std_1, std_2 = stds[group1], stds[group2]
        n_1, n_2 = n[group1], n[group2]

        # Calculate effect size
        effect_size = calculate_effect_size(mean_1, mean_2, std_1, std_2, n_1, n_2)

        if effect_size == 0:
            power, required_sample_size = "N/A", "N/A"
        else:
            # Compute achieved power
            power = power_analysis.power(
                effect_size=effect_size,
                nobs1=n_1,
                alpha=alpha,
                ratio=n_2 / n_1,
                alternative='two-sided'
            )

            # Compute required sample size for 80% power
            required_sample_size = power_analysis.solve_power(
                effect_size=effect_size,
                power=0.8,
                alpha=alpha,
                alternative='two-sided'
            )

        # Store results
        results.append({
            'Comparison': f"{group1} vs {group2}",
            'Effect Size (Cohen\'s d)': round(effect_size, 3),
            'Achieved Power': round(power, 3) if power != "N/A" else "N/A",
            'Required Sample Size per Group for 80% Power': round(required_sample_size, 2) if required_sample_size != "N/A" else "N/A"
        })

    return pd.DataFrame(results)


#TinyR1

# TinyR1 Average Number of Thoughts

In [None]:
# Assuming your dataframe is named df
group_stats = tiny_rl.groupby('level')['thought_count'].agg(['mean', 'median', 'std', 'count'])
print(group_stats)


        mean  median       std  count
level                                
easy     5.8     6.0  3.381062     20
hard     5.0     5.0  2.339591     20
medium   5.8     5.0  2.894641     20


In [None]:
# Define function to calculate Cohen's d for independent samples
def calculate_effect_size(mean_1, mean_2, std_1, std_2, n_1, n_2):
    """
    Calculate Cohen's d for two independent samples.
    """
    pooled_std = np.sqrt(((std_1 ** 2) / n_1) + ((std_2 ** 2) / n_2))
    effect_size = abs(mean_1 - mean_2) / pooled_std
    return effect_size  # Keep effect size 0 if means are the same


means = {'easy': 5.8, 'hard': 5.8, 'medium': 5.0}
stds = {'easy': 3.3, 'hard': 2.3, 'medium': 2.89}
n = {'easy': 20, 'hard': 20, 'medium': 20}
alpha = 0.05  # Significance level


power_analysis = TTestIndPower()

# all group combinations
group_pairs = [('easy', 'hard'), ('easy', 'medium'), ('medium', 'hard')]


results = []

# Iterate through all group combinations
for group1, group2 in group_pairs:
    mean_1, mean_2 = means[group1], means[group2]
    std_1, std_2 = stds[group1], stds[group2]
    n_1, n_2 = n[group1], n[group2]

    # Calculate effect size
    effect_size = calculate_effect_size(mean_1, mean_2, std_1, std_2, n_1, n_2)

    # If effect size is zero, manually set power and required sample size to None
    if effect_size == 0:
        power = None
        required_sample_size = None
    else:
        # Calculate achieved power
        power = power_analysis.power(
            effect_size=effect_size,
            nobs1=n_1,
            alpha=alpha,
            ratio=n_2 / n_1,
            alternative='two-sided'
        )

        # Calculate required sample size for 80% power
        required_sample_size = power_analysis.solve_power(
            effect_size=effect_size,
            power=0.8,
            alpha=alpha,
            alternative='two-sided'
        )

    # Collect results
    results.append({
        'Comparison': f"{group1} vs {group2}",
        'Effect Size (Cohen\'s d)': round(effect_size, 3),
        'Achieved Power': round(power, 3) if power is not None else "N/A",
        'Required Sample Size per Group for 80% Power': round(required_sample_size, 2) if required_sample_size is not None else "N/A"
    })

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Comparison,Effect Size (Cohen's d),Achieved Power,Required Sample Size per Group for 80% Power
0,easy vs hard,0.0,,
1,easy vs medium,0.816,0.71,24.6
2,medium vs hard,0.969,0.847,17.74


In [None]:
means = {'easy': 5.8, 'hard': 5.8, 'medium': 5.0}
stds = {'easy': 3.3, 'hard': 2.3, 'medium': 2.89}
n = {'easy': 20, 'hard': 20, 'medium': 20}

with pm.Model() as model:
    # Priors for the group means. Using the observed means as centers
    mu_easy = pm.Normal("mu_easy", mu=means['easy'], sigma=stds['easy'])
    mu_medium = pm.Normal("mu_medium", mu=means['medium'], sigma=stds['medium'])
    mu_hard = pm.Normal("mu_hard", mu=means['hard'], sigma=stds['hard'])

    # Likelihood for each observed group mean, where the standard error accounts for sample size.
    se_easy = stds['easy'] / np.sqrt(n['easy'])
    se_medium = stds['medium'] / np.sqrt(n['medium'])
    se_hard = stds['hard'] / np.sqrt(n['hard'])

    obs_easy = pm.Normal("obs_easy", mu=mu_easy, sigma=se_easy, observed=means['easy'])
    obs_medium = pm.Normal("obs_medium", mu=mu_medium, sigma=se_medium, observed=means['medium'])
    obs_hard = pm.Normal("obs_hard", mu=mu_hard, sigma=se_hard, observed=means['hard'])

    # Define the differences in means
    delta_easy_hard = pm.Deterministic("delta_easy_hard", mu_easy - mu_hard)
    delta_medium_hard = pm.Deterministic("delta_medium_hard", mu_medium - mu_hard)

    # Draw samples from the posterior
    trace = pm.sample(2000, return_inferencedata=True, target_accept=0.9)


print(az.summary(trace, var_names=["mu_easy", "mu_medium", "mu_hard",
                                     "delta_easy_hard", "delta_medium_hard"]))

Output()

                    mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  \
mu_easy            5.795  0.725   4.454    7.150      0.011    0.008   
mu_medium          4.997  0.637   3.812    6.194      0.009    0.006   
mu_hard            5.805  0.495   4.809    6.699      0.008    0.006   
delta_easy_hard   -0.010  0.867  -1.666    1.569      0.014    0.014   
delta_medium_hard -0.808  0.805  -2.300    0.674      0.012    0.010   

                   ess_bulk  ess_tail  r_hat  
mu_easy              4154.0    3074.0    1.0  
mu_medium            4947.0    3162.0    1.0  
mu_hard              3511.0    3099.0    1.0  
delta_easy_hard      4072.0    2853.0    1.0  
delta_medium_hard    4727.0    3113.0    1.0  


In [None]:
import scipy.stats as stats

easy =  tiny_rl[ tiny_rl['level'] == 'easy']['thought_count']
medium = tiny_rl[ tiny_rl['level'] == 'medium']['thought_count']
hard =  tiny_rl[ tiny_rl['level'] == 'hard']['thought_count']

# Perform one-way ANOVA
f_stat, p_value = stats.f_oneway(easy, medium, hard)
print("ANOVA F-statistic:", f_stat)
print("ANOVA p-value:", p_value)


ANOVA F-statistic: 0.5062447960033306
ANOVA p-value: 0.6054392001213811


# TinyR1 T-Test

In [None]:
for col in ['thought_count']:
    perform_pairwise_tests(tiny_rl, col)


Pairwise t-tests for thought_count:
  Easy vs Medium: t-statistic = 0.000, p-value = 1.000
  Easy vs Hard: t-statistic = 0.870, p-value = 0.390
  Medium vs Hard: t-statistic = 0.961, p-value = 0.343


# TinyR1 Power Analysis

In [None]:
pairwise_power_analysis(means, stds, n)

Unnamed: 0,Comparison,Effect Size (Cohen's d),Achieved Power,Required Sample Size per Group for 80% Power
0,easy vs hard,0.0,,
1,easy vs medium,0.816,0.71,24.6
2,hard vs medium,0.969,0.847,17.74


#Random Forest Trained on Math Instruct

# Random Forest Average Number of Thoughts

In [None]:
group_stats = random_forest.groupby('level')['predicted_thought_count'].agg(['mean', 'median', 'std', 'count'])
print(group_stats)


            mean  median       std  count
level                                    
easy    7.583261   7.430  0.774122   1110
hard    8.062500   8.065  0.648933     20
medium  7.471257   7.295  1.071061    370


In [None]:
means = {'easy': 7.583261, 'hard': 8.062500, 'medium': 7.471257}
medians = {'easy': 7.430, 'hard': 8.065, 'medium': 7.295}
stds = {'easy': 0.774122, 'hard': 0.648933, 'medium': 1.071061}
counts = {'easy': 1110, 'hard': 20, 'medium': 370}

pairwise_power_analysis(means, stds, n)


Unnamed: 0,Comparison,Effect Size (Cohen's d),Achieved Power,Required Sample Size per Group for 80% Power
0,easy vs hard,2.122,1.0,4.67
1,easy vs medium,0.379,0.215,110.24
2,hard vs medium,2.111,1.0,4.71


In [None]:
means = {'easy': 7.583261, 'hard': 8.062500, 'medium': 7.471257}
stds = {'easy': 0.774122, 'hard': 0.648933, 'medium': 1.071061}
n = {'easy': 1110, 'hard': 20, 'medium': 370}

with pm.Model() as model:
    # Priors for the group means. Using the observed means as centers
    mu_easy = pm.Normal("mu_easy", mu=means['easy'], sigma=stds['easy'])
    mu_medium = pm.Normal("mu_medium", mu=means['medium'], sigma=stds['medium'])
    mu_hard = pm.Normal("mu_hard", mu=means['hard'], sigma=stds['hard'])

    # Likelihood for each observed group mean, where the standard error accounts for sample size.
    se_easy = stds['easy'] / np.sqrt(n['easy'])
    se_medium = stds['medium'] / np.sqrt(n['medium'])
    se_hard = stds['hard'] / np.sqrt(n['hard'])

    obs_easy = pm.Normal("obs_easy", mu=mu_easy, sigma=se_easy, observed=means['easy'])
    obs_medium = pm.Normal("obs_medium", mu=mu_medium, sigma=se_medium, observed=means['medium'])
    obs_hard = pm.Normal("obs_hard", mu=mu_hard, sigma=se_hard, observed=means['hard'])

    # Define the differences in means
    delta_easy_hard = pm.Deterministic("delta_easy_hard", mu_easy - mu_hard)
    delta_medium_hard = pm.Deterministic("delta_medium_hard", mu_medium - mu_hard)

    # Draw samples from the posterior
    trace = pm.sample(2000, return_inferencedata=True, target_accept=0.9)


print(az.summary(trace, var_names=["mu_easy", "mu_medium", "mu_hard",
                                     "delta_easy_hard", "delta_medium_hard"]))


Output()

                    mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  \
mu_easy            7.584  0.023   7.541    7.626      0.000    0.000   
mu_medium          7.471  0.055   7.366    7.569      0.001    0.001   
mu_hard            8.063  0.138   7.807    8.325      0.002    0.002   
delta_easy_hard   -0.480  0.139  -0.725   -0.206      0.002    0.002   
delta_medium_hard -0.592  0.147  -0.866   -0.309      0.002    0.002   

                   ess_bulk  ess_tail  r_hat  
mu_easy              5014.0    3251.0    1.0  
mu_medium            3538.0    3238.0    1.0  
mu_hard              4131.0    3044.0    1.0  
delta_easy_hard      4234.0    2988.0    1.0  
delta_medium_hard    4163.0    3117.0    1.0  


## **Bayesian Difference of Means: Summary Table**

| Comparison                 | Mean Difference (ùõø) | Standard Deviation (SD) | 94% HDI (Lower, Upper) | Significant? |
|----------------------------|------------------|------------------|----------------------|--------------|
| **Easy - Hard (ùõø_easy-hard)**   | **-0.480**      | **0.144**        | **(-0.761, -0.229)** | Significant |
| **Medium - Hard (ùõø_medium-hard)** | **-0.593**      | **0.151**        | **(-0.880, -0.316)** | Significant |



*   Both differences are negative, meaning that easy and medium groups have significantly lower means than hard.
*   Since zero is NOT in the HDI ranges, we can conclude that these differences are statistically significant.
*   The medium group is even lower than hard





# Random Forest T-Test

In [None]:
for col in ['predicted_thought_count']:
    perform_pairwise_tests(random_forest, col)


Pairwise t-tests for predicted_thought_count:
  Easy vs Medium: t-statistic = 1.856, p-value = 0.064
  Easy vs Hard: t-statistic = -3.261, p-value = 0.004
  Medium vs Hard: t-statistic = -3.804, p-value = 0.001


# Deepseek


In [None]:
import pandas as pd

grouped_means = deepseek.groupby('level')[['thought_count_small', 'thought_count_medium', 'thought_count_large', 'average_thought_count']].agg(['mean', 'std', 'count'])

print(grouped_means)


       thought_count_small                  thought_count_medium            \
                      mean        std count                 mean       std   
level                                                                        
easy                  8.80  14.225995    20                 6.70  3.961658   
hard                 11.15  15.607943    20                 6.05  5.296225   
medium                9.00   9.531112    20                 5.20  4.212763   

             thought_count_large                 average_thought_count  \
       count                mean       std count                  mean   
level                                                                    
easy      20                5.65  1.663066    20              7.050000   
hard      20                7.15  5.815361    20              8.116667   
medium    20                7.25  2.807415    20              7.150000   

                        
             std count  
level                   
easy    5.0

In [None]:
# Small Model
means = {'easy': 8.80, 'hard': 11.15, 'medium': 9.00}
stds = {'easy': 14.22, 'hard': 15.6, 'medium': 9.5}
n = {'easy': 20, 'hard': 20, 'medium': 20}

with pm.Model() as model:
    # Priors for the group means. Using the observed means as centers
    mu_easy = pm.Normal("mu_easy", mu=means['easy'], sigma=stds['easy'])
    mu_medium = pm.Normal("mu_medium", mu=means['medium'], sigma=stds['medium'])
    mu_hard = pm.Normal("mu_hard", mu=means['hard'], sigma=stds['hard'])

    # Likelihood for each observed group mean, where the standard error accounts for sample size.
    se_easy = stds['easy'] / np.sqrt(n['easy'])
    se_medium = stds['medium'] / np.sqrt(n['medium'])
    se_hard = stds['hard'] / np.sqrt(n['hard'])

    obs_easy = pm.Normal("obs_easy", mu=mu_easy, sigma=se_easy, observed=means['easy'])
    obs_medium = pm.Normal("obs_medium", mu=mu_medium, sigma=se_medium, observed=means['medium'])
    obs_hard = pm.Normal("obs_hard", mu=mu_hard, sigma=se_hard, observed=means['hard'])

    # Define the differences in means
    delta_easy_hard = pm.Deterministic("delta_easy_hard", mu_easy - mu_hard)
    delta_medium_hard = pm.Deterministic("delta_medium_hard", mu_medium - mu_hard)

    # Draw samples from the posterior
    trace = pm.sample(2000, return_inferencedata=True, target_accept=0.9)

# Summarize the posterior estimates
print(az.summary(trace,  var_names=["mu_easy", "mu_medium", "mu_hard",
                                     "delta_easy_hard", "delta_medium_hard"]))

Output()

                     mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  \
mu_easy             8.667  3.133   2.854   14.456      0.051    0.036   
mu_medium           8.995  1.999   5.331   12.763      0.032    0.023   
mu_hard            11.255  3.345   4.885   17.311      0.053    0.038   
delta_easy_hard    -2.588  4.599 -11.365    5.859      0.073    0.063   
delta_medium_hard  -2.260  3.879  -9.501    4.944      0.063    0.053   

                   ess_bulk  ess_tail  r_hat  
mu_easy              3689.0    2779.0    1.0  
mu_medium            3802.0    3048.0    1.0  
mu_hard              3931.0    2877.0    1.0  
delta_easy_hard      3940.0    2928.0    1.0  
delta_medium_hard    3819.0    3081.0    1.0  


In [None]:
from scipy.stats import ttest_ind

# Function to perform pairwise t-tests for a given thought count column
def perform_pairwise_tests(df, col):
    groups = {
        'easy': df[df['level'] == 'easy'][col].dropna(),
        'medium': df[df['level'] == 'medium'][col].dropna(),
        'hard': df[df['level'] == 'hard'][col].dropna()
    }
    print(f"\nPairwise t-tests for {col}:")

    # Easy vs. Medium
    t_stat, p_val = ttest_ind(groups['easy'], groups['medium'], equal_var=False)
    print(f"  Easy vs Medium: t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")

    # Easy vs. Hard
    t_stat, p_val = ttest_ind(groups['easy'], groups['hard'], equal_var=False)
    print(f"  Easy vs Hard: t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")

    # Medium vs. Hard
    t_stat, p_val = ttest_ind(groups['medium'], groups['hard'], equal_var=False)
    print(f"  Medium vs Hard: t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")

# Apply the tests to each thought_count column
for col in ['thought_count_small', 'thought_count_medium', 'thought_count_large']:
    perform_pairwise_tests(deepseek, col)



Pairwise t-tests for thought_count_small:
  Easy vs Medium: t-statistic = -0.052, p-value = 0.959
  Easy vs Hard: t-statistic = -0.498, p-value = 0.622
  Medium vs Hard: t-statistic = -0.526, p-value = 0.603

Pairwise t-tests for thought_count_medium:
  Easy vs Medium: t-statistic = 1.160, p-value = 0.253
  Easy vs Hard: t-statistic = 0.440, p-value = 0.663
  Medium vs Hard: t-statistic = -0.562, p-value = 0.578

Pairwise t-tests for thought_count_large:
  Easy vs Medium: t-statistic = -2.193, p-value = 0.036
  Easy vs Hard: t-statistic = -1.109, p-value = 0.279
  Medium vs Hard: t-statistic = 0.069, p-value = 0.945


In [None]:
#Deepseek 1.7B
means_small = {'easy': 8.80, 'hard': 11.15, 'medium': 9.00}
stds_small = {'easy': 14.225995, 'hard': 15.607943, 'medium': 9.531112}
n_small = {'easy': 20, 'hard': 20, 'medium': 20}

# Deepseek 7B
means_med = {'easy': 6.70, 'hard': 6.05, 'medium': 5.20}
stds_med = {'easy': 3.961658, 'hard': 5.296225, 'medium': 4.212763}
n_med = {'easy': 20, 'hard': 20, 'medium': 20}

# Deepseek 14B
means_large = {'easy': 5.65, 'hard': 7.15, 'medium': 7.25}
stds_large = {'easy': 1.663066, 'hard': 5.815361, 'medium': 2.807415}
n_large = {'easy': 20, 'hard': 20, 'medium': 20}

# average_thought_count
means_avg = {'easy': 7.05, 'hard': 8.116667, 'medium': 7.15}
stds_avg = {'easy': 5.018416, 'hard': 5.494575, 'medium': 3.227214}
n_avg = {'easy': 20, 'hard': 20, 'medium': 20}

# Power Analysis for Each Measure

print("=== Power Analysis: thought_count_small ===")
results_small = pairwise_power_analysis(means_small, stds_small, n_small)
print(results_small, "\n")

print("=== Power Analysis: thought_count_medium ===")
results_med = pairwise_power_analysis(means_med, stds_med, n_med)
print(results_med, "\n")

print("=== Power Analysis: thought_count_large ===")
results_large = pairwise_power_analysis(means_large, stds_large, n_large)
print(results_large, "\n")

print("=== Power Analysis: average_thought_count ===")
results_avg = pairwise_power_analysis(means_avg, stds_avg, n_avg)
print(results_avg, "\n")

=== Power Analysis: thought_count_small ===
       Comparison  Effect Size (Cohen's d)  Achieved Power  \
0    easy vs hard                    0.498           0.335   
1  easy vs medium                    0.052           0.053   
2  hard vs medium                    0.526           0.367   

   Required Sample Size per Group for 80% Power  
0                                         64.36  
1                                       5754.59  
2                                         57.76   

=== Power Analysis: thought_count_medium ===
       Comparison  Effect Size (Cohen's d)  Achieved Power  \
0    easy vs hard                    0.440           0.273   
1  easy vs medium                    1.160           0.947   
2  hard vs medium                    0.562           0.410   

   Required Sample Size per Group for 80% Power  
0                                         82.24  
1                                         12.70  
2                                         50.73   

=== Power