In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import random
import pandas as pd 
import math 
import json
import nltk 
nltk.download('punkt')
from tqdm import tqdm
tqdm.pandas()
from scipy.stats import ttest_ind

[nltk_data] Downloading package punkt to /Users/DKWB2F7/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
import warnings
# Silences every warning category for the rest of the session, including the
# numeric RuntimeWarnings numpy/scipy would emit for inf/NaN feature values.
warnings.filterwarnings("ignore")

In [2]:
import spacy
import textdescriptives as td
# load your favourite spacy model (remember to install it first using e.g. `python -m spacy download en_core_web_sm`)
nlp = spacy.load("en_core_web_sm")
# Register the bundled TextDescriptives component. get_linguistic_feature()
# reads the doc._.descriptive_stats / quality / readability /
# information_theory / dependency_distance / coherence extensions from
# documents produced by this pipeline.
nlp.add_pipe("textdescriptives/all") 

<textdescriptives.load_components.TextDescriptives at 0x173670150>

In [41]:
def get_linguistic_feature(text):
    """Run `text` through the module-level `nlp` pipeline and return a flat feature dict.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    dict
        Feature name -> value, always in the same key order (callers iterate
        over the keys to print per-feature statistics). All values are copied
        straight from the TextDescriptives extensions on the parsed document,
        except ``stopwords_frac``, which is the stop-word count expressed as a
        percentage of ``total_wc``.

    Notes
    -----
    When the document has zero counted tokens, ``stopwords_frac`` is NaN
    instead of raising ``ZeroDivisionError``, so one degenerate row cannot
    abort a long feature-extraction loop.
    """
    doc = nlp(text)
    stats = doc._.descriptive_stats
    info = doc._.information_theory
    coherence = doc._.coherence

    total_wc = stats['n_tokens']
    # Percentage of stop words; undefined (NaN) when there are no tokens to divide by.
    if total_wc:
        stopwords_frac = doc._.quality.n_stop_words.value * 100 / total_wc
    else:
        stopwords_frac = float('nan')

    return {
        'total_wc': total_wc,
        'avg_word_len': stats['token_length_mean'],
        'n_sentences': stats['n_sentences'],
        'sentence_length_mean': stats['sentence_length_mean'],
        'ttr': stats['proportion_unique_tokens'],
        'syllables_per_token_mean': stats['syllables_per_token_mean'],
        'stopwords_frac': stopwords_frac,
        'flesch_reading_ease': doc._.readability['flesch_reading_ease'],
        'entropy': info['entropy'],
        'perplexity': info['perplexity'],
        'per_word_perplexity': info['per_word_perplexity'],
        'dependency_distance_mean': doc._.dependency_distance['dependency_distance_mean'],
        'first_order_coherence': coherence['first_order_coherence'],
        'second_order_coherence': coherence['second_order_coherence'],
    }

In [42]:
# Smoke test: run the extractor on a short multi-sentence passage and display
# the resulting feature dict as the cell output.
text = "The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. Much that once was is lost, for none now live who remember it."
get_linguistic_feature(text)

{'total_wc': 35,
 'avg_word_len': 3.2857142857142856,
 'n_sentences': 5,
 'sentence_length_mean': 7.0,
 'ttr': 0.6571428571428571,
 'syllables_per_token_mean': 1.0857142857142856,
 'stopwords_frac': 68.57142857142857,
 'flesch_reading_ease': 107.87857142857146,
 'entropy': 2.022515801318652,
 'perplexity': 7.557313737483889,
 'per_word_perplexity': 0.1843247253044851,
 'dependency_distance_mean': 1.961904761904762,
 'first_order_coherence': 0.6005533672869205,
 'second_order_coherence': 0.5181362926959991}

In [15]:
# Pivot a list of per-item dicts into one dict of per-key value lists
def collect_values(data):
    """Group the values of several dictionaries by key.

    `data` is an iterable of dicts. The result maps every key seen to the list
    of its values, in encounter order; keys appear in the order they were first
    seen. A key missing from some dicts simply gets a shorter list.
    """
    grouped = {}
    for record in data:
        for name in record:
            grouped.setdefault(name, []).append(record[name])
    return grouped


# Partwise calculations

## n  = 1

In [58]:
# Load the single-author (n = 1) data; the trailing expression displays the
# available columns.
df = pd.read_csv('1_author.csv')
df.columns

Index(['Unnamed: 0', 'prompt', 'human_story', 'author1', 'llm_story'], dtype='object')

In [59]:
# Row count per distinct `author1` value.
df.groupby('author1').size()

author1
gemma      1794
llama      1774
mistral    1800
olmo       1797
dtype: int64

In [40]:
# n = 1: compare each single-author LLM's stories against the human stories,
# feature by feature (mean ± std and a Welch t-test).
human_values = []
human_calculation_done = False
authors = ['gemma','llama','mistral','olmo']
# Pooled features of every single-author LLM story; later cells use this list
# as the baseline for the multi-author comparisons.
llm_values_1 = []
for author in authors:
    df_p = df[df['author1']==author]
    print(author)
    print('='*100)
    llm_values = []
    for index, row in tqdm(df_p.iterrows(), total=df_p.shape[0]):
        # Human stories are featurised only while the first author is being
        # processed and then reused for every other author.
        # NOTE(review): this assumes each author's rows pair with the same set
        # of human stories -- confirm against how 1_author.csv was built.
        if not human_calculation_done:
            human_values.append(get_linguistic_feature(row['human_story']))
        llm_values.append(get_linguistic_feature(row['llm_story']))

    if not human_calculation_done:
        # human_values never changes after the first author, so pivot it once.
        values_dict1 = collect_values(human_values)
        human_calculation_done = True
    values_dict2 = collect_values(llm_values)

    # Calculate and print the mean, standard deviation, and p-value for each key
    for key in values_dict1.keys():
        values1 = values_dict1[key]
        values2 = values_dict2[key]
        mean1 = np.mean(values1)
        std_dev1 = np.std(values1)
        mean2 = np.mean(values2)
        std_dev2 = np.std(values2)

        # Perform independent t-test with Welch's correction for unequal variances
        t_stat, p_value = ttest_ind(values1, values2, equal_var=False)

        # Print results
        print(f"{key}:")
        print(f"  human: {mean1:.2f} ± {std_dev1:.2f}")
        print(f"  llm: {mean2:.2f} ± {std_dev2:.2f}")
        print(f"  p-value: {p_value:.5f}")
    print('='*100)
    # Extend in place instead of rebuilding the pooled list on every iteration.
    llm_values_1.extend(llm_values)
    

gemma


100%|██████████| 1794/1794 [12:18<00:00,  2.43it/s]


total_wc:
  human: 1172.84 ± 366.11
  llm: 368.58 ± 34.86
  p-value: 0.00000
avg_word_len:
  human: 3.92 ± 0.24
  llm: 4.82 ± 0.27
  p-value: 0.00000
n_sentences:
  human: 126.57 ± 55.14
  llm: 29.07 ± 4.91
  p-value: 0.00000
sentence_length_mean:
  human: 9.87 ± 2.51
  llm: 12.97 ± 2.06
  p-value: 0.00000
ttr:
  human: 0.36 ± 0.05
  llm: 0.57 ± 0.03
  p-value: 0.00000
syllables_per_token_mean:
  human: 1.19 ± 0.05
  llm: 1.39 ± 0.08
  p-value: 0.00000
stopwords_frac:
  human: 49.18 ± 6.05
  llm: 47.69 ± 3.35
  p-value: 0.00000
flesch_reading_ease:
  human: 96.46 ± 6.05
  llm: 75.73 ± 7.56
  p-value: 0.00000
entropy:
  human: 38.44 ± 12.20
  llm: 15.50 ± 1.71
  p-value: 0.00000
perplexity:
  human: 13551447155987963025912622930198528.00 ± 516158453813077790092565595367145472.00
  llm: 32912107.89 ± 249770737.25
  p-value: 0.26641
per_word_perplexity:
  human: 5100958117911535871877743378432.00 ± 194379954202003993301593805029376.00
  llm: 62407.77 ± 413198.83
  p-value: 0.26663
depende

100%|██████████| 1774/1774 [04:29<00:00,  6.59it/s]


total_wc:
  human: 1172.84 ± 366.11
  llm: 583.03 ± 225.53
  p-value: 0.00000
avg_word_len:
  human: 3.92 ± 0.24
  llm: 4.17 ± 0.26
  p-value: 0.00000
n_sentences:
  human: 126.57 ± 55.14
  llm: 40.20 ± 18.00
  p-value: 0.00000
sentence_length_mean:
  human: 9.87 ± 2.51
  llm: 15.18 ± 3.28
  p-value: 0.00000
ttr:
  human: 0.36 ± 0.05
  llm: 0.41 ± 0.08
  p-value: 0.00000
syllables_per_token_mean:
  human: 1.19 ± 0.05
  llm: 1.24 ± 0.07
  p-value: 0.00000
stopwords_frac:
  human: 49.18 ± 6.05
  llm: 61.36 ± 4.27
  p-value: 0.00000
flesch_reading_ease:
  human: 96.46 ± 6.05
  llm: 86.38 ± 7.42
  p-value: 0.00000
entropy:
  human: 38.44 ± 12.20
  llm: 25.67 ± 18.60
  p-value: 0.00000
perplexity:
  human: 13551447155987963025912622930198528.00 ± 516158453813077790092565595367145472.00
  llm: 5852669516980846499817407939307510038265994956461792054714842998266017472246068255720377022317136974258211245942124863324291694575006871417290812609364536852656898965504.00 ± inf
  p-value: 1.00000
per

100%|██████████| 1800/1800 [06:41<00:00,  4.49it/s]


total_wc:
  human: 1172.84 ± 366.11
  llm: 811.55 ± 195.35
  p-value: 0.00000
avg_word_len:
  human: 3.92 ± 0.24
  llm: 4.34 ± 0.26
  p-value: 0.00000
n_sentences:
  human: 126.57 ± 55.14
  llm: 49.25 ± 13.89
  p-value: 0.00000
sentence_length_mean:
  human: 9.87 ± 2.51
  llm: 16.90 ± 2.93
  p-value: 0.00000
ttr:
  human: 0.36 ± 0.05
  llm: 0.39 ± 0.05
  p-value: 0.00000
syllables_per_token_mean:
  human: 1.19 ± 0.05
  llm: 1.27 ± 0.07
  p-value: 0.00000
stopwords_frac:
  human: 49.18 ± 6.05
  llm: 58.20 ± 3.71
  p-value: 0.00000
flesch_reading_ease:
  human: 96.46 ± 6.05
  llm: 81.88 ± 7.05
  p-value: 0.00000
entropy:
  human: 38.44 ± 12.20
  llm: 33.57 ± 8.49
  p-value: 0.00000
perplexity:
  human: 13551447155987963025912622930198528.00 ± 516158453813077790092565595367145472.00
  llm: 2047006804070606038560757776384.00 ± 86497053012255762389970683166720.00
  p-value: 0.26648
per_word_perplexity:
  human: 5100958117911535871877743378432.00 ± 194379954202003993301593805029376.00
  llm:

100%|██████████| 1797/1797 [05:51<00:00,  5.11it/s]

total_wc:
  human: 1172.84 ± 366.11
  llm: 720.87 ± 217.08
  p-value: 0.00000
avg_word_len:
  human: 3.92 ± 0.24
  llm: 4.37 ± 0.31
  p-value: 0.00000
n_sentences:
  human: 126.57 ± 55.14
  llm: 42.32 ± 16.17
  p-value: 0.00000
sentence_length_mean:
  human: 9.87 ± 2.51
  llm: 17.83 ± 3.84
  p-value: 0.00000
ttr:
  human: 0.36 ± 0.05
  llm: 0.38 ± 0.09
  p-value: 0.00000
syllables_per_token_mean:
  human: 1.19 ± 0.05
  llm: 1.29 ± 0.09
  p-value: 0.00000
stopwords_frac:
  human: 49.18 ± 6.05
  llm: 57.97 ± 4.91
  p-value: 0.00000
flesch_reading_ease:
  human: 96.46 ± 6.05
  llm: 79.89 ± 9.29
  p-value: 0.00000
entropy:
  human: 38.44 ± 12.20
  llm: 29.34 ± 9.45
  p-value: 0.00000
perplexity:
  human: 13551447155987963025912622930198528.00 ± 516158453813077790092565595367145472.00
  llm: 14501970882986753391657064272947576832.00 ± 614582507435598813812926437674224451584.00
  p-value: 0.31790
per_word_perplexity:
  human: 5100958117911535871877743378432.00 ± 19437995420200399330159380502




In [23]:
# Scratch re-computation of the human-vs-LLM table outside the per-author loop.
# NOTE(review): this cell depends on kernel state. `human_values` and
# `llm_values` are whatever the per-author loop last left behind, so
# `llm_values` holds only one author's stories. Its execution count (In [23])
# is lower than the loop cell's (In [40]), so the saved output may come from an
# earlier run -- confirm before relying on these numbers.
values_dict1 = collect_values(human_values)
values_dict2 = collect_values(llm_values)

# Calculate and print the mean, standard deviation, and p-value for each key
for key in values_dict1.keys():
    values1 = values_dict1[key]
    values2 = values_dict2[key]
    mean1 = np.mean(values1)
    std_dev1 = np.std(values1)
    mean2 = np.mean(values2)
    std_dev2 = np.std(values2)
    
    # Perform independent t-test with Welch’s correction for unequal variances
    t_stat, p_value = ttest_ind(values1, values2, equal_var=False)
    
    # Print results
    print(f"{key}:")
    print(f"  human: {mean1:.2f} ± {std_dev1:.2f}")
    print(f"  llm: {mean2:.2f} ± {std_dev2:.2f}")
    print(f"  p-value: {p_value:.5f}")

total_wc:
  human: 1173.79 ± 367.82
  llm: 818.63 ± 222.50
  p-value: 0.00000
avg_word_len:
  human: 3.92 ± 0.24
  llm: 4.33 ± 0.24
  p-value: 0.00000
n_sentences:
  human: 126.64 ± 54.93
  llm: 51.67 ± 16.80
  p-value: 0.00000
sentence_length_mean:
  human: 126.64 ± 54.93
  llm: 51.67 ± 16.80
  p-value: 0.00000
ttr:
  human: 0.36 ± 0.05
  llm: 0.39 ± 0.05
  p-value: 0.00888
syllables_per_token_mean:
  human: 1.19 ± 0.05
  llm: 1.26 ± 0.05
  p-value: 0.00000
stopwords_frac:
  human: 49.19 ± 6.09
  llm: 58.56 ± 3.56
  p-value: 0.00000
flesch_reading_ease:
  human: 96.46 ± 6.05
  llm: 83.37 ± 5.11
  p-value: 0.00000
entropy:
  human: 38.49 ± 12.25
  llm: 33.55 ± 9.36
  p-value: 0.01306
perplexity:
  human: 13498785251096858996956582222233600.00 ± 515155082908459625412432661592408064.00
  llm: 590662716993854752423936.00 ± 3007230335504000504299520.00
  p-value: 0.26641
per_word_perplexity:
  human: 5081135752388379806843254341632.00 ± 194002094809484586561740432474112.00
  llm: 339562483

### Difference across n (when multiple authors collaborate on the story, compared with a single author)

In [43]:
# Sanity check: number of single-author LLM feature dicts pooled by the n = 1 loop.
len(llm_values_1)

7165

n = 2

In [44]:
# Two-author data. `author_list` joins the author columns in order with '_'
# so rows can be counted per ordered author combination.
df_2 = pd.read_csv('2_author.csv')
df_2['author_list'] = df_2['author1']+'_'+df_2['author2']
df_2.groupby('author_list').size()

author_list
gemma_llama      317
gemma_mistral    394
gemma_olmo       389
gemma_orca       260
llama_gemma      391
llama_mistral    398
llama_olmo       397
llama_orca       289
mistral_gemma    395
mistral_llama    340
mistral_olmo     400
mistral_orca     288
olmo_gemma       397
olmo_llama       347
olmo_mistral     399
olmo_orca        264
orca_gemma       370
orca_llama       305
orca_mistral     363
orca_olmo        367
dtype: int64

In [32]:
# Inspect the columns of the two-author frame.
df_2.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'prompt', 'human_story', 'author1',
       'author2', 'part1', 'part2', 'llm_story', 'author_list'],
      dtype='object')

In [46]:
# Featurise every two-author story, then compare against the pooled
# single-author baseline in `llm_values_1`.
llm_values_n = []
for index, row in tqdm(df_2.iterrows(), total=df_2.shape[0]):
    llm_values_n.append(get_linguistic_feature(row['llm_story']))
# Pivot both samples into {feature: [values...]} for per-feature statistics.
values_dict1 = collect_values(llm_values_1)
values_dict2 = collect_values(llm_values_n)

# Calculate and print the mean, standard deviation, and p-value for each key
for key in values_dict1.keys():
    values1 = values_dict1[key]
    values2 = values_dict2[key]
    mean1 = np.mean(values1)
    std_dev1 = np.std(values1)
    mean2 = np.mean(values2)
    std_dev2 = np.std(values2)
    
    # Perform independent t-test with Welch’s correction for unequal variances
    t_stat, p_value = ttest_ind(values1, values2, equal_var=False)
    
    # Print results
    print(f"{key}:")
    print(f"  llm_1: {mean1:.2f} ± {std_dev1:.2f}")
    print(f"  llm_2: {mean2:.2f} ± {std_dev2:.2f}")
    print(f"  p-value: {p_value:.5f}")  

100%|██████████| 7070/7070 [1:09:51<00:00,  1.69it/s]   

total_wc:
  llm_1: 621.31 ± 249.47
  llm_2: 936.15 ± 181.29
  p-value: 0.00000
avg_word_len:
  llm_1: 4.43 ± 0.37
  llm_2: 4.45 ± 0.27
  p-value: 0.00004
n_sentences:
  llm_1: 40.22 ± 15.91
  llm_2: 60.83 ± 14.19
  p-value: 0.00000
sentence_length_mean:
  llm_1: 15.72 ± 3.61
  llm_2: 15.74 ± 2.70
  p-value: 0.81757
ttr:
  llm_1: 0.44 ± 0.10
  llm_2: 0.40 ± 0.06
  p-value: 0.00000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.10
  llm_2: 1.30 ± 0.07
  p-value: 0.03395
stopwords_frac:
  llm_1: 56.29 ± 6.58
  llm_2: 56.69 ± 4.39
  p-value: 0.00002
flesch_reading_ease:
  llm_1: 80.95 ± 8.76
  llm_2: 81.20 ± 7.20
  p-value: 0.07322
entropy:
  llm_1: 26.03 ± 13.10
  llm_2: 38.42 ± 7.93
  p-value: 0.00000
perplexity:
  llm_1: 1449076862962180353152875207888493519293563325966298079010272959911758332049947410390642156043386853958061026031987783958279938336653975821056201639897017267350692429824.00 ± inf
  llm_2: 11317695211048858084704256.00 ± 692498657341208613619761152.00
  p-value: 1.00000
per




n= 3

In [49]:
# Three-author data. `author_list` joins the author columns in order with '_'
# so rows can be counted per ordered author combination.
df_3 = pd.read_csv('3_author.csv')
df_3['author_list'] = df_3['author1']+'_'+df_3['author2'] + '_'+df_3['author3']
df_3.groupby('author_list').size()

author_list
gemma_mistral_olmo     276
gemma_olmo_llama       233
llama_gemma_mistral    468
llama_gemma_olmo       471
llama_olmo_gemma       461
llama_olmo_mistral     462
mistral_gemma_olmo     480
mistral_llama_gemma    385
mistral_llama_olmo     394
mistral_olmo_gemma     471
olmo_gemma_llama       382
olmo_gemma_mistral     478
olmo_llama_gemma       378
olmo_llama_mistral     368
olmo_mistral_llama     386
dtype: int64

In [50]:
# Inspect the columns of the three-author frame.
df_3.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'prompt', 'human_story', 'author1',
       'author2', 'author3', 'part1', 'part2', 'part3', 'llm_story',
       'author_list'],
      dtype='object')

In [51]:
# Featurise every three-author story, then compare against the pooled
# single-author baseline in `llm_values_1`.
llm_values_n = []
for index, row in tqdm(df_3.iterrows(), total=df_3.shape[0]):
    llm_values_n.append(get_linguistic_feature(row['llm_story']))
# Pivot both samples into {feature: [values...]} for per-feature statistics.
values_dict1 = collect_values(llm_values_1)
values_dict2 = collect_values(llm_values_n)

# Calculate and print the mean, standard deviation, and p-value for each key
for key in values_dict1.keys():
    values1 = values_dict1[key]
    values2 = values_dict2[key]
    mean1 = np.mean(values1)
    std_dev1 = np.std(values1)
    mean2 = np.mean(values2)
    std_dev2 = np.std(values2)
    
    # Perform independent t-test with Welch’s correction for unequal variances
    t_stat, p_value = ttest_ind(values1, values2, equal_var=False)
    
    # Print results
    print(f"{key}:")
    print(f"  llm_1: {mean1:.2f} ± {std_dev1:.2f}")
    print(f"  llm_3: {mean2:.2f} ± {std_dev2:.2f}")
    print(f"  p-value: {p_value:.10f}")  

100%|██████████| 6093/6093 [27:10<00:00,  3.74it/s]

total_wc:
  llm_1: 621.31 ± 249.47
  llm_3: 989.58 ± 94.92
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.43 ± 0.37
  llm_3: 4.48 ± 0.25
  p-value: 0.0000000000
n_sentences:
  llm_1: 40.22 ± 15.91
  llm_3: 65.20 ± 11.34
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 15.72 ± 3.61
  llm_3: 15.51 ± 2.31
  p-value: 0.0000368788
ttr:
  llm_1: 0.44 ± 0.10
  llm_3: 0.40 ± 0.04
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.10
  llm_3: 1.30 ± 0.07
  p-value: 0.9910348132
stopwords_frac:
  llm_1: 56.29 ± 6.58
  llm_3: 56.06 ± 3.66
  p-value: 0.0095144892
flesch_reading_ease:
  llm_1: 80.95 ± 8.76
  llm_3: 81.17 ± 6.87
  p-value: 0.1069458150
entropy:
  llm_1: 26.03 ± 13.10
  llm_3: 40.30 ± 4.75
  p-value: 0.0000000000
perplexity:
  llm_1: 1449076862962180353152875207888493519293563325966298079010272959911758332049947410390642156043386853958061026031987783958279938336653975821056201639897017267350692429824.00 ± inf
  llm_3: 10162588300539433320448.00 ± 515430681320




n=4

In [52]:
# Four-author data. `author_list` joins the author columns in order with '_'
# so rows can be counted per ordered author combination.
df_4 = pd.read_csv('4_author.csv')
df_4['author_list'] = df_4['author1']+'_'+df_4['author2'] + '_'+df_4['author3'] + '_' + df_4['author4']
df_4.groupby('author_list').size()

author_list
gemma_mistral_olmo_llama    455
gemma_olmo_llama_mistral    447
llama_gemma_mistral_olmo    478
llama_gemma_olmo_mistral    475
llama_olmo_gemma_mistral    473
llama_olmo_mistral_gemma    470
mistral_gemma_olmo_llama    457
mistral_llama_gemma_olmo    475
mistral_llama_olmo_gemma    468
mistral_olmo_gemma_llama    453
olmo_gemma_llama_mistral    457
olmo_gemma_mistral_llama    458
olmo_llama_gemma_mistral    471
olmo_llama_mistral_gemma    469
olmo_mistral_llama_gemma    449
dtype: int64

In [53]:
# Featurise every four-author story, then compare against the pooled
# single-author baseline in `llm_values_1`.
llm_values_n = []
for index, row in tqdm(df_4.iterrows(), total=df_4.shape[0]):
    llm_values_n.append(get_linguistic_feature(row['llm_story']))
# Pivot both samples into {feature: [values...]} for per-feature statistics.
values_dict1 = collect_values(llm_values_1)
values_dict2 = collect_values(llm_values_n)

# Calculate and print the mean, standard deviation, and p-value for each key
for key in values_dict1.keys():
    values1 = values_dict1[key]
    values2 = values_dict2[key]
    mean1 = np.mean(values1)
    std_dev1 = np.std(values1)
    mean2 = np.mean(values2)
    std_dev2 = np.std(values2)
    
    # Perform independent t-test with Welch’s correction for unequal variances
    t_stat, p_value = ttest_ind(values1, values2, equal_var=False)
    
    # Print results
    print(f"{key}:")
    print(f"  llm_1: {mean1:.2f} ± {std_dev1:.2f}")
    print(f"  llm_4: {mean2:.2f} ± {std_dev2:.2f}")
    print(f"  p-value: {p_value:.10f}")  

100%|██████████| 6955/6955 [29:07<00:00,  3.98it/s]

total_wc:
  llm_1: 621.31 ± 249.47
  llm_4: 943.34 ± 70.66
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.43 ± 0.37
  llm_4: 4.49 ± 0.25
  p-value: 0.0000000000
n_sentences:
  llm_1: 40.22 ± 15.91
  llm_4: 60.41 ± 9.88
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 15.72 ± 3.61
  llm_4: 15.94 ± 2.23
  p-value: 0.0000225347
ttr:
  llm_1: 0.44 ± 0.10
  llm_4: 0.41 ± 0.03
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.10
  llm_4: 1.31 ± 0.07
  p-value: 0.0000457922
stopwords_frac:
  llm_1: 56.29 ± 6.58
  llm_4: 55.94 ± 3.40
  p-value: 0.0000503220
flesch_reading_ease:
  llm_1: 80.95 ± 8.76
  llm_4: 80.25 ± 7.11
  p-value: 0.0000001474
entropy:
  llm_1: 26.03 ± 13.10
  llm_4: 37.97 ± 4.07
  p-value: 0.0000000000
perplexity:
  llm_1: 1449076862962180353152875207888493519293563325966298079010272959911758332049947410390642156043386853958061026031987783958279938336653975821056201639897017267350692429824.00 ± inf
  llm_4: 677016273764254819203821653196800.00 ± 564




n = 5

In [54]:
# Five-author data. `author_list` joins the author columns in order with '_'
# so rows can be counted per ordered author combination.
df_5 = pd.read_csv('5_author.csv')
df_5['author_list'] = df_5['author1']+'_'+df_5['author2'] + '_'+df_5['author3'] + '_' + df_5['author4']+'_'+df_5['author5']
df_5.groupby('author_list').size()

author_list
gemma_llama_olmo_mistral_orca    410
gemma_orca_llama_mistral_olmo    444
llama_gemma_olmo_orca_mistral    448
llama_gemma_orca_mistral_olmo    451
llama_olmo_orca_gemma_mistral    418
mistral_gemma_llama_olmo_orca    433
mistral_llama_gemma_orca_olmo    400
mistral_olmo_gemma_orca_llama    387
mistral_orca_gemma_olmo_llama    390
olmo_gemma_orca_mistral_llama    213
olmo_llama_mistral_orca_gemma    201
olmo_mistral_gemma_llama_orca    202
olmo_mistral_llama_orca_gemma    214
olmo_mistral_orca_gemma_llama    198
orca_mistral_olmo_llama_gemma    411
dtype: int64

In [55]:
# Featurise every five-author story, then compare against the pooled
# single-author baseline in `llm_values_1`.
llm_values_n = []
for index, row in tqdm(df_5.iterrows(), total=df_5.shape[0]):
    llm_values_n.append(get_linguistic_feature(row['llm_story']))
# Pivot both samples into {feature: [values...]} for per-feature statistics.
values_dict1 = collect_values(llm_values_1)
values_dict2 = collect_values(llm_values_n)

# Calculate and print the mean, standard deviation, and p-value for each key
for key in values_dict1.keys():
    values1 = values_dict1[key]
    values2 = values_dict2[key]
    mean1 = np.mean(values1)
    std_dev1 = np.std(values1)
    mean2 = np.mean(values2)
    std_dev2 = np.std(values2)

    # Perform independent t-test with Welch's correction for unequal variances
    t_stat, p_value = ttest_ind(values1, values2, equal_var=False)

    # Print results. The second sample is the five-author data, so it is
    # labelled llm_5 (the copy-pasted cell still said llm_4).
    print(f"{key}:")
    print(f"  llm_1: {mean1:.2f} ± {std_dev1:.2f}")
    print(f"  llm_5: {mean2:.2f} ± {std_dev2:.2f}")
    print(f"  p-value: {p_value:.10f}")

100%|██████████| 5220/5220 [19:50<00:00,  4.39it/s]

total_wc:
  llm_1: 621.31 ± 249.47
  llm_4: 859.35 ± 60.68
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.43 ± 0.37
  llm_4: 4.47 ± 0.26
  p-value: 0.0000000000
n_sentences:
  llm_1: 40.22 ± 15.91
  llm_4: 55.25 ± 9.03
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 15.72 ± 3.61
  llm_4: 15.88 ± 2.23
  p-value: 0.0027719747
ttr:
  llm_1: 0.44 ± 0.10
  llm_4: 0.42 ± 0.03
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.10
  llm_4: 1.30 ± 0.07
  p-value: 0.6869240220
stopwords_frac:
  llm_1: 56.29 ± 6.58
  llm_4: 55.84 ± 3.58
  p-value: 0.0000009142
flesch_reading_ease:
  llm_1: 80.95 ± 8.76
  llm_4: 80.85 ± 7.24
  p-value: 0.4521086463
entropy:
  llm_1: 26.03 ± 13.10
  llm_4: 34.69 ± 3.58
  p-value: 0.0000000000
perplexity:
  llm_1: 1449076862962180353152875207888493519293563325966298079010272959911758332049947410390642156043386853958061026031987783958279938336653975821056201639897017267350692429824.00 ± inf
  llm_4: 148349059703247328.00 ± 123192494808885734




In [57]:
# NaN-aware re-print of the most recent comparison. It reads `values_dict1` /
# `values_dict2` left in the kernel by the previous cell.
# NOTE(review): the llm_5 label assumes this runs right after the n = 5 cell,
# which is what the saved output matches -- confirm.
for key in values_dict1.keys():
    # Drop NaN entries up front so the mean, the standard deviation and the
    # t-test all see exactly the same observations for both samples.
    values1 = np.asarray(values_dict1[key], dtype=float)
    values1 = values1[~np.isnan(values1)]
    values2 = np.asarray(values_dict2[key], dtype=float)
    values2 = values2[~np.isnan(values2)]
    mean1 = np.mean(values1)
    std_dev1 = np.std(values1)
    mean2 = np.mean(values2)
    std_dev2 = np.std(values2)

    # Perform independent t-test with Welch's correction for unequal variances
    t_stat, p_value = ttest_ind(values1, values2, equal_var=False)

    # Print results
    print(f"{key}:")
    print(f"  llm_1: {mean1:.2f} ± {std_dev1:.2f}")
    print(f"  llm_5: {mean2:.2f} ± {std_dev2:.2f}")
    print(f"  p-value: {p_value:.10f}")

total_wc:
  llm_1: 621.31 ± 249.47
  llm_3: 859.35 ± 60.68
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.43 ± 0.37
  llm_3: 4.47 ± 0.26
  p-value: 0.0000000000
n_sentences:
  llm_1: 40.22 ± 15.91
  llm_3: 55.25 ± 9.03
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 15.72 ± 3.61
  llm_3: 15.88 ± 2.23
  p-value: 0.0027719747
ttr:
  llm_1: 0.44 ± 0.10
  llm_3: 0.42 ± 0.03
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.10
  llm_3: 1.30 ± 0.07
  p-value: 0.6869240220
stopwords_frac:
  llm_1: 56.29 ± 6.58
  llm_3: 55.84 ± 3.58
  p-value: 0.0000009142
flesch_reading_ease:
  llm_1: 80.95 ± 8.76
  llm_3: 80.85 ± 7.24
  p-value: 0.4521086463
entropy:
  llm_1: 26.03 ± 13.10
  llm_3: 34.69 ± 3.58
  p-value: 0.0000000000
perplexity:
  llm_1: 1449076862962180353152875207888493519293563325966298079010272959911758332049947410390642156043386853958061026031987783958279938336653975821056201639897017267350692429824.00 ± inf
  llm_3: 148349059703247328.00 ± 123192494808885734

## How differently each LLM behaves in different parts of the story

In [65]:
def compare_values_with_part1(llm_values_1, llm_values_n):
    """Print per-feature statistics comparing two lists of feature dicts.

    Parameters
    ----------
    llm_values_1 : list of dict
        Feature dicts of the baseline sample, printed as ``llm_1``.
    llm_values_n : list of dict
        Feature dicts of the sample compared against it, printed as ``llm_n``.

    For every feature key of the baseline this prints the mean ± standard
    deviation of both samples and the p-value of a Welch (unequal-variance)
    t-test. NaN observations are removed from each sample before all three
    statistics are computed, so the p-value is based on the same data as the
    printed means instead of turning into ``nan`` whenever one value is NaN.

    Returns None; the result is the printed report.
    """
    values_dict1 = collect_values(llm_values_1)
    values_dict2 = collect_values(llm_values_n)

    # Calculate and print the mean, standard deviation, and p-value for each key
    for key in values_dict1.keys():
        values1 = np.asarray(values_dict1[key], dtype=float)
        values1 = values1[~np.isnan(values1)]
        values2 = np.asarray(values_dict2[key], dtype=float)
        values2 = values2[~np.isnan(values2)]
        mean1 = np.mean(values1)
        std_dev1 = np.std(values1)
        mean2 = np.mean(values2)
        std_dev2 = np.std(values2)

        # Perform independent t-test with Welch's correction for unequal variances
        t_stat, p_value = ttest_ind(values1, values2, equal_var=False)

        # Print results
        print(f"{key}:")
        print(f"  llm_1: {mean1:.2f} ± {std_dev1:.2f}")
        print(f"  llm_n: {mean2:.2f} ± {std_dev2:.2f}")
        print(f"  p-value: {p_value:.10f}")

In [60]:
# NOTE: this rebinds `df`, which earlier held the contents of '1_author.csv';
# from here on `df` is the five-author data with its per-part columns.
df = pd.read_csv('5_author.csv')
df.columns

Index(['Unnamed: 0', 'prompt', 'human_story', 'author1', 'author2', 'author3',
       'author4', 'author5', 'part1', 'part2', 'part3', 'part4', 'part5',
       'llm_story'],
      dtype='object')

In [61]:
# Distinct values of `author1` in the five-author data. This rebinds the
# `authors` list used by the n = 1 loop.
authors = df.author1.unique()
authors

array(['llama', 'olmo', 'mistral', 'gemma', 'orca'], dtype=object)

In [66]:
# For each author, compare the features of the text it wrote as part 1 with the
# text it wrote as part 2, 3, 4 and 5 of the five-author stories.
# NOTE: `llm_values_1` is rebound here, so the pooled single-author baseline
# built in the n = 1 section is no longer available after this cell.
for author in authors:
    print(author)
    print('='*100)
    # Baseline sample: rows where this author wrote the first part.
    df_1 = df[df['author1']==author]
    llm_values_1 = []
    for index, row in tqdm(df_1.iterrows(), total=df_1.shape[0]):
        llm_values_1.append(get_linguistic_feature(row['part1']))

    # One loop replaces the four copy-pasted stanzas for parts 2..5; the
    # `author{k}` / `part{k}` column names are derived from the part number.
    for part_no in range(2, 6):
        print('-'*100)
        print(f'Comparision with part-{part_no}')
        df_n = df[df[f'author{part_no}']==author]
        llm_values_n = []
        for index, row in tqdm(df_n.iterrows(), total=df_n.shape[0]):
            llm_values_n.append(get_linguistic_feature(row[f'part{part_no}']))
        compare_values_with_part1(llm_values_1,llm_values_n)

    print('='*100)

llama


100%|██████████| 1317/1317 [00:59<00:00, 22.27it/s]


----------------------------------------------------------------------------------------------------
Comparision with part-2


100%|██████████| 1011/1011 [00:44<00:00, 22.69it/s]


total_wc:
  llm_1: 172.51 ± 19.91
  llm_n: 170.88 ± 13.28
  p-value: 0.0179200125
avg_word_len:
  llm_1: 4.32 ± 0.28
  llm_n: 4.33 ± 0.30
  p-value: 0.2638956300
n_sentences:
  llm_1: 10.93 ± 2.66
  llm_n: 10.34 ± 2.41
  p-value: 0.0000000382
sentence_length_mean:
  llm_1: 16.53 ± 3.64
  llm_n: 17.31 ± 3.68
  p-value: 0.0000004863
ttr:
  llm_1: 0.60 ± 0.05
  llm_n: 0.61 ± 0.05
  p-value: 0.2985939907
syllables_per_token_mean:
  llm_1: 1.26 ± 0.08
  llm_n: 1.26 ± 0.08
  p-value: 0.3148460089
stopwords_frac:
  llm_1: 59.14 ± 5.56
  llm_n: 57.20 ± 4.85
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 83.11 ± 8.35
  llm_n: 82.61 ± 8.50
  p-value: 0.1578329165
entropy:
  llm_1: 6.83 ± 0.98
  llm_n: 6.84 ± 0.81
  p-value: 0.7674865467
perplexity:
  llm_1: 1361.46 ± 1311.87
  llm_n: 1282.25 ± 1182.96
  p-value: 0.1270725640
per_word_perplexity:
  llm_1: 6.49 ± 5.91
  llm_n: 6.24 ± 5.36
  p-value: 0.2939786446
dependency_distance_mean:
  llm_1: 2.63 ± 0.25
  llm_n: 2.68 ± 0.25
  p-value: 

100%|██████████| 1091/1091 [00:48<00:00, 22.41it/s]


total_wc:
  llm_1: 172.51 ± 19.91
  llm_n: 173.35 ± 15.29
  p-value: 0.2421826789
avg_word_len:
  llm_1: 4.32 ± 0.28
  llm_n: 4.32 ± 0.32
  p-value: 0.8775417410
n_sentences:
  llm_1: 10.93 ± 2.66
  llm_n: 10.60 ± 2.51
  p-value: 0.0018780650
sentence_length_mean:
  llm_1: 16.53 ± 3.64
  llm_n: 17.15 ± 3.70
  p-value: 0.0000387575
ttr:
  llm_1: 0.60 ± 0.05
  llm_n: 0.59 ± 0.05
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.26 ± 0.08
  llm_n: 1.26 ± 0.09
  p-value: 0.3434504960
stopwords_frac:
  llm_1: 59.14 ± 5.56
  llm_n: 57.80 ± 4.92
  p-value: 0.0000000004
flesch_reading_ease:
  llm_1: 83.11 ± 8.35
  llm_n: 82.75 ± 9.00
  p-value: 0.3154746061
entropy:
  llm_1: 6.83 ± 0.98
  llm_n: 7.02 ± 0.89
  p-value: 0.0000007134
perplexity:
  llm_1: 1361.46 ± 1311.87
  llm_n: 1607.96 ± 1623.86
  p-value: 0.0000559105
per_word_perplexity:
  llm_1: 6.49 ± 5.91
  llm_n: 7.69 ± 7.28
  p-value: 0.0000115658
dependency_distance_mean:
  llm_1: 2.63 ± 0.25
  llm_n: 2.68 ± 0.27
  p-value: 

100%|██████████| 613/613 [00:28<00:00, 21.68it/s]


total_wc:
  llm_1: 172.51 ± 19.91
  llm_n: 174.76 ± 16.18
  p-value: 0.0086869790
avg_word_len:
  llm_1: 4.32 ± 0.28
  llm_n: 4.36 ± 0.34
  p-value: 0.0124314772
n_sentences:
  llm_1: 10.93 ± 2.66
  llm_n: 10.53 ± 2.58
  p-value: 0.0020491446
sentence_length_mean:
  llm_1: 16.53 ± 3.64
  llm_n: 17.45 ± 3.89
  p-value: 0.0000011327
ttr:
  llm_1: 0.60 ± 0.05
  llm_n: 0.58 ± 0.05
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.26 ± 0.08
  llm_n: 1.28 ± 0.09
  p-value: 0.0004533130
stopwords_frac:
  llm_1: 59.14 ± 5.56
  llm_n: 58.41 ± 5.09
  p-value: 0.0043566721
flesch_reading_ease:
  llm_1: 83.11 ± 8.35
  llm_n: 80.88 ± 10.10
  p-value: 0.0000020942
entropy:
  llm_1: 6.83 ± 0.98
  llm_n: 6.99 ± 0.94
  p-value: 0.0006340255
perplexity:
  llm_1: 1361.46 ± 1311.87
  llm_n: 1629.76 ± 1723.10
  p-value: 0.0006557870
per_word_perplexity:
  llm_1: 6.49 ± 5.91
  llm_n: 7.71 ± 7.63
  p-value: 0.0004660569
dependency_distance_mean:
  llm_1: 2.63 ± 0.25
  llm_n: 2.68 ± 0.24
  p-value:

100%|██████████| 1188/1188 [00:52<00:00, 22.72it/s]


total_wc:
  llm_1: 172.51 ± 19.91
  llm_n: 172.23 ± 19.33
  p-value: 0.7218469276
avg_word_len:
  llm_1: 4.32 ± 0.28
  llm_n: 4.36 ± 0.32
  p-value: 0.0007762945
n_sentences:
  llm_1: 10.93 ± 2.66
  llm_n: 9.66 ± 2.40
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 16.53 ± 3.64
  llm_n: 18.66 ± 3.82
  p-value: 0.0000000000
ttr:
  llm_1: 0.60 ± 0.05
  llm_n: 0.58 ± 0.06
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.26 ± 0.08
  llm_n: 1.27 ± 0.09
  p-value: 0.0030893840
stopwords_frac:
  llm_1: 59.14 ± 5.56
  llm_n: 58.41 ± 4.70
  p-value: 0.0003587606
flesch_reading_ease:
  llm_1: 83.11 ± 8.35
  llm_n: 80.13 ± 9.19
  p-value: 0.0000000000
entropy:
  llm_1: 6.83 ± 0.98
  llm_n: 6.87 ± 1.03
  p-value: 0.2989780430
perplexity:
  llm_1: 1361.46 ± 1311.87
  llm_n: 1500.89 ± 1792.09
  p-value: 0.0278465971
per_word_perplexity:
  llm_1: 6.49 ± 5.91
  llm_n: 7.23 ± 7.99
  p-value: 0.0089999056
dependency_distance_mean:
  llm_1: 2.63 ± 0.25
  llm_n: 2.73 ± 0.24
  p-value: 0

100%|██████████| 1028/1028 [00:44<00:00, 23.21it/s]


----------------------------------------------------------------------------------------------------
Comparision with part-2


100%|██████████| 805/805 [00:41<00:00, 19.36it/s]


total_wc:
  llm_1: 168.01 ± 8.69
  llm_n: 197.91 ± 18.93
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.42 ± 0.30
  llm_n: 4.35 ± 0.32
  p-value: 0.0000020391
n_sentences:
  llm_1: 10.23 ± 2.28
  llm_n: 13.64 ± 3.60
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 17.23 ± 3.89
  llm_n: 15.38 ± 3.77
  p-value: 0.0000000000
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.57 ± 0.06
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.28 ± 0.09
  llm_n: 1.28 ± 0.09
  p-value: 0.0673831901
stopwords_frac:
  llm_1: 54.66 ± 5.04
  llm_n: 57.65 ± 5.86
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 80.78 ± 9.01
  llm_n: 83.31 ± 9.87
  p-value: 0.0000000181
entropy:
  llm_1: 6.50 ± 0.68
  llm_n: 8.15 ± 1.14
  p-value: 0.0000000000
perplexity:
  llm_1: 840.67 ± 661.02
  llm_n: 5963.67 ± 7092.64
  p-value: 0.0000000000
per_word_perplexity:
  llm_1: 4.28 ± 3.25
  llm_n: 24.43 ± 28.01
  p-value: 0.0000000000
dependency_distance_mean:
  llm_1: 2.66 ± 0.27
  llm_n: 2.54 ± 0.29
  p-value: 0

100%|██████████| 1269/1269 [01:04<00:00, 19.57it/s]


total_wc:
  llm_1: 168.01 ± 8.69
  llm_n: 194.89 ± 23.64
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.42 ± 0.30
  llm_n: 4.38 ± 0.33
  p-value: 0.0003659052
n_sentences:
  llm_1: 10.23 ± 2.28
  llm_n: 12.66 ± 3.47
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 17.23 ± 3.89
  llm_n: 16.36 ± 4.13
  p-value: 0.0000002204
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.58 ± 0.06
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.28 ± 0.09
  llm_n: 1.29 ± 0.09
  p-value: 0.4201272425
stopwords_frac:
  llm_1: 54.66 ± 5.04
  llm_n: 57.06 ± 5.65
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 80.78 ± 9.01
  llm_n: 81.41 ± 9.81
  p-value: 0.1094797002
entropy:
  llm_1: 6.50 ± 0.68
  llm_n: 7.96 ± 1.27
  p-value: 0.0000000000
perplexity:
  llm_1: 840.67 ± 661.02
  llm_n: 5316.66 ± 6942.37
  p-value: 0.0000000000
per_word_perplexity:
  llm_1: 4.28 ± 3.25
  llm_n: 22.02 ± 27.35
  p-value: 0.0000000000
dependency_distance_mean:
  llm_1: 2.66 ± 0.27
  llm_n: 2.58 ± 0.34
  p-value: 0

100%|██████████| 823/823 [00:41<00:00, 19.62it/s]


total_wc:
  llm_1: 168.01 ± 8.69
  llm_n: 192.64 ± 26.60
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.42 ± 0.30
  llm_n: 4.41 ± 0.36
  p-value: 0.2852021114
n_sentences:
  llm_1: 10.23 ± 2.28
  llm_n: 12.42 ± 3.65
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 17.23 ± 3.89
  llm_n: 16.58 ± 4.37
  p-value: 0.0008346220
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.58 ± 0.06
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.28 ± 0.09
  llm_n: 1.29 ± 0.10
  p-value: 0.0175148781
stopwords_frac:
  llm_1: 54.66 ± 5.04
  llm_n: 56.78 ± 5.55
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 80.78 ± 9.01
  llm_n: 80.55 ± 10.73
  p-value: 0.6194104130
entropy:
  llm_1: 6.50 ± 0.68
  llm_n: 7.86 ± 1.41
  p-value: 0.0000000000
perplexity:
  llm_1: 840.67 ± 661.02
  llm_n: 5225.43 ± 6362.26
  p-value: 0.0000000000
per_word_perplexity:
  llm_1: 4.28 ± 3.25
  llm_n: 21.65 ± 25.23
  p-value: 0.0000000000
dependency_distance_mean:
  llm_1: 2.66 ± 0.27
  llm_n: 2.58 ± 0.27
  p-value: 

100%|██████████| 1295/1295 [01:04<00:00, 20.12it/s]


total_wc:
  llm_1: 168.01 ± 8.69
  llm_n: 191.41 ± 30.28
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.42 ± 0.30
  llm_n: 4.42 ± 0.34
  p-value: 0.8185814127
n_sentences:
  llm_1: 10.23 ± 2.28
  llm_n: 12.19 ± 3.59
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 17.23 ± 3.89
  llm_n: 16.68 ± 4.16
  p-value: 0.0010943821
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.57 ± 0.07
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.28 ± 0.09
  llm_n: 1.29 ± 0.10
  p-value: 0.0056727663
stopwords_frac:
  llm_1: 54.66 ± 5.04
  llm_n: 56.69 ± 5.47
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 80.78 ± 9.01
  llm_n: 80.45 ± 10.20
  p-value: 0.3980000201
entropy:
  llm_1: 6.50 ± 0.68
  llm_n: 7.85 ± 1.48
  p-value: 0.0000000000
perplexity:
  llm_1: 840.67 ± 661.02
  llm_n: 5325.62 ± 6911.90
  p-value: 0.0000000000
per_word_perplexity:
  llm_1: 4.28 ± 3.25
  llm_n: 22.04 ± 27.27
  p-value: 0.0000000000
dependency_distance_mean:
  llm_1: 2.66 ± 0.27
  llm_n: 2.59 ± 0.31
  p-value: 

100%|██████████| 1610/1610 [01:14<00:00, 21.65it/s]


----------------------------------------------------------------------------------------------------
Comparision with part-2


100%|██████████| 1025/1025 [00:49<00:00, 20.84it/s]


total_wc:
  llm_1: 177.25 ± 12.61
  llm_n: 182.24 ± 12.88
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.49 ± 0.32
  llm_n: 4.39 ± 0.30
  p-value: 0.0000000000
n_sentences:
  llm_1: 11.89 ± 2.53
  llm_n: 12.78 ± 2.71
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 15.54 ± 3.28
  llm_n: 14.86 ± 3.10
  p-value: 0.0000000822
ttr:
  llm_1: 0.64 ± 0.04
  llm_n: 0.62 ± 0.04
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.09
  llm_n: 1.27 ± 0.08
  p-value: 0.0000000000
stopwords_frac:
  llm_1: 53.78 ± 5.07
  llm_n: 56.10 ± 4.82
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 81.04 ± 8.58
  llm_n: 83.99 ± 8.40
  p-value: 0.0000000000
entropy:
  llm_1: 7.29 ± 0.81
  llm_n: 7.47 ± 0.84
  p-value: 0.0000000313
perplexity:
  llm_1: 2018.08 ± 1848.89
  llm_n: 2511.62 ± 2578.75
  p-value: 0.0000001201
per_word_perplexity:
  llm_1: 9.38 ± 8.09
  llm_n: 11.33 ± 11.06
  p-value: 0.0000012212
dependency_distance_mean:
  llm_1: 2.60 ± 0.24
  llm_n: 2.55 ± 0.22
  p-value

100%|██████████| 201/201 [00:09<00:00, 21.27it/s]


total_wc:
  llm_1: 177.25 ± 12.61
  llm_n: 178.09 ± 22.71
  p-value: 0.6064777328
avg_word_len:
  llm_1: 4.49 ± 0.32
  llm_n: 4.42 ± 0.34
  p-value: 0.0067363213
n_sentences:
  llm_1: 11.89 ± 2.53
  llm_n: 11.61 ± 2.92
  p-value: 0.1864784394
sentence_length_mean:
  llm_1: 15.54 ± 3.28
  llm_n: 16.11 ± 3.50
  p-value: 0.0319511104
ttr:
  llm_1: 0.64 ± 0.04
  llm_n: 0.61 ± 0.05
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.09
  llm_n: 1.29 ± 0.10
  p-value: 0.0775318804
stopwords_frac:
  llm_1: 53.78 ± 5.07
  llm_n: 56.43 ± 4.69
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 81.04 ± 8.58
  llm_n: 81.59 ± 10.07
  p-value: 0.4586410853
entropy:
  llm_1: 7.29 ± 0.81
  llm_n: 7.23 ± 1.12
  p-value: 0.4763001760
perplexity:
  llm_1: 2018.08 ± 1848.89
  llm_n: 2291.33 ± 2499.77
  p-value: 0.1360763194
per_word_perplexity:
  llm_1: 9.38 ± 8.09
  llm_n: 10.35 ± 10.66
  p-value: 0.2139176983
dependency_distance_mean:
  llm_1: 2.60 ± 0.24
  llm_n: 2.61 ± 0.21
  p-valu

100%|██████████| 1518/1518 [01:11<00:00, 21.32it/s]


total_wc:
  llm_1: 177.25 ± 12.61
  llm_n: 178.82 ± 19.47
  p-value: 0.0078541919
avg_word_len:
  llm_1: 4.49 ± 0.32
  llm_n: 4.41 ± 0.32
  p-value: 0.0000000000
n_sentences:
  llm_1: 11.89 ± 2.53
  llm_n: 11.74 ± 2.80
  p-value: 0.0996951782
sentence_length_mean:
  llm_1: 15.54 ± 3.28
  llm_n: 15.98 ± 3.52
  p-value: 0.0003766743
ttr:
  llm_1: 0.64 ± 0.04
  llm_n: 0.61 ± 0.04
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.09
  llm_n: 1.28 ± 0.09
  p-value: 0.0000001525
stopwords_frac:
  llm_1: 53.78 ± 5.07
  llm_n: 56.21 ± 4.71
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 81.04 ± 8.58
  llm_n: 82.02 ± 9.09
  p-value: 0.0020495947
entropy:
  llm_1: 7.29 ± 0.81
  llm_n: 7.32 ± 1.04
  p-value: 0.2791024802
perplexity:
  llm_1: 2018.08 ± 1848.89
  llm_n: 2378.86 ± 2528.46
  p-value: 0.0000061063
per_word_perplexity:
  llm_1: 9.38 ± 8.09
  llm_n: 10.82 ± 10.76
  p-value: 0.0000257480
dependency_distance_mean:
  llm_1: 2.60 ± 0.24
  llm_n: 2.60 ± 0.23
  p-value

100%|██████████| 866/866 [00:39<00:00, 21.69it/s]


total_wc:
  llm_1: 177.25 ± 12.61
  llm_n: 178.15 ± 22.69
  p-value: 0.2782941172
avg_word_len:
  llm_1: 4.49 ± 0.32
  llm_n: 4.45 ± 0.33
  p-value: 0.0009425928
n_sentences:
  llm_1: 11.89 ± 2.53
  llm_n: 11.18 ± 2.68
  p-value: 0.0000000002
sentence_length_mean:
  llm_1: 15.54 ± 3.28
  llm_n: 16.64 ± 3.58
  p-value: 0.0000000000
ttr:
  llm_1: 0.64 ± 0.04
  llm_n: 0.61 ± 0.05
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.30 ± 0.09
  llm_n: 1.30 ± 0.10
  p-value: 0.9546966327
stopwords_frac:
  llm_1: 53.78 ± 5.07
  llm_n: 56.50 ± 4.72
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 81.04 ± 8.58
  llm_n: 79.91 ± 9.44
  p-value: 0.0033619803
entropy:
  llm_1: 7.29 ± 0.81
  llm_n: 7.29 ± 1.14
  p-value: 0.9165784291
perplexity:
  llm_1: 2018.08 ± 1848.89
  llm_n: 2409.30 ± 2609.89
  p-value: 0.0000959810
per_word_perplexity:
  llm_1: 9.38 ± 8.09
  llm_n: 10.99 ± 11.18
  p-value: 0.0001855808
dependency_distance_mean:
  llm_1: 2.60 ± 0.24
  llm_n: 2.62 ± 0.24
  p-value

100%|██████████| 854/854 [00:39<00:00, 21.50it/s]


----------------------------------------------------------------------------------------------------
Comparision with part-2


100%|██████████| 1545/1545 [01:05<00:00, 23.59it/s]


total_wc:
  llm_1: 172.97 ± 16.47
  llm_n: 157.17 ± 36.22
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.80 ± 0.29
  llm_n: 4.76 ± 0.34
  p-value: 0.0031048579
n_sentences:
  llm_1: 12.61 ± 2.65
  llm_n: 11.70 ± 3.72
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 14.20 ± 2.65
  llm_n: 14.05 ± 2.78
  p-value: 0.1854688309
ttr:
  llm_1: 0.67 ± 0.04
  llm_n: 0.67 ± 0.05
  p-value: 0.0201563205
syllables_per_token_mean:
  llm_1: 1.38 ± 0.10
  llm_n: 1.36 ± 0.10
  p-value: 0.0036165083
stopwords_frac:
  llm_1: 46.51 ± 3.97
  llm_n: 49.56 ± 4.74
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 75.95 ± 8.53
  llm_n: 77.13 ± 9.70
  p-value: 0.0018982316
entropy:
  llm_1: 6.97 ± 0.87
  llm_n: 6.24 ± 1.59
  p-value: 0.0000000000
perplexity:
  llm_1: 1527.20 ± 1658.08
  llm_n: 1210.79 ± 1590.61
  p-value: 0.0000060803
per_word_perplexity:
  llm_1: 7.05 ± 6.83
  llm_n: 5.59 ± 6.67
  p-value: 0.0000005286
dependency_distance_mean:
  llm_1: 2.47 ± 0.21
  llm_n: 2.46 ± 0.22
  p-value: 

100%|██████████| 1379/1379 [00:46<00:00, 29.62it/s]


total_wc:
  llm_1: 172.97 ± 16.47
  llm_n: 124.51 ± 46.13
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.80 ± 0.29
  llm_n: 4.81 ± 0.42
  p-value: 0.5782471887
n_sentences:
  llm_1: 12.61 ± 2.65
  llm_n: 9.19 ± 4.12
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 14.20 ± 2.65
  llm_n: 14.32 ± 3.05
  p-value: 0.3558719642
ttr:
  llm_1: 0.67 ± 0.04
  llm_n: 0.70 ± 0.07
  p-value: 0.0000000000
syllables_per_token_mean:
  llm_1: 1.38 ± 0.10
  llm_n: 1.39 ± 0.13
  p-value: 0.0860631585
stopwords_frac:
  llm_1: 46.51 ± 3.97
  llm_n: 49.99 ± 5.20
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 75.95 ± 8.53
  llm_n: 75.13 ± 12.29
  p-value: 0.0646353800
entropy:
  llm_1: 6.97 ± 0.87
  llm_n: 4.95 ± 1.99
  p-value: 0.0000000000
perplexity:
  llm_1: 1527.20 ± 1658.08
  llm_n: 703.67 ± 1241.23
  p-value: 0.0000000000
per_word_perplexity:
  llm_1: 7.05 ± 6.83
  llm_n: 3.33 ± 5.33
  p-value: 0.0000000000
dependency_distance_mean:
  llm_1: 2.47 ± 0.21
  llm_n: 2.48 ± 0.28
  p-value: 0

100%|██████████| 616/616 [00:21<00:00, 29.04it/s]


total_wc:
  llm_1: 172.97 ± 16.47
  llm_n: 129.76 ± 47.16
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.80 ± 0.29
  llm_n: 4.83 ± 0.42
  p-value: 0.0755270156
n_sentences:
  llm_1: 12.61 ± 2.65
  llm_n: 9.30 ± 4.15
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 14.20 ± 2.65
  llm_n: 14.71 ± 2.91
  p-value: 0.0007435376
ttr:
  llm_1: 0.67 ± 0.04
  llm_n: 0.69 ± 0.07
  p-value: 0.0000008719
syllables_per_token_mean:
  llm_1: 1.38 ± 0.10
  llm_n: 1.39 ± 0.13
  p-value: 0.0102775721
stopwords_frac:
  llm_1: 46.51 ± 3.97
  llm_n: 50.51 ± 5.35
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 75.95 ± 8.53
  llm_n: 74.11 ± 11.81
  p-value: 0.0010324759
entropy:
  llm_1: 6.97 ± 0.87
  llm_n: 5.12 ± 2.04
  p-value: 0.0000000000
perplexity:
  llm_1: 1527.20 ± 1658.08
  llm_n: 828.02 ± 1389.55
  p-value: 0.0000000000
per_word_perplexity:
  llm_1: 7.05 ± 6.83
  llm_n: 3.86 ± 5.91
  p-value: 0.0000000000
dependency_distance_mean:
  llm_1: 2.47 ± 0.21
  llm_n: 2.49 ± 0.24
  p-value: 0

100%|██████████| 826/826 [00:29<00:00, 27.78it/s]


total_wc:
  llm_1: 172.97 ± 16.47
  llm_n: 133.75 ± 46.90
  p-value: 0.0000000000
avg_word_len:
  llm_1: 4.80 ± 0.29
  llm_n: 4.84 ± 0.44
  p-value: 0.0223985011
n_sentences:
  llm_1: 12.61 ± 2.65
  llm_n: 9.59 ± 4.29
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 14.20 ± 2.65
  llm_n: 14.86 ± 3.21
  p-value: 0.0000047137
ttr:
  llm_1: 0.67 ± 0.04
  llm_n: 0.68 ± 0.06
  p-value: 0.0029071703
syllables_per_token_mean:
  llm_1: 1.38 ± 0.10
  llm_n: 1.40 ± 0.14
  p-value: 0.0000609947
stopwords_frac:
  llm_1: 46.51 ± 3.97
  llm_n: 50.38 ± 5.32
  p-value: 0.0000000000
flesch_reading_ease:
  llm_1: 75.95 ± 8.53
  llm_n: 73.28 ± 12.96
  p-value: 0.0000007981
entropy:
  llm_1: 6.97 ± 0.87
  llm_n: 5.31 ± 2.05
  p-value: 0.0000000000
perplexity:
  llm_1: 1527.20 ± 1658.08
  llm_n: 955.59 ± 1778.28
  p-value: 0.0000000000
per_word_perplexity:
  llm_1: 7.05 ± 6.83
  llm_n: 4.43 ± 7.44
  p-value: 0.0000000000
dependency_distance_mean:
  llm_1: 2.47 ± 0.21
  llm_n: 2.50 ± 0.24
  p-value: 0

100%|██████████| 411/411 [00:18<00:00, 21.86it/s]


----------------------------------------------------------------------------------------------------
Comparision with part-2


100%|██████████| 834/834 [00:38<00:00, 21.93it/s]


total_wc:
  llm_1: 174.45 ± 22.20
  llm_n: 175.61 ± 10.42
  p-value: 0.3139138563
avg_word_len:
  llm_1: 4.34 ± 0.29
  llm_n: 4.38 ± 0.30
  p-value: 0.0233548482
n_sentences:
  llm_1: 11.52 ± 2.82
  llm_n: 11.35 ± 2.55
  p-value: 0.2908143716
sentence_length_mean:
  llm_1: 15.83 ± 3.44
  llm_n: 16.22 ± 3.56
  p-value: 0.0629179730
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.62 ± 0.04
  p-value: 0.2719430899
syllables_per_token_mean:
  llm_1: 1.27 ± 0.08
  llm_n: 1.28 ± 0.08
  p-value: 0.6732976554
stopwords_frac:
  llm_1: 57.61 ± 5.38
  llm_n: 56.93 ± 4.77
  p-value: 0.0296424228
flesch_reading_ease:
  llm_1: 83.08 ± 8.54
  llm_n: 82.51 ± 8.45
  p-value: 0.2674072215
entropy:
  llm_1: 7.02 ± 1.13
  llm_n: 7.17 ± 0.79
  p-value: 0.0144951515
perplexity:
  llm_1: 1824.31 ± 2153.68
  llm_n: 1846.71 ± 2054.93
  p-value: 0.8611367017
per_word_perplexity:
  llm_1: 8.53 ± 9.50
  llm_n: 8.72 ± 9.15
  p-value: 0.7339136686
dependency_distance_mean:
  llm_1: 2.61 ± 0.23
  llm_n: 2.61 ± 0.23
  p-value: 

100%|██████████| 1280/1280 [00:59<00:00, 21.62it/s]


total_wc:
  llm_1: 174.45 ± 22.20
  llm_n: 178.11 ± 15.62
  p-value: 0.0020116818
avg_word_len:
  llm_1: 4.34 ± 0.29
  llm_n: 4.41 ± 0.33
  p-value: 0.0000800672
n_sentences:
  llm_1: 11.52 ± 2.82
  llm_n: 10.93 ± 2.53
  p-value: 0.0001837492
sentence_length_mean:
  llm_1: 15.83 ± 3.44
  llm_n: 17.03 ± 3.66
  p-value: 0.0000000018
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.61 ± 0.04
  p-value: 0.0000474865
syllables_per_token_mean:
  llm_1: 1.27 ± 0.08
  llm_n: 1.28 ± 0.09
  p-value: 0.0133433593
stopwords_frac:
  llm_1: 57.61 ± 5.38
  llm_n: 57.79 ± 4.94
  p-value: 0.5518321509
flesch_reading_ease:
  llm_1: 83.08 ± 8.54
  llm_n: 80.86 ± 9.65
  p-value: 0.0000107129
entropy:
  llm_1: 7.02 ± 1.13
  llm_n: 7.14 ± 0.93
  p-value: 0.0422323550
perplexity:
  llm_1: 1824.31 ± 2153.68
  llm_n: 1854.83 ± 1797.86
  p-value: 0.7954233341
per_word_perplexity:
  llm_1: 8.53 ± 9.50
  llm_n: 8.64 ± 7.91
  p-value: 0.8244671872
dependency_distance_mean:
  llm_1: 2.61 ± 0.23
  llm_n: 2.66 ± 0.23
  p-value: 

100%|██████████| 1650/1650 [01:16<00:00, 21.66it/s]


total_wc:
  llm_1: 174.45 ± 22.20
  llm_n: 178.01 ± 14.37
  p-value: 0.0020828739
avg_word_len:
  llm_1: 4.34 ± 0.29
  llm_n: 4.43 ± 0.34
  p-value: 0.0000000624
n_sentences:
  llm_1: 11.52 ± 2.82
  llm_n: 10.75 ± 2.51
  p-value: 0.0000006070
sentence_length_mean:
  llm_1: 15.83 ± 3.44
  llm_n: 17.37 ± 3.89
  p-value: 0.0000000000
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.61 ± 0.05
  p-value: 0.0004985951
syllables_per_token_mean:
  llm_1: 1.27 ± 0.08
  llm_n: 1.29 ± 0.09
  p-value: 0.0000627112
stopwords_frac:
  llm_1: 57.61 ± 5.38
  llm_n: 57.33 ± 4.88
  p-value: 0.3410024754
flesch_reading_ease:
  llm_1: 83.08 ± 8.54
  llm_n: 79.95 ± 9.97
  p-value: 0.0000000003
entropy:
  llm_1: 7.02 ± 1.13
  llm_n: 7.16 ± 0.87
  p-value: 0.0151559324
perplexity:
  llm_1: 1824.31 ± 2153.68
  llm_n: 1863.87 ± 1953.05
  p-value: 0.7347935621
per_word_perplexity:
  llm_1: 8.53 ± 9.50
  llm_n: 8.71 ± 8.45
  p-value: 0.7241132888
dependency_distance_mean:
  llm_1: 2.61 ± 0.23
  llm_n: 2.68 ± 0.23
  p-value: 

100%|██████████| 1045/1045 [00:47<00:00, 21.93it/s]

total_wc:
  llm_1: 174.45 ± 22.20
  llm_n: 177.89 ± 16.57
  p-value: 0.0046331080
avg_word_len:
  llm_1: 4.34 ± 0.29
  llm_n: 4.43 ± 0.33
  p-value: 0.0000009566
n_sentences:
  llm_1: 11.52 ± 2.82
  llm_n: 10.37 ± 2.39
  p-value: 0.0000000000
sentence_length_mean:
  llm_1: 15.83 ± 3.44
  llm_n: 17.97 ± 4.23
  p-value: 0.0000000000
ttr:
  llm_1: 0.62 ± 0.05
  llm_n: 0.60 ± 0.05
  p-value: 0.0000000012
syllables_per_token_mean:
  llm_1: 1.27 ± 0.08
  llm_n: 1.29 ± 0.09
  p-value: 0.0007597915
stopwords_frac:
  llm_1: 57.61 ± 5.38
  llm_n: 57.63 ± 4.75
  p-value: 0.9388196546
flesch_reading_ease:
  llm_1: 83.08 ± 8.54
  llm_n: 79.51 ± 9.80
  p-value: 0.0000000000
entropy:
  llm_1: 7.02 ± 1.13
  llm_n: 7.17 ± 0.95
  p-value: 0.0188298843
perplexity:
  llm_1: 1824.31 ± 2153.68
  llm_n: 1979.02 ± 2309.45
  p-value: 0.2276797273
per_word_perplexity:
  llm_1: 8.53 ± 9.50
  llm_n: 9.24 ± 10.04
  p-value: 0.2060876578
dependency_distance_mean:
  llm_1: 2.61 ± 0.23
  llm_n: 2.71 ± 0.24
  p-value:


