## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from scipy.stats import norm

## Methods

In [2]:
def calculate_correlations_with_significance(difference_df_stylo, difference_df):
    correlation_results = {}
    p_value_symbole_results = {}
    p_value_results = {}

    for feature in difference_df_stylo["Feature"].unique():
        filtered_stylo_df = difference_df_stylo[difference_df_stylo["Feature"] == feature]

        # Calculate Pearson correlation and p-value
        correlation, p_value = pearsonr(filtered_stylo_df["Difference"], difference_df["Difference"])
        correlation_results[feature] = correlation
        p_value_results[feature] = p_value

        # Assign asterisk based on p-value significance
        if p_value < 0.01:
            p_value_symbole_results[feature] = '**'
        elif p_value < 0.05:
            p_value_symbole_results[feature] = '*'
        else:
            p_value_symbole_results[feature] = ''

    correlation_df = pd.DataFrame({
        'Feature': list(correlation_results.keys()),
        'Correlation': list(correlation_results.values()),
        'P-value_symbole': list(p_value_symbole_results.values()),  # Store significance asterisks
        'P-value': list(p_value_results.values()),
    })

    return correlation_df

In [3]:
def fisher_z(r):
  return 0.5 * np.log((1 + r) / (1 - r))

In [4]:
def compare_correlations(r1, n1, r2, n2):
  """
  r1, r2 : Fischer transformed correlations
  n1, n2 : sample size
  """
  z1 = fisher_z(r1)
  z2 = fisher_z(r2)
  se = np.sqrt(1/(n1 - 3) + 1/(n2 - 3))
  z = np.abs(z1 - z2) / se
  p = 2 * (1 - norm.cdf(z)) # two-tailed
  return z, p
# Example usage:
# z_score, p_value = compare_correlations(0.65, 96, 0.45, 96)

In [5]:
def tost_compare_correlations_z(r1, n1, r2, n2, delta_z=0.01, alpha=0.05):
    """
    TOST equivalence test for two correlations (in Fisher z-space).

    Returns:
        p1, p2 : one-sided p-values
        equivalent : True if both p-values < alpha
    """
    z1 = fisher_z(r1)
    z2 = fisher_z(r2)
    diff = z1 - z2
    print(f"Difference in z-space: {diff:.3f}")
    se = np.sqrt(1 / (n1 - 3) + 1 / (n2 - 3))

    z_stat_low = (diff + delta_z) / se
    z_stat_high = (diff - delta_z) / se

    # CORRECTED: Flip these p-values
    p1 = 1 - norm.cdf(z_stat_low)  # test: diff > -delta
    p2 = norm.cdf(z_stat_high)     # test: diff < delta

    equivalent = (p1 < alpha) and (p2 < alpha)
    return p1, p2, equivalent

def r_to_z_delta(delta_r):
    r = 0.0  # center at 0 to be conservative
    r1 = min(max(r - delta_r, -0.9999), 0.9999)
    r2 = min(max(r + delta_r, -0.9999), 0.9999)
    return np.abs(fisher_z(r2) - fisher_z(r1))

## Dataset

### Stylometry

In [6]:
stylo_df_fr = pd.read_excel('./stylo_terreau_df.xlsx')

In [7]:
stylo_df_tuffery_ref = stylo_df_fr[stylo_df_fr['author'] == "Tuffery"]
stylo_df_style_gen = stylo_df_fr[stylo_df_fr['genAI'] != "No"]
stylo_df_TS_gen= pd.concat([stylo_df_tuffery_ref, stylo_df_style_gen], axis=0)

stylo_df_TS_gen = stylo_df_TS_gen.rename(columns={"id": "file_name", "Unnamed: 0":"id"})
stylo_df_TS_gen = stylo_df_TS_gen.drop(columns=["text_id",
                                                    "id"])
stylo_df_TS_gen = stylo_df_TS_gen.rename(columns={'Indexes': 'Entropy'})

stylo_df_TS_gen["class"] = stylo_df_TS_gen["genAI"].map({"No": "A1", "mistral": "A2", "gpt": "A2", "gemini":"A2"})

stylo_df = stylo_df_TS_gen

In [8]:
stylo_df.head()

Unnamed: 0,author,genAI,Punctuation,TAG,Structural,Entropy,NER,Letters,file_name,class
960,Tuffery,No,0.01951,0.195833,8.794737,7.159736,0.083333,0.029505,tuffery_agitato_atrabile.txt,A1
961,Tuffery,No,0.011617,0.436869,22.03807,7.771713,0.454545,0.029591,tuffery_agitato_lamentabile.txt,A1
962,Tuffery,No,0.011944,0.407407,18.214147,6.612949,0.222222,0.028414,tuffery_anacephaleose.txt,A1
963,Tuffery,No,0.011217,0.200617,9.13342,6.714099,0.092593,0.028595,tuffery_anadiploses_epanadiploses.txt,A1
964,Tuffery,No,0.010989,1.333333,59.19,5.093733,1.0,0.031382,tuffery_anaphore.txt,A1


In [9]:
#Normalize
numeric_cols = stylo_df.select_dtypes(include=['number']).columns

for col in numeric_cols:
    stylo_df[f"{col}_normalized"] = (stylo_df[col] - stylo_df[col].min()) / (stylo_df[col].max() - stylo_df[col].min())

#Mean
columns_to_average = ["Structural_normalized",
                      "Letters_normalized",
                      "TAG_normalized",
                      "NER_normalized",
                      "Entropy_normalized",
                      ]
stylo_df["Mean"] = stylo_df[columns_to_average].mean(axis=1)

stylo_df.head()

Unnamed: 0,author,genAI,Punctuation,TAG,Structural,Entropy,NER,Letters,file_name,class,Punctuation_normalized,TAG_normalized,Structural_normalized,Entropy_normalized,NER_normalized,Letters_normalized,Mean
960,Tuffery,No,0.01951,0.195833,8.794737,7.159736,0.083333,0.029505,tuffery_agitato_atrabile.txt,A1,0.239607,0.029706,0.048711,0.830128,0.041667,0.252477,0.240538
961,Tuffery,No,0.011617,0.436869,22.03807,7.771713,0.454545,0.029591,tuffery_agitato_lamentabile.txt,A1,0.142667,0.080749,0.151192,0.901083,0.227273,0.2646,0.324979
962,Tuffery,No,0.011944,0.407407,18.214147,6.612949,0.222222,0.028414,tuffery_anacephaleose.txt,A1,0.146682,0.07451,0.121601,0.766732,0.111111,0.099577,0.234706
963,Tuffery,No,0.011217,0.200617,9.13342,6.714099,0.092593,0.028595,tuffery_anadiploses_epanadiploses.txt,A1,0.137753,0.030719,0.051332,0.778459,0.046296,0.124931,0.206347
964,Tuffery,No,0.010989,1.333333,59.19,5.093733,1.0,0.031382,tuffery_anaphore.txt,A1,0.134959,0.270588,0.438685,0.590588,0.5,0.515662,0.463105


### Distance between embeddings

In [10]:
mean_distances_df = pd.read_excel('./distance_pertext_umap_TS_gen.xlsx')

mean_distances_df["genAI"] = mean_distances_df["Class"].map({"Proust": "No", "Celine": "No", "Yourcenar": "No", "Tuffery": "No",
                                                             "Proust_mistral": "mistral", "Celine_mistral": "mistral", "Yourcenar_mistral": "mistral",
                                                             "Proust_gpt": "gpt", "Celine_gpt": "gpt", "Yourcenar_gpt": "gpt",
                                                             "Proust_gemini": "gemini", "Celine_gemini": "gemini", "Yourcenar_gemini": "gemini",
                                                             })

mean_distances_df["author"] = mean_distances_df["Class"].map({"Proust": "Proust", "Celine": "Celine", "Yourcenar": "Yourcenar","Tuffery": "Tuffery",
                                                             "Proust_mistral": "Proust", "Celine_mistral": "Celine", "Yourcenar_mistral": "Yourcenar",
                                                             "Proust_gpt": "Proust", "Celine_gpt": "Celine", "Yourcenar_gpt": "Yourcenar",
                                                             "Proust_gemini": "Proust", "Celine_gemini": "Celine", "Yourcenar_gemini": "Yourcenar",
                                                             })

mean_distances_df["class"] = mean_distances_df["genAI"].map({"No": "A1", "mistral": "A2", "gpt": "A2", "gemini":"A2"})
mean_distances_df = mean_distances_df.drop(columns=["Class"])

In [11]:
mean_distances_df.head()

Unnamed: 0,Text_Index,Mean_Distance_From_Centroid,genAI,author,class
0,0,1.493536,No,Tuffery,A1
1,1,3.086941,No,Tuffery,A1
2,2,1.641952,No,Tuffery,A1
3,3,1.457144,No,Tuffery,A1
4,4,1.759076,No,Tuffery,A1


## Aggregated features, per author, all genAI : correlation, p-value

In [None]:
Tuffery_distances = mean_distances_df[mean_distances_df['genAI'] == 'No']['Mean_Distance_From_Centroid'].values
Proust_distances = mean_distances_df[mean_distances_df['author'] == 'Proust']['Mean_Distance_From_Centroid'].values
Celine_distances = mean_distances_df[mean_distances_df['author'] == 'Celine']['Mean_Distance_From_Centroid'].values
Yourcenar_distances = mean_distances_df[mean_distances_df['author'] == 'Yourcenar']['Mean_Distance_From_Centroid'].values

difference_results_proust = []

for i, ref_dist in enumerate(Tuffery_distances):
    for j, gen_dist in enumerate(Proust_distances):
        difference = ref_dist - gen_dist
        difference_results_proust.append({
            'Tuffery_Index': i,
            'Proust_Index': j,
            'Difference': difference
        })

difference_df_cond1_proust = pd.DataFrame(difference_results_proust)

difference_results_celine = []

for i, ref_dist in enumerate(Tuffery_distances):
    for j, gen_dist in enumerate(Celine_distances):
        difference = ref_dist - gen_dist
        difference_results_celine.append({
            'Tuffery_Index': i,
            'Celine_Index': j,
            'Difference': difference
        })

difference_df_cond1_celine = pd.DataFrame(difference_results_celine)

difference_results_yourcenar = []

for i, ref_dist in enumerate(Tuffery_distances):
    for j, gen_dist in enumerate(Yourcenar_distances):
        difference = ref_dist - gen_dist
        difference_results_yourcenar.append({
            'Tuffery_Index': i,
            'Yourcenar_Index': j,
            'Difference': difference
        })

difference_df_cond1_yourcenar = pd.DataFrame(difference_results_yourcenar)

In [None]:
Proust_df = stylo_df[stylo_df['author'] == 'Proust']
Celine_df = stylo_df[stylo_df['author'] == 'Celine']
Yourcenar_df = stylo_df[stylo_df['author'] == 'Yourcenar']
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']
features = ['Mean']

difference_results_stylo_proust = []

for feature in features:
    for i, ref_val in enumerate(Tuffery_df[feature].values):
        for j, gen_val in enumerate(Proust_df[feature].values):
            difference = ref_val - gen_val
            difference_results_stylo_proust.append({
                'Feature': feature,
                'Proust_Index': i,
                'Tuffery_Index': j,
                'Difference': difference
            })

difference_df_stylo_cond1_proust = pd.DataFrame(difference_results_stylo_proust)

difference_results_stylo_celine = []

for feature in features:
    for i, ref_val in enumerate(Tuffery_df[feature].values):
        for j, gen_val in enumerate(Celine_df[feature].values):
            difference = ref_val - gen_val
            difference_results_stylo_celine.append({
                'Feature': feature,
                'Celine_Index': i,
                'Tuffery_Index': j,
                'Difference': difference
            })

difference_df_stylo_cond1_celine = pd.DataFrame(difference_results_stylo_celine)

difference_results_stylo_yourcenar = []

for feature in features:
    for i, ref_val in enumerate(Tuffery_df[feature].values):
        for j, gen_val in enumerate(Yourcenar_df[feature].values):
            difference = ref_val - gen_val
            difference_results_stylo_yourcenar.append({
                'Feature': feature,
                'Yourcenar_Index': i,
                'Tuffery_Index': j,
                'Difference': difference
            })

difference_df_stylo_cond1_yourcenar = pd.DataFrame(difference_results_stylo_yourcenar)


In [None]:
correlation_df_cond1_proust = calculate_correlations_with_significance(difference_df_stylo_cond1_proust, difference_df_cond1_proust)
correlation_df_cond1_proust.rename(columns={'Correlation': 'Correlation_proust'}, inplace=True)
correlation_df_cond1_proust.rename(columns={'P-value': 'P-value_proust'}, inplace=True)
correlation_df_cond1_proust.rename(columns={'P-value_symbole': 'P-value_symbole_proust'}, inplace=True)

correlation_df_cond1_celine = calculate_correlations_with_significance(difference_df_stylo_cond1_celine, difference_df_cond1_celine)
correlation_df_cond1_celine.rename(columns={'Correlation': 'Correlation_celine'}, inplace=True)
correlation_df_cond1_celine.rename(columns={'P-value': 'P-value_celine'}, inplace=True)
correlation_df_cond1_celine.rename(columns={'P-value_symbole': 'P-value_symbole_celine'}, inplace=True)

correlation_df_cond1_yourcenar = calculate_correlations_with_significance(difference_df_stylo_cond1_yourcenar, difference_df_cond1_yourcenar)
correlation_df_cond1_yourcenar.rename(columns={'Correlation': 'Correlation_yourcenar'}, inplace=True)
correlation_df_cond1_yourcenar.rename(columns={'P-value': 'P-value_yourcenar'}, inplace=True)
correlation_df_cond1_yourcenar.rename(columns={'P-value_symbole': 'P-value_symbole_yourcenar'}, inplace=True)


correlation_df_cond1 = correlation_df_cond1_proust.merge(correlation_df_cond1_celine, on='Feature').merge(correlation_df_cond1_yourcenar, on='Feature')
print(correlation_df_cond1.to_string())

## Per feature, per author, all genAI : correlation, p-value

In [None]:
Tuffery_distances = mean_distances_df[mean_distances_df['genAI'] == 'No']['Mean_Distance_From_Centroid'].values

GenAI_distances = {}
difference_df_cond1 = {}



GenAI_distances = {}
difference_df_cond1 = {}

for auteur in ["Proust", "Celine", "Yourcenar"]:
    GenAI_distances[auteur] = mean_distances_df[mean_distances_df['author'] == auteur]['Mean_Distance_From_Centroid'].values

    difference_results = []

    for i, ref_dist in enumerate(Tuffery_distances):
        for j, gen_dist in enumerate(GenAI_distances[auteur]):
            difference = ref_dist - gen_dist
            difference_results.append({
                'Tuffery_Index': i,
                f'{auteur}_Index': j,
                'Difference': difference
            })

    difference_df_cond1[auteur] = pd.DataFrame(difference_results)

In [None]:
Mistral_df = stylo_df[stylo_df['genAI'] == 'mistral']
Gpt_df = stylo_df[stylo_df['genAI'] == 'gpt']
Gemini_df = stylo_df[stylo_df['genAI'] == 'gemini']
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']

GenAI_stylo = {}
difference_df_stylo_cond1 = {}

features = ['Letters', 'TAG', 'NER', 'Structural', 'Entropy']

for auteur in ["Proust", "Celine", "Yourcenar"]:
    GenAI_stylo[auteur] = stylo_df[stylo_df['author'] == auteur]

    difference_results_stylo = []

    for feature in features:
        for i, ref_val in enumerate(Tuffery_df[feature].values):
            for j, gen_val in enumerate(GenAI_stylo[auteur][feature].values):
                difference = ref_val - gen_val
                difference_results_stylo.append({
                    'Feature': feature,
                    'Mistral_Index': i,
                    f'{auteur}_Index': j,
                    'Difference': difference
                })
    difference_df_stylo_cond1[auteur] = pd.DataFrame(difference_results_stylo)

In [None]:
for auteur in ["Proust", "Celine", "Yourcenar"]:

    correlation_df_cond1 = calculate_correlations_with_significance(difference_df_stylo_cond1[auteur], difference_df_cond1[auteur])

    print(f"\n {auteur}\n ")
    print(correlation_df_cond1.to_string())

## Toutes features confondues, per genAI, all authors : correlation, p-value

### Z-test : correlations différentes deux à deux

In [15]:
## Testing if the correlations are significatively different

print("Entre Mistral et Gpt : ")

for row in correlation_df_cond1.iterrows():
  feature = row[1]['Feature']

  z_score_mistral_gpt, p_value_mistral_gpt = compare_correlations(row[1]["Correlation_mistral"], 27648 , row[1]["Correlation_gpt"], 27648 )
  if len(feature) > 4:
    print(f"{feature} \t z = {z_score_mistral_gpt:.4f} \t p = {p_value_mistral_gpt:.4f}")
  else:
    print(f"{feature} \t \t z = {z_score_mistral_gpt:.4f} \t p = {p_value_mistral_gpt:.4f}")

print("\nEntre Mistral et Gemini : ")

for row in correlation_df_cond1.iterrows():
  feature = row[1]['Feature']

  z_score_mistral_gemini, p_value_mistral_gemini = compare_correlations(row[1]["Correlation_mistral"], 27648 , row[1]["Correlation_gemini"], 27648 )
  if len(feature) > 4:
    print(f"{feature} \t z = {z_score_mistral_gemini:.4f} \t p = {p_value_mistral_gemini:.2e}")
  else:
    print(f"{feature} \t \t z = {z_score_mistral_gemini:.4f} \t p = {p_value_mistral_gemini:.2e}")

print("\nEntre Gpt et Gemini : ")

for row in correlation_df_cond1.iterrows():
  feature = row[1]['Feature']

  z_score_gpt_gemini, p_value_gpt_gemini = compare_correlations(row[1]["Correlation_gpt"], 27648 , row[1]["Correlation_gemini"], 27648 )
  if len(feature) > 4:
    print(f"{feature} \t z = {z_score_gpt_gemini:.4f} \t p = {p_value_gpt_gemini:.2e}")
  else:
    print(f"{feature} \t \t z = {z_score_gpt_gemini:.4f} \t p = {p_value_gpt_gemini:.2e}")



Entre Mistral et Gpt : 
Mean 	 	 z = 4.7397 	 p = 0.0000

Entre Mistral et Gemini : 
Mean 	 	 z = 2.4884 	 p = 1.28e-02

Entre Gpt et Gemini : 
Mean 	 	 z = 2.2513 	 p = 2.44e-02


In [None]:
Tuffery_distances = mean_distances_df[mean_distances_df['genAI'] == 'No']['Mean_Distance_From_Centroid'].values
Proust_distances = mean_distances_df[mean_distances_df['author'] == 'Proust']['Mean_Distance_From_Centroid'].values
Celine_distances = mean_distances_df[mean_distances_df['author'] == 'Celine']['Mean_Distance_From_Centroid'].values
Yourcenar_distances = mean_distances_df[mean_distances_df['author'] == 'Yourcenar']['Mean_Distance_From_Centroid'].values

difference_results_proust = []

for i, ref_dist in enumerate(Tuffery_distances):
    for j, gen_dist in enumerate(Proust_distances):
        difference = ref_dist - gen_dist
        difference_results_proust.append({
            'Tuffery_Index': i,
            'Proust_Index': j,
            'Difference': difference
        })

difference_df_cond1_proust = pd.DataFrame(difference_results_proust)

difference_results_celine = []

for i, ref_dist in enumerate(Tuffery_distances):
    for j, gen_dist in enumerate(Celine_distances):
        difference = ref_dist - gen_dist
        difference_results_celine.append({
            'Tuffery_Index': i,
            'Celine_Index': j,
            'Difference': difference
        })

difference_df_cond1_celine = pd.DataFrame(difference_results_celine)

difference_results_yourcenar = []

for i, ref_dist in enumerate(Tuffery_distances):
    for j, gen_dist in enumerate(Yourcenar_distances):
        difference = ref_dist - gen_dist
        difference_results_yourcenar.append({
            'Tuffery_Index': i,
            'Yourcenar_Index': j,
            'Difference': difference
        })

difference_df_cond1_yourcenar = pd.DataFrame(difference_results_yourcenar)

Unnamed: 0,Tuffery_Index,Yourcenar_Index,Difference
0,0,0,-0.140511
1,0,1,-0.597226
2,0,2,-0.462544
3,0,3,-0.442218
4,0,4,-0.638528
...,...,...,...
27643,95,283,-0.190541
27644,95,284,-0.316553
27645,95,285,0.100418
27646,95,286,0.017426


In [None]:
Proust_df = stylo_df[stylo_df['author'] == 'Proust']
Celine_df = stylo_df[stylo_df['author'] == 'Celine']
Yourcenar_df = stylo_df[stylo_df['author'] == 'Yourcenar']
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']
features = ['Mean']

difference_results_stylo_proust = []

for feature in features:
    for i, ref_val in enumerate(Tuffery_df[feature].values):
        for j, gen_val in enumerate(Proust_df[feature].values):
            difference = ref_val - gen_val
            difference_results_stylo_proust.append({
                'Feature': feature,
                'Proust_Index': i,
                'Tuffery_Index': j,
                'Difference': difference
            })

difference_df_stylo_cond1_proust = pd.DataFrame(difference_results_stylo_proust)

difference_results_stylo_celine = []

for feature in features:
    for i, ref_val in enumerate(Tuffery_df[feature].values):
        for j, gen_val in enumerate(Celine_df[feature].values):
            difference = ref_val - gen_val
            difference_results_stylo_celine.append({
                'Feature': feature,
                'Celine_Index': i,
                'Tuffery_Index': j,
                'Difference': difference
            })

difference_df_stylo_cond1_celine = pd.DataFrame(difference_results_stylo_celine)

difference_results_stylo_yourcenar = []

for feature in features:
    for i, ref_val in enumerate(Tuffery_df[feature].values):
        for j, gen_val in enumerate(Yourcenar_df[feature].values):
            difference = ref_val - gen_val
            difference_results_stylo_yourcenar.append({
                'Feature': feature,
                'Yourcenar_Index': i,
                'Tuffery_Index': j,
                'Difference': difference
            })

difference_df_stylo_cond1_yourcenar = pd.DataFrame(difference_results_stylo_yourcenar)


Unnamed: 0,Feature,Yourcenar_Index,Tuffery_Index,Difference
0,Mean,0,0,-0.005639
1,Mean,0,1,-0.065813
2,Mean,0,2,-0.093072
3,Mean,0,3,-0.003738
4,Mean,0,4,-0.021742
...,...,...,...,...
27643,Mean,95,283,-0.006012
27644,Mean,95,284,-0.011445
27645,Mean,95,285,0.038178
27646,Mean,95,286,-0.026641


In [None]:
correlation_df_cond1_proust = calculate_correlations_with_significance(difference_df_stylo_cond1_proust, difference_df_cond1_proust)
correlation_df_cond1_proust.rename(columns={'Correlation': 'Correlation_proust'}, inplace=True)
correlation_df_cond1_proust.rename(columns={'P-value': 'P-value_proust'}, inplace=True)
correlation_df_cond1_proust.rename(columns={'P-value_symbole': 'P-value_symbole_proust'}, inplace=True)

correlation_df_cond1_celine = calculate_correlations_with_significance(difference_df_stylo_cond1_celine, difference_df_cond1_celine)
correlation_df_cond1_celine.rename(columns={'Correlation': 'Correlation_celine'}, inplace=True)
correlation_df_cond1_celine.rename(columns={'P-value': 'P-value_celine'}, inplace=True)
correlation_df_cond1_celine.rename(columns={'P-value_symbole': 'P-value_symbole_celine'}, inplace=True)

correlation_df_cond1_yourcenar = calculate_correlations_with_significance(difference_df_stylo_cond1_yourcenar, difference_df_cond1_yourcenar)
correlation_df_cond1_yourcenar.rename(columns={'Correlation': 'Correlation_yourcenar'}, inplace=True)
correlation_df_cond1_yourcenar.rename(columns={'P-value': 'P-value_yourcenar'}, inplace=True)
correlation_df_cond1_yourcenar.rename(columns={'P-value_symbole': 'P-value_symbole_yourcenar'}, inplace=True)


correlation_df_cond1 = correlation_df_cond1_proust.merge(correlation_df_cond1_celine, on='Feature').merge(correlation_df_cond1_yourcenar, on='Feature')
print(correlation_df_cond1.to_string())

  Feature  Correlation_proust P-value_symbole_proust  P-value_proust  Correlation_celine P-value_symbole_celine  P-value_celine  Correlation_yourcenar P-value_symbole_yourcenar  P-value_yourcenar
0    Mean            0.149073                     **   3.925072e-137            0.120099                     **    2.398459e-89               0.086032                        **       1.398676e-46


## Toutes features confondues, per genAI, per authors : correlation, p-value

In [19]:
#### DISPERSION DELTA ANALYSIS FOR 2 COMPARISONS


## Humans vs genAI

# Dispersion delta
Tuffery_distances = mean_distances_df[mean_distances_df['genAI'] == 'No']['Mean_Distance_From_Centroid'].values

GenAI_distances = {}
difference_df_cond1 = {}


for genAI in ["mistral", "gpt", "gemini"]:

    GenAI_distances[genAI] = {}
    difference_df_cond1[genAI] = {}
    mean_distances_genAI = mean_distances_df[mean_distances_df['genAI'] == genAI]

    for auteur in ["Proust", "Celine", "Yourcenar"]:
        GenAI_distances[genAI][auteur] = mean_distances_genAI[mean_distances_genAI['author'] == auteur]['Mean_Distance_From_Centroid'].values

        # List to store the results
        difference_results = []

        for i, ref_dist in enumerate(Tuffery_distances):
            for j, gen_dist in enumerate(GenAI_distances[genAI][auteur]):
                difference = ref_dist - gen_dist
                difference_results.append({
                    'Tuffery_Index': i,
                    f'{genAI}_{auteur}_Index': j,
                    'Difference': difference
                })

        difference_df_cond1[genAI][auteur] = pd.DataFrame(difference_results)

In [20]:
# Style delta calculation

Mistral_df = stylo_df[stylo_df['genAI'] == 'mistral']
Gpt_df = stylo_df[stylo_df['genAI'] == 'gpt']
Gemini_df = stylo_df[stylo_df['genAI'] == 'gemini']
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']

GenAI_stylo = {}
difference_df_stylo_cond1 = {}

# List of stylistic features
# features = ['Function words', 'Letters', 'Numbers', 'TAG', 'NER', 'Structural', 'Punctuation', 'Indexes']
features = ['Mean']

for genAI in ["mistral", "gpt", "gemini"]:

    GenAI_stylo[genAI] = {}
    difference_df_stylo_cond1[genAI] = {}
    stylo_genAI = stylo_df[stylo_df['genAI'] == genAI]

    for auteur in ["Proust", "Celine", "Yourcenar"]:
        GenAI_stylo[genAI][auteur] = stylo_genAI[stylo_genAI['author'] == auteur]

        # List to store the results
        difference_results_stylo = []

        for feature in features:
            for i, ref_val in enumerate(Tuffery_df[feature].values):
                for j, gen_val in enumerate(GenAI_stylo[genAI][auteur][feature].values):
                    difference = ref_val - gen_val
                    difference_results_stylo.append({
                        'Feature': feature,
                        'Mistral_Index': i,
                        f'{genAI}_{auteur}_Index': j,
                        'Difference': difference
                    })
        difference_df_stylo_cond1[genAI][auteur] = pd.DataFrame(difference_results_stylo)

In [21]:
for genAI in ["mistral", "gpt", "gemini"]:

    for auteur in ["Proust", "Celine", "Yourcenar"]:

        correlation_df_cond1 = calculate_correlations_with_significance(difference_df_stylo_cond1[genAI][auteur], difference_df_cond1[genAI][auteur])

        print(f"{genAI} {auteur}")
        print(correlation_df_cond1.to_string())

mistral Proust
  Feature  Correlation P-value_symbole       P-value
0    Mean     0.147222              **  8.068892e-46
mistral Celine
  Feature  Correlation P-value_symbole       P-value
0    Mean     0.158634              **  5.293407e-53
mistral Yourcenar
  Feature  Correlation P-value_symbole       P-value
0    Mean     0.085907              **  1.442388e-16
gpt Proust
  Feature  Correlation P-value_symbole   P-value
0    Mean     0.040619              **  0.000096
gpt Celine
  Feature  Correlation P-value_symbole       P-value
0    Mean     0.129231              **  1.285324e-35
gpt Yourcenar
  Feature  Correlation P-value_symbole       P-value
0    Mean     0.103893              **  1.532694e-23
gemini Proust
  Feature  Correlation P-value_symbole       P-value
0    Mean     0.163929              **  1.589323e-56
gemini Celine
  Feature  Correlation P-value_symbole       P-value
0    Mean     0.129953              **  5.316481e-36
gemini Yourcenar
  Feature  Correlation P-value_

In [22]:
# Author pairs to compare
author_pairs = [['Proust', 'Celine'], ['Proust', 'Yourcenar'], ['Celine', 'Yourcenar']]

# Loop by generative AI model
for model in ['mistral', 'gpt', 'gemini']:
    print(f"\033[1mPour {model.upper()} \033[0m")

    for author1, author2 in author_pairs:
        print(f"Entre {author1} et {author2} :")

        # Get correlations
        corr_df1 = calculate_correlations_with_significance(
            difference_df_stylo_cond1[model][author1],
            difference_df_cond1[model][author1]
        )
        corr_df2 = calculate_correlations_with_significance(
            difference_df_stylo_cond1[model][author2],
            difference_df_cond1[model][author2]
        )

        # Compare with Fisher's z-test
        z_score, p_value = compare_correlations(
            corr_df1["Correlation"].values[0], 9216,
            corr_df2["Correlation"].values[0], 9216
        )

        print(f"\t z = {z_score:.4f} \t p = {p_value:.2e}")


[1mPour MISTRAL [0m
Entre Proust et Celine :
	 z = 0.7931 	 p = 4.28e-01
Entre Proust et Yourcenar :
	 z = 4.2203 	 p = 2.44e-05
Entre Celine et Yourcenar :
	 z = 5.0134 	 p = 5.35e-07
[1mPour GPT [0m
Entre Proust et Celine :
	 z = 6.0620 	 p = 1.34e-09
Entre Proust et Yourcenar :
	 z = 4.3185 	 p = 1.57e-05
Entre Celine et Yourcenar :
	 z = 1.7435 	 p = 8.12e-02
[1mPour GEMINI [0m
Entre Proust et Celine :
	 z = 2.3571 	 p = 1.84e-02
Entre Proust et Yourcenar :
	 z = 6.4285 	 p = 1.29e-10
Entre Celine et Yourcenar :
	 z = 4.0714 	 p = 4.67e-05


### Z-test : correlations différentes deux à deux pour un auteur fixé

In [23]:
## Testing if the correlations are significatively different
genais = [['mistral', 'gemini'], ['gpt', 'gemini'], ['gpt', 'mistral']]



for auteur in ['Proust', 'Celine', 'Yourcenar']:

    print(f"\033[1m Pour {auteur} \033[0m")

    for genAI_1, genAI_2 in genais:

        print(f"Entre {genAI_1} et {genAI_2} : ")

        correlation_df_cond1_1 = calculate_correlations_with_significance(difference_df_stylo_cond1[genAI_1][auteur], difference_df_cond1[genAI_1][auteur])
        correlation_df_cond1_2 = calculate_correlations_with_significance(difference_df_stylo_cond1[genAI_2][auteur], difference_df_cond1[genAI_2][auteur])
        z_score, p_value = compare_correlations(correlation_df_cond1_1["Correlation"].values[0], 9216 , correlation_df_cond1_2["Correlation"].values[0], 9216 )

        print(f"\t z = {z_score:.4f} \t p = {p_value:.2e}")

[1m Pour Proust [0m
Entre mistral et gemini : 
	 z = 1.1620 	 p = 2.45e-01
Entre gpt et gemini : 
	 z = 8.4689 	 p = 0.00e+00
Entre gpt et mistral : 
	 z = 7.3069 	 p = 2.73e-13
[1m Pour Celine [0m
Entre mistral et gemini : 
	 z = 1.9881 	 p = 4.68e-02
Entre gpt et gemini : 
	 z = 0.0499 	 p = 9.60e-01
Entre gpt et mistral : 
	 z = 2.0380 	 p = 4.15e-02
[1m Pour Yourcenar [0m
Entre mistral et gemini : 
	 z = 1.0461 	 p = 2.95e-01
Entre gpt et gemini : 
	 z = 2.2780 	 p = 2.27e-02
Entre gpt et mistral : 
	 z = 1.2319 	 p = 2.18e-01


In [24]:
author_pairs = [['Proust', 'Celine'], ['Proust', 'Yourcenar'], ['Celine', 'Yourcenar']]

for author1, author2 in author_pairs:
    print(f"\033[1mComparaison entre {author1} et {author2}\033[0m")

    # Concatenate all gen AI data for each author
    all_stylo_1 = pd.concat([difference_df_stylo_cond1[genAI][author1] for genAI in ['mistral', 'gpt', 'gemini']])
    all_base_1 = pd.concat([difference_df_cond1[genAI][author1] for genAI in ['mistral', 'gpt', 'gemini']])

    all_stylo_2 = pd.concat([difference_df_stylo_cond1[genAI][author2] for genAI in ['mistral', 'gpt', 'gemini']])
    all_base_2 = pd.concat([difference_df_cond1[genAI][author2] for genAI in ['mistral', 'gpt', 'gemini']])

    # Compute correlations
    corr1 = calculate_correlations_with_significance(all_stylo_1, all_base_1)
    corr2 = calculate_correlations_with_significance(all_stylo_2, all_base_2)

    # Use combined N (assuming same size across models; else use len() on concatenated data)
    n_total = len(all_stylo_1)

    z, p = compare_correlations(corr1["Correlation"].values[0], n_total, corr2["Correlation"].values[0], n_total)

    print(f"\t z = {z:.4f} \t p = {p:.2e}")


[1mComparaison entre Proust et Celine[0m
	 z = 1.6086 	 p = 1.08e-01
[1mComparaison entre Proust et Yourcenar[0m
	 z = 3.1336 	 p = 1.73e-03
[1mComparaison entre Celine et Yourcenar[0m
	 z = 4.7421 	 p = 2.11e-06


In [25]:
model_pairs = [['mistral', 'gpt'], ['mistral', 'gemini'], ['gpt', 'gemini']]
authors = ['Proust', 'Celine', 'Yourcenar']

for model1, model2 in model_pairs:
    print(f"\033[1mComparaison entre {model1.upper()} et {model2.upper()}\033[0m")

    all_stylo_1 = pd.concat([difference_df_stylo_cond1[model1][a] for a in authors], ignore_index=True)
    all_base_1  = pd.concat([difference_df_cond1[model1][a]        for a in authors], ignore_index=True)

    all_stylo_2 = pd.concat([difference_df_stylo_cond1[model2][a] for a in authors], ignore_index=True)
    all_base_2  = pd.concat([difference_df_cond1[model2][a]        for a in authors], ignore_index=True)

    corr1 = calculate_correlations_with_significance(all_stylo_1, all_base_1)
    corr2 = calculate_correlations_with_significance(all_stylo_2, all_base_2)

    n1 = len(all_stylo_1)
    n2 = len(all_stylo_2)

    z, p = compare_correlations(
        corr1["Correlation"].values[0], n1,
        corr2["Correlation"].values[0], n2
    )
    print(f"\t z = {z:.4f} \t p = {p:.2e}")


[1mComparaison entre MISTRAL et GPT[0m
	 z = 5.2471 	 p = 1.55e-07
[1mComparaison entre MISTRAL et GEMINI[0m
	 z = 2.4884 	 p = 1.28e-02
[1mComparaison entre GPT et GEMINI[0m
	 z = 2.7587 	 p = 5.80e-03


## Per feature, per author, all genAI : correlation, p-value

In [None]:
Tuffery_distances = mean_distances_df[mean_distances_df['genAI'] == 'No']['Mean_Distance_From_Centroid'].values

GenAI_distances = {}
difference_df_cond1 = {}



GenAI_distances = {}
difference_df_cond1 = {}

for auteur in ["Proust", "Celine", "Yourcenar"]:
    GenAI_distances[auteur] = mean_distances_df[mean_distances_df['author'] == auteur]['Mean_Distance_From_Centroid'].values

    difference_results = []

    for i, ref_dist in enumerate(Tuffery_distances):
        for j, gen_dist in enumerate(GenAI_distances[auteur]):
            difference = ref_dist - gen_dist
            difference_results.append({
                'Tuffery_Index': i,
                f'{auteur}_Index': j,
                'Difference': difference
            })

    difference_df_cond1[auteur] = pd.DataFrame(difference_results)

In [None]:
Mistral_df = stylo_df[stylo_df['genAI'] == 'mistral']
Gpt_df = stylo_df[stylo_df['genAI'] == 'gpt']
Gemini_df = stylo_df[stylo_df['genAI'] == 'gemini']
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']

GenAI_stylo = {}
difference_df_stylo_cond1 = {}

features = ['Letters', 'TAG', 'NER', 'Structural', 'Entropy']

for auteur in ["Proust", "Celine", "Yourcenar"]:
    GenAI_stylo[auteur] = stylo_df[stylo_df['author'] == auteur]

    difference_results_stylo = []

    for feature in features:
        for i, ref_val in enumerate(Tuffery_df[feature].values):
            for j, gen_val in enumerate(GenAI_stylo[auteur][feature].values):
                difference = ref_val - gen_val
                difference_results_stylo.append({
                    'Feature': feature,
                    'Mistral_Index': i,
                    f'{auteur}_Index': j,
                    'Difference': difference
                })
    difference_df_stylo_cond1[auteur] = pd.DataFrame(difference_results_stylo)

KeyError: 'Indexes'

In [None]:
for auteur in ["Proust", "Celine", "Yourcenar"]:

    correlation_df_cond1 = calculate_correlations_with_significance(difference_df_stylo_cond1[auteur], difference_df_cond1[auteur])

    print(f"{auteur}")
    print(correlation_df_cond1.to_string())

Proust
      Feature  Correlation P-value_symbole        P-value
0     Letters     0.082084              **   1.510172e-42
1         TAG     0.043438              **   4.980796e-13
2         NER     0.175239              **  1.563527e-189
3  Structural     0.070063              **   1.955254e-31
4     Indexes     0.113122              **   2.038254e-79
Celine
      Feature  Correlation P-value_symbole        P-value
0     Letters     0.142667              **  1.190345e-125
1         TAG    -0.000534                   9.292923e-01
2         NER     0.172099              **  9.011091e-183
3  Structural     0.007936                   1.870174e-01
4     Indexes     0.069710              **   3.901578e-31
Yourcenar
      Feature  Correlation P-value_symbole       P-value
0     Letters     0.119905              **  4.628675e-89
1         TAG     0.019284              **  1.343164e-03
2         NER     0.119233              **  4.431204e-88
3  Structural     0.012493               *  3.778147

### To latex

### Z-test : correlations différentes deux à deux pour un auteur fixé

In [None]:
## Testing if the correlations are significatively different
genais = [['mistral', 'gemini'], ['gpt', 'gemini'], ['gpt', 'mistral']]

for auteur in ['Proust', 'Celine', 'Yourcenar']:

    print(f"\033[1m Pour {auteur} \033[0m")

    for genAI_1, genAI_2 in genais:

        print(f"Entre {genAI_1} et {genAI_2} : ")

        # Access the correctly structured data from cells like Xm-TMbv0bs4T and FC_BP1KUeulP
        correlation_df_cond1_1 = calculate_correlations_with_significance(
            difference_df_stylo_cond1[genAI_1][auteur],
            difference_df_cond1[genAI_1][auteur]
        )
        correlation_df_cond1_2 = calculate_correlations_with_significance(
            difference_df_stylo_cond1[genAI_2][auteur],
            difference_df_cond1[genAI_2][auteur]
        )

        # The sample size N for each comparison (Tuffery vs a specific genAI for a specific author)
        # Number of Tuffery texts = 96
        # Number of texts for each genAI for a specific author (Proust, Celine, or Yourcenar) = 288 / 3 = 96
        # So, the sample size N for each comparison is 96 * 96 = 9216.
        n_sample = 9216

        z_score, p_value = compare_correlations(
            correlation_df_cond1_1["Correlation"].values[0], n_sample,
            correlation_df_cond1_2["Correlation"].values[0], n_sample
        )

        print(f"\t z = {z_score:.4f} \t p = {p_value:.2e}")

[1m Pour Proust [0m
Entre mistral et gemini : 


KeyError: 'mistral'

## Per feature, per genAI, per author : correlation, p-value

In [None]:
#### DISPERSION DELTA ANALYSIS FOR 2 COMPARISONS


## Humans vs genAI

# Dispersion delta
Tuffery_distances = mean_distances_df[mean_distances_df['genAI'] == 'No']['Mean_Distance_From_Centroid'].values

GenAI_distances = {}
difference_df_cond1 = {}


for genAI in ["mistral", "gpt", "gemini"]:

    GenAI_distances[genAI] = {}
    difference_df_cond1[genAI] = {}
    mean_distances_genAI = mean_distances_df[mean_distances_df['genAI'] == genAI]

    for auteur in ["Proust", "Celine", "Yourcenar"]:
        GenAI_distances[genAI][auteur] = mean_distances_genAI[mean_distances_genAI['author'] == auteur]['Mean_Distance_From_Centroid'].values

        # List to store the results
        difference_results = []

        for i, ref_dist in enumerate(Tuffery_distances):
            for j, gen_dist in enumerate(GenAI_distances[genAI][auteur]):
                difference = ref_dist - gen_dist
                difference_results.append({
                    'Tuffery_Index': i,
                    f'{genAI}_{auteur}_Index': j,
                    'Difference': difference
                })

        difference_df_cond1[genAI][auteur] = pd.DataFrame(difference_results)

In [None]:
# Style delta calculation

Mistral_df = stylo_df[stylo_df['genAI'] == 'mistral']
Gpt_df = stylo_df[stylo_df['genAI'] == 'gpt']
Gemini_df = stylo_df[stylo_df['genAI'] == 'gemini']
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']

GenAI_stylo = {}
difference_df_stylo_cond1 = {}

# List of stylistic features
features = ['Letters', 'TAG', 'NER', 'Structural', 'Indexes']
# features = ['Mean']

for genAI in ["mistral", "gpt", "gemini"]:

    GenAI_stylo[genAI] = {}
    difference_df_stylo_cond1[genAI] = {}
    stylo_genAI = stylo_df[stylo_df['genAI'] == genAI]

    for auteur in ["Proust", "Celine", "Yourcenar"]:
        GenAI_stylo[genAI][auteur] = stylo_genAI[stylo_genAI['author'] == auteur]

        # List to store the results
        difference_results_stylo = []

        for feature in features:
            for i, ref_val in enumerate(Tuffery_df[feature].values):
                for j, gen_val in enumerate(GenAI_stylo[genAI][auteur][feature].values):
                    difference = ref_val - gen_val
                    difference_results_stylo.append({
                        'Feature': feature,
                        'Mistral_Index': i,
                        f'{genAI}_{auteur}_Index': j,
                        'Difference': difference
                    })
        difference_df_stylo_cond1[genAI][auteur] = pd.DataFrame(difference_results_stylo)

In [None]:
for genAI in ["mistral", "gpt", "gemini"]:

    for auteur in ["Proust", "Celine", "Yourcenar"]:

        correlation_df_cond1 = calculate_correlations_with_significance(difference_df_stylo_cond1[genAI][auteur], difference_df_cond1[genAI][auteur])

        print(f"{genAI} {auteur}")
        print(correlation_df_cond1.to_string())

In [None]:
for genAI in ["mistral", "gpt", "gemini"]:
    # Concatenate DataFrames across all authors
    stylo_all_authors = pd.concat([
        difference_df_stylo_cond1[genAI][auteur] for auteur in ["Proust", "Celine", "Yourcenar"]
    ])
    cond1_all_authors = pd.concat([
        difference_df_cond1[genAI][auteur] for auteur in ["Proust", "Celine", "Yourcenar"]
    ])

    # Now calculate the correlation
    correlation_df_cond1 = calculate_correlations_with_significance(stylo_all_authors, cond1_all_authors)

    print(f"{genAI} (All authors)")
    print(correlation_df_cond1.to_string())


### To latex

In [None]:
Proust = calculate_correlations_with_significance(difference_df_stylo_cond1['gemini']['Proust'], difference_df_cond1['gemini']['Proust'])
Celine = calculate_correlations_with_significance(difference_df_stylo_cond1['gemini']['Celine'], difference_df_cond1['gemini']['Celine'])
Yourcenar = calculate_correlations_with_significance(difference_df_stylo_cond1['gemini']['Yourcenar'], difference_df_cond1['gemini']['Yourcenar'])



# Standard list of features (in desired order)
features = ['Function words', 'Letters', 'Numbers', 'TAG', 'NER', 'Structural', 'Punctuation', 'Indexes']

# Ensure each dataframe is indexed by 'Feature' for easy lookup
Proust.set_index('Feature', inplace=True)
Celine.set_index('Feature', inplace=True)
Yourcenar.set_index('Feature', inplace=True)

# Start LaTeX table
latex = []
latex.append(r" & \multicolumn{2}{|c|}{Proust} & \multicolumn{2}{|c|}{Celine} & \multicolumn{2}{|c|}{Yourcenar} \\")
latex.append(r"\hline")
latex.append(r"Feature & Correlation & P-value & Correlation & P-value & Correlation & P-value  \\")
latex.append(r"\hline")

# Populate rows
for feature in features:
    row = [feature]
    for df in [Proust, Celine, Yourcenar]:
        corr = df.loc[feature, 'Correlation'] if feature in df.index else ''
        pval = df.loc[feature, 'P-value'] if feature in df.index else ''
        psymb= df.loc[feature, 'P-value_symbole'] if feature in df.index else ''
        row.extend([f"{corr:.4f}", f"{pval:.1e} ({psymb})"])
    latex.append(" & ".join(row) + r" \\")  # row end
latex.append(r"\hline")

# Join and print LaTeX code
latex_table = "\n".join(latex)
print(latex_table)