## Imports

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import norm

## Methods

In [30]:
def calculate_correlations_with_significance(difference_df_stylo, difference_df):
    """Correlate each stylometric feature's differences with the distance differences.

    For every distinct value of ``difference_df_stylo["Feature"]`` (in order of
    first appearance), the ``"Difference"`` values of that feature are compared
    with ``difference_df["Difference"]`` using ``scipy.stats.pearsonr``. Both
    inputs must therefore hold the same number of rows per feature, in the
    same pair order.

    Returns:
        A DataFrame with one row per feature and the columns ``Feature``,
        ``Correlation``, ``P-value_symbole`` (``'**'`` when p < 0.01, ``'*'``
        when p < 0.05, ``''`` otherwise) and ``P-value``.
    """
    feature_names = []
    coefficients = []
    significance_marks = []
    p_values = []

    feature_column = difference_df_stylo["Feature"]
    for name in feature_column.unique():
        # Differences belonging to the current feature only.
        feature_differences = difference_df_stylo.loc[feature_column == name, "Difference"]

        coefficient, p = pearsonr(feature_differences, difference_df["Difference"])

        # Significance marker: two stars below 1 %, one star below 5 %.
        mark = '**' if p < 0.01 else ('*' if p < 0.05 else '')

        feature_names.append(name)
        coefficients.append(coefficient)
        significance_marks.append(mark)
        p_values.append(p)

    return pd.DataFrame({
        'Feature': feature_names,
        'Correlation': coefficients,
        'P-value_symbole': significance_marks,
        'P-value': p_values,
    })

In [31]:
def fisher_z(r):
  """Return the Fisher z-transform ``0.5 * ln((1 + r) / (1 - r))`` of ``r``.

  ``r`` may be a scalar or a NumPy array; the arithmetic and ``np.log`` are
  applied elementwise.
  """
  ratio = (1 + r) / (1 - r)
  return 0.5 * np.log(ratio)

In [None]:
def compare_correlations(r1, n1, r2, n2):
  """
  Two-sided z-test for the difference between two correlation coefficients.

  r1, r2 : raw (untransformed) correlation coefficients; the Fisher
           z-transform is applied inside this function, so do NOT pass
           already-transformed values
  n1, n2 : sample sizes behind r1 and r2; each must be greater than 3,
           otherwise the standard error below is undefined

  Returns (z, p): the absolute z statistic and its two-sided p-value.
  """
  z1 = fisher_z(r1)
  z2 = fisher_z(r2)
  # Standard error of the difference of two Fisher-transformed correlations.
  se = np.sqrt(1/(n1 - 3) + 1/(n2 - 3))
  z = np.abs(z1 - z2) / se
  # sf(z) equals 1 - cdf(z) but keeps precision in the far tail, where
  # 1 - cdf(z) rounds to exactly 0.
  p = 2 * norm.sf(z)
  return z, p

## Dataset

### Stylometry

In [34]:
# Load the stylometry workbook; the path is relative to the notebook's working directory.
stylo_df_fr = pd.read_excel('./stylo_terreau_df.xlsx')

In [35]:
# Reference texts (author Tuffery) and every generated text (genAI other than "No").
stylo_df_tuffery_ref = stylo_df_fr.loc[stylo_df_fr['author'] == "Tuffery"]
stylo_df_style_gen = stylo_df_fr.loc[stylo_df_fr['genAI'] != "No"]

# Stack both subsets, tidy the column names, drop the identifier columns and
# tag each row as A1 (not generated) or A2 (generated by one of the models).
stylo_df_TS_gen = (
    pd.concat([stylo_df_tuffery_ref, stylo_df_style_gen], axis=0)
    .rename(columns={"id": "file_name", "Unnamed: 0": "id", 'Indexes': 'Entropy'})
    .drop(columns=["text_id", "id"])
    .assign(**{
        "class": lambda frame: frame["genAI"].map(
            {"No": "A1", "mistral": "A2", "gpt": "A2", "gemini": "A2"}
        )
    })
)

# Short alias used by the rest of the notebook (same object, not a copy).
stylo_df = stylo_df_TS_gen

In [36]:
# Preview the first rows of the combined reference + generated stylometry table.
stylo_df.head()

Unnamed: 0,author,genAI,Punctuation,TAG,Structural,Entropy,NER,Letters,file_name,class
960,Tuffery,No,0.01951,0.195833,8.794737,7.159736,0.083333,0.029505,tuffery_agitato_atrabile.txt,A1
961,Tuffery,No,0.011617,0.436869,22.03807,7.771713,0.454545,0.029591,tuffery_agitato_lamentabile.txt,A1
962,Tuffery,No,0.011944,0.407407,18.214147,6.612949,0.222222,0.028414,tuffery_anacephaleose.txt,A1
963,Tuffery,No,0.011217,0.200617,9.13342,6.714099,0.092593,0.028595,tuffery_anadiploses_epanadiploses.txt,A1
964,Tuffery,No,0.010989,1.333333,59.19,5.093733,1.0,0.031382,tuffery_anaphore.txt,A1


In [37]:
# Normalize
# Min-max scale every raw numeric column. Columns created by an earlier run of
# this cell ("*_normalized" and "Mean") are excluded so that re-running the cell
# does not create "*_normalized_normalized" columns.
all_numeric_cols = stylo_df.select_dtypes(include=['number']).columns
numeric_cols = all_numeric_cols[
    [not str(col).endswith("_normalized") and col != "Mean" for col in all_numeric_cols]
]

for col in numeric_cols:
    col_min = stylo_df[col].min()
    col_range = stylo_df[col].max() - col_min
    # NOTE: a constant column has col_range == 0 and therefore yields NaN.
    stylo_df[f"{col}_normalized"] = (stylo_df[col] - col_min) / col_range

# Mean
# Row-wise average of the normalized features listed below
# ("Punctuation_normalized" is not part of this list).
columns_to_average = ["Structural_normalized",
                      "Letters_normalized",
                      "TAG_normalized",
                      "NER_normalized",
                      "Entropy_normalized",
                      ]
stylo_df["Mean"] = stylo_df[columns_to_average].mean(axis=1)

stylo_df.head()

Unnamed: 0,author,genAI,Punctuation,TAG,Structural,Entropy,NER,Letters,file_name,class,Punctuation_normalized,TAG_normalized,Structural_normalized,Entropy_normalized,NER_normalized,Letters_normalized,Mean
960,Tuffery,No,0.01951,0.195833,8.794737,7.159736,0.083333,0.029505,tuffery_agitato_atrabile.txt,A1,0.239607,0.029706,0.048711,0.830128,0.041667,0.252477,0.240538
961,Tuffery,No,0.011617,0.436869,22.03807,7.771713,0.454545,0.029591,tuffery_agitato_lamentabile.txt,A1,0.142667,0.080749,0.151192,0.901083,0.227273,0.2646,0.324979
962,Tuffery,No,0.011944,0.407407,18.214147,6.612949,0.222222,0.028414,tuffery_anacephaleose.txt,A1,0.146682,0.07451,0.121601,0.766732,0.111111,0.099577,0.234706
963,Tuffery,No,0.011217,0.200617,9.13342,6.714099,0.092593,0.028595,tuffery_anadiploses_epanadiploses.txt,A1,0.137753,0.030719,0.051332,0.778459,0.046296,0.124931,0.206347
964,Tuffery,No,0.010989,1.333333,59.19,5.093733,1.0,0.031382,tuffery_anaphore.txt,A1,0.134959,0.270588,0.438685,0.590588,0.5,0.515662,0.463105


### Distance between embeddings

In [38]:
mean_distances_df = pd.read_excel('./distance_pertext_umap_TS_gen.xlsx')

# A "Class" label is either a bare author name (mapped to genAI "No") or
# "<author>_<model>" for a text generated by <model> in the style of <author>.
imitated_authors = ["Proust", "Celine", "Yourcenar"]
genai_models = ["mistral", "gpt", "gemini"]

class_to_genai = {
    **{label: "No" for label in imitated_authors + ["Tuffery"]},
    **{f"{writer}_{engine}": engine for engine in genai_models for writer in imitated_authors},
}
class_to_author = {
    **{label: label for label in imitated_authors + ["Tuffery"]},
    **{f"{writer}_{engine}": writer for engine in genai_models for writer in imitated_authors},
}

mean_distances_df["genAI"] = mean_distances_df["Class"].map(class_to_genai)
mean_distances_df["author"] = mean_distances_df["Class"].map(class_to_author)

# A1 = not generated, A2 = generated by any of the models.
mean_distances_df["class"] = mean_distances_df["genAI"].map(
    {"No": "A1", "mistral": "A2", "gpt": "A2", "gemini": "A2"}
)
mean_distances_df = mean_distances_df.drop(columns=["Class"])

In [39]:
# Preview the per-text centroid distances with the derived genAI / author / class columns.
mean_distances_df.head()

Unnamed: 0,Text_Index,Mean_Distance_From_Centroid,genAI,author,class
0,0,1.493536,No,Tuffery,A1
1,1,3.086941,No,Tuffery,A1
2,2,1.641952,No,Tuffery,A1
3,3,1.457144,No,Tuffery,A1
4,4,1.759076,No,Tuffery,A1


## Aggregated features, per author, all genAI : correlation, p-value

In [40]:
# NOTE(review): the reference ("Tuffery") sample is selected with genAI == 'No',
# not author == 'Tuffery'. The two only coincide if the workbook contains no
# other human-written texts -- confirm against the data.
Tuffery_distances = mean_distances_df[mean_distances_df['genAI'] == 'No']['Mean_Distance_From_Centroid'].values
Proust_distances = mean_distances_df[mean_distances_df['author'] == 'Proust']['Mean_Distance_From_Centroid'].values
Celine_distances = mean_distances_df[mean_distances_df['author'] == 'Celine']['Mean_Distance_From_Centroid'].values
Yourcenar_distances = mean_distances_df[mean_distances_df['author'] == 'Yourcenar']['Mean_Distance_From_Centroid'].values


def _pairwise_distance_differences(ref_distances, gen_distances, gen_label):
    """Return every (reference - generated) distance difference as a DataFrame.

    Rows are ordered reference-major: all generated texts for reference 0,
    then all generated texts for reference 1, and so on. The columns are
    ``Tuffery_Index``, ``<gen_label>_Index`` and ``Difference``.
    """
    ref_idx, gen_idx = np.indices((len(ref_distances), len(gen_distances)), dtype=np.int64)
    return pd.DataFrame({
        'Tuffery_Index': ref_idx.ravel(),
        f'{gen_label}_Index': gen_idx.ravel(),
        # outer[i, j] = ref_distances[i] - gen_distances[j]
        'Difference': np.subtract.outer(ref_distances, gen_distances).ravel(),
    })


difference_df_cond1_proust = _pairwise_distance_differences(Tuffery_distances, Proust_distances, 'Proust')
difference_df_cond1_celine = _pairwise_distance_differences(Tuffery_distances, Celine_distances, 'Celine')
difference_df_cond1_yourcenar = _pairwise_distance_differences(Tuffery_distances, Yourcenar_distances, 'Yourcenar')

In [41]:
Proust_df = stylo_df[stylo_df['author'] == 'Proust']
Celine_df = stylo_df[stylo_df['author'] == 'Celine']
Yourcenar_df = stylo_df[stylo_df['author'] == 'Yourcenar']
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']
features = ['Mean']


def _pairwise_feature_differences(ref_df, gen_df, feature_names, gen_label):
    """Return every (reference - generated) difference for each listed feature.

    For each feature, rows are ordered reference-major (all rows of ``gen_df``
    for reference row 0, then for reference row 1, ...). ``Tuffery_Index`` is
    the position of the row in ``ref_df`` and ``<gen_label>_Index`` the
    position of the row in ``gen_df``.
    """
    frames = []
    for feature in feature_names:
        ref_vals = ref_df[feature].values
        gen_vals = gen_df[feature].values
        ref_idx, gen_idx = np.indices((len(ref_vals), len(gen_vals)), dtype=np.int64)
        frames.append(pd.DataFrame({
            'Feature': feature,
            'Tuffery_Index': ref_idx.ravel(),
            f'{gen_label}_Index': gen_idx.ravel(),
            # outer[i, j] = ref_vals[i] - gen_vals[j]
            'Difference': np.subtract.outer(ref_vals, gen_vals).ravel(),
        }))
    return pd.concat(frames, ignore_index=True)


# The reference rows are the outer index and the author's rows the inner index,
# so the index columns are labelled accordingly (previously the two labels were swapped).
difference_df_stylo_cond1_proust = _pairwise_feature_differences(Tuffery_df, Proust_df, features, 'Proust')
difference_df_stylo_cond1_celine = _pairwise_feature_differences(Tuffery_df, Celine_df, features, 'Celine')
difference_df_stylo_cond1_yourcenar = _pairwise_feature_differences(Tuffery_df, Yourcenar_df, features, 'Yourcenar')


In [42]:
# One correlation table per imitated author; the result columns get an
# author suffix so the three tables can be joined side by side on 'Feature'.
correlation_df_cond1_proust = calculate_correlations_with_significance(
    difference_df_stylo_cond1_proust, difference_df_cond1_proust
).rename(columns={
    'Correlation': 'Correlation_proust',
    'P-value': 'P-value_proust',
    'P-value_symbole': 'P-value_symbole_proust',
})

correlation_df_cond1_celine = calculate_correlations_with_significance(
    difference_df_stylo_cond1_celine, difference_df_cond1_celine
).rename(columns={
    'Correlation': 'Correlation_celine',
    'P-value': 'P-value_celine',
    'P-value_symbole': 'P-value_symbole_celine',
})

correlation_df_cond1_yourcenar = calculate_correlations_with_significance(
    difference_df_stylo_cond1_yourcenar, difference_df_cond1_yourcenar
).rename(columns={
    'Correlation': 'Correlation_yourcenar',
    'P-value': 'P-value_yourcenar',
    'P-value_symbole': 'P-value_symbole_yourcenar',
})

correlation_df_cond1 = pd.merge(
    pd.merge(correlation_df_cond1_proust, correlation_df_cond1_celine, on='Feature'),
    correlation_df_cond1_yourcenar,
    on='Feature',
)
print(correlation_df_cond1.to_string())

  Feature  Correlation_proust P-value_symbole_proust  P-value_proust  Correlation_celine P-value_symbole_celine  P-value_celine  Correlation_yourcenar P-value_symbole_yourcenar  P-value_yourcenar
0    Mean            0.149073                     **   3.925072e-137            0.120099                     **    2.398459e-89               0.086032                        **       1.398676e-46


## Per feature, per author, all genAI : correlation, p-value

In [None]:
# Distances of the reference texts (rows whose genAI flag is 'No').
Tuffery_distances = mean_distances_df.loc[
    mean_distances_df['genAI'] == 'No', 'Mean_Distance_From_Centroid'
].values

# Per imitated author: the raw distances and the table of pairwise differences.
GenAI_distances = {}
difference_df_cond1 = {}

for author in ["Proust", "Celine", "Yourcenar"]:
    GenAI_distances[author] = mean_distances_df.loc[
        mean_distances_df['author'] == author, 'Mean_Distance_From_Centroid'
    ].values

    # Every (reference, author) pair, reference-major order.
    difference_results = [
        {
            'Tuffery_Index': i,
            f'{author}_Index': j,
            'Difference': ref_dist - gen_dist,
        }
        for i, ref_dist in enumerate(Tuffery_distances)
        for j, gen_dist in enumerate(GenAI_distances[author])
    ]

    difference_df_cond1[author] = pd.DataFrame(difference_results)

In [None]:
# Per-model subsets; not used in this cell, kept because they are defined at notebook level.
Mistral_df = stylo_df[stylo_df['genAI'] == 'mistral']
Gpt_df = stylo_df[stylo_df['genAI'] == 'gpt']
Gemini_df = stylo_df[stylo_df['genAI'] == 'gemini']
# Reference rows (genAI flag 'No').
Tuffery_df = stylo_df[stylo_df['genAI'] == 'No']

GenAI_stylo = {}
difference_df_stylo_cond1 = {}

features = ['Letters', 'TAG', 'NER', 'Structural', 'Entropy']

for author in ["Proust", "Celine", "Yourcenar"]:
    GenAI_stylo[author] = stylo_df[stylo_df['author'] == author]

    per_feature_frames = []

    for feature in features:
        ref_vals = Tuffery_df[feature].values
        gen_vals = GenAI_stylo[author][feature].values
        ref_idx, gen_idx = np.indices((len(ref_vals), len(gen_vals)), dtype=np.int64)
        per_feature_frames.append(pd.DataFrame({
            'Feature': feature,
            # The outer index runs over the Tuffery reference rows
            # (this column was previously mislabelled 'Mistral_Index').
            'Tuffery_Index': ref_idx.ravel(),
            f'{author}_Index': gen_idx.ravel(),
            # outer[i, j] = ref_vals[i] - gen_vals[j], flattened reference-major.
            'Difference': np.subtract.outer(ref_vals, gen_vals).ravel(),
        }))
    difference_df_stylo_cond1[author] = pd.concat(per_feature_frames, ignore_index=True)

In [None]:
# Print the per-feature correlation table of each imitated author.
for author in ("Proust", "Celine", "Yourcenar"):
    correlation_df_cond1 = calculate_correlations_with_significance(
        difference_df_stylo_cond1[author],
        difference_df_cond1[author],
    )

    print(f"\n {author}\n ")
    print(correlation_df_cond1.to_string())


 Proust
 
      Feature  Correlation P-value_symbole        P-value
0     Letters     0.082084              **   1.510172e-42
1         TAG     0.043438              **   4.980796e-13
2         NER     0.175239              **  1.563527e-189
3  Structural     0.070063              **   1.955254e-31
4     Entropy     0.113122              **   2.038254e-79

 Celine
 
      Feature  Correlation P-value_symbole        P-value
0     Letters     0.142667              **  1.190345e-125
1         TAG    -0.000534                   9.292923e-01
2         NER     0.172099              **  9.011091e-183
3  Structural     0.007936                   1.870174e-01
4     Entropy     0.069710              **   3.901578e-31

 Yourcenar
 
      Feature  Correlation P-value_symbole       P-value
0     Letters     0.119905              **  4.628675e-89
1         TAG     0.019284              **  1.343164e-03
2         NER     0.119233              **  4.431204e-88
3  Structural     0.012493              