In [22]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from src.utils import load_config
from src.data_tools.data_utils import load_features_and_meta
from src.stats_analysis.utils_statistics import compute_khi_2_table, compute_anova_table, compute_mwu_table
# Project configuration; later cells pass it to the data loader and read
# config['data'][...] from it to locate input files.
config = load_config()

In [23]:
# Table used by every cell below. Going by the helper's name it combines the
# extracted features with participant metadata (see src.data_tools.data_utils).
data = load_features_and_meta(config)

In [24]:
# Quick look at the available columns (pandas truncates the display for wide frames).
data.columns

Index(['uuid', 'age', 'sexe', 'code_insee', 'education_degree', 'single',
       'living_alone', 'exp_critereA', 'PTSD_probable',
       'partial_PTSD_probable',
       ...
       'ON_we', 'ON_someone', 'Enunciative_PRESENT', 'Generical_PRESENT',
       'Historical_PRESENT', 'VERB_SENSORY_PERCEPTIONS', 'model_BODY',
       'NOM_PERCEPTIONS_SENSORIELLES', 'model_PHYSICAL_SENSATIONS',
       'agentivity'],
      dtype='object', length=103)

## Data description
### Reproducing Tables 1 and 2 of the paper

#### Table 1. Socio-economic information of our cohort and statistical association with criterion A
In our cohort, criterion A takes only two values: A1 (direct exposure to the trauma) and A2 (witnessing the trauma).

In [25]:
# Group sizes for criterion A (A1 vs A2); these are the denominators of the
# percentages reported in the cells below.
data['exp_critereA'].value_counts()

exp_critereA
A1    110
A2     38
Name: count, dtype: int64

In [26]:
# Sex distribution (%) within each criterion-A group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('exp_critereA')['sexe'].value_counts(normalize=True).mul(100).round(1)

exp_critereA  sexe
A1            F       55.5
              M       44.5
A2            F       68.4
              M       31.6
Name: proportion, dtype: float64

In [27]:
# Summary statistics of age within each criterion-A group, rounded to 3 decimals.
(
    data
    .groupby('exp_critereA')['age']
    .describe()
    .round(decimals=3)
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
exp_critereA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A1,110.0,37.018,8.986,20.0,30.0,36.0,41.0,65.0
A2,38.0,41.579,12.411,20.0,32.25,41.5,49.5,66.0


In [28]:
# Occupational category (`code_insee`) distribution (%) within each criterion-A group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('exp_critereA')['code_insee'].value_counts(normalize=True).mul(100).round(1)

exp_critereA  code_insee                                  
A1            MANAGERS AND HIGHER INTELLECTUAL PROFESSIONS    48.2
              INTERMEDIATE PROFESSIONS                        20.9
              EMPLOYEES                                        8.2
              UNEMPLOYED                                       7.3
              NR                                               6.4
              CRAFTSMEN AND MERCHANTS                          3.6
              STUDENTS                                         3.6
              RETIRED                                          0.9
              WORKERS                                          0.9
A2            MANAGERS AND HIGHER INTELLECTUAL PROFESSIONS    52.6
              CRAFTSMEN AND MERCHANTS                         18.4
              INTERMEDIATE PROFESSIONS                        15.8
              STUDENTS                                         7.9
              EMPLOYEES                                        2.6
   

In [29]:
# Education level distribution (%) within each criterion-A group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('exp_critereA')['education_degree'].value_counts(normalize=True).mul(100).round(1)

exp_critereA  education_degree   
A1            master or higher       59.1
              bachelor               27.3
              high school or less    11.8
              other                   1.8
A2            master or higher       57.9
              bachelor               21.1
              high school or less    21.1
Name: proportion, dtype: float64

In [30]:
# Distribution (%) of the 0/1 `single` flag within each criterion-A group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('exp_critereA')['single'].value_counts(normalize=True).mul(100).round(1)

exp_critereA  single
A1            1         61.8
              0         38.2
A2            1         63.2
              0         36.8
Name: proportion, dtype: float64

In [31]:
# Distribution (%) of the 0/1 `living_alone` flag within each criterion-A group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('exp_critereA')['living_alone'].value_counts(normalize=True).mul(100).round(1)

exp_critereA  living_alone
A1            0               67.3
              1               32.7
A2            0               68.4
              1               31.6
Name: proportion, dtype: float64

### Statistical analysis

In [32]:
# Chi-square association between criterion A and each categorical covariate.
# `seuil` is French for "threshold"; 1 presumably keeps every row regardless of its
# p-value -- TODO confirm in src.stats_analysis.utils_statistics.
# NOTE(review): several `code_insee` categories hold only ~1% of a group, so the
# chi-square approximation may be unreliable for that row -- confirm.
compute_khi_2_table(
    data,
    ['exp_critereA'],
    ['sexe', 'code_insee', 'education_degree', 'single', 'living_alone'],
    seuil=1,
)

Unnamed: 0,x,y,chi2,dof,pval,cramer,power
0,exp_critereA,sexe,1.461262,1.0,0.226729,0.099365,0.22705
1,exp_critereA,code_insee,17.315344,8.0,0.026988,0.342046,0.863348
2,exp_critereA,education_degree,2.820749,3.0,0.420096,0.138055,0.259646
3,exp_critereA,single,0.002227,1.0,0.962364,0.003879,0.050255
4,exp_critereA,living_alone,0.004986,1.0,0.943704,0.005804,0.050571


In [33]:
# ANOVA of age across the criterion-A groups.
# `seuil` is French for "threshold"; 1 presumably keeps every row regardless of its
# p-value -- TODO confirm in src.stats_analysis.utils_statistics.
compute_anova_table(
    data,
    ['exp_critereA'],
    ['age'],
    seuil=1,
)

Unnamed: 0,x,y,p-unc,np2,power
0,exp_critereA,age,0.016225,0.038935,0.932097


#### Table 2. Socio-economic information of our cohort and statistical association with PTSD

In [34]:
# Group sizes for the three-level PTSD label (values 0, 1, 2 -- presumably
# none / partial / full; confirm against the data dictionary).
data['full_and_partial_PTSD'].value_counts()

full_and_partial_PTSD
2    70
1    42
0    36
Name: count, dtype: int64

In [35]:
# Sex distribution (%) within each PTSD group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('full_and_partial_PTSD')['sexe'].value_counts(normalize=True).mul(100).round(1)

full_and_partial_PTSD  sexe
0                      F       50.0
                       M       50.0
1                      F       71.4
                       M       28.6
2                      F       55.7
                       M       44.3
Name: proportion, dtype: float64

In [36]:
# Occupational category (`code_insee`) distribution (%) within each PTSD group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('full_and_partial_PTSD')['code_insee'].value_counts(normalize=True).mul(100).round(1)

full_and_partial_PTSD  code_insee                                  
0                      MANAGERS AND HIGHER INTELLECTUAL PROFESSIONS    52.8
                       INTERMEDIATE PROFESSIONS                        27.8
                       CRAFTSMEN AND MERCHANTS                         11.1
                       RETIRED                                          5.6
                       EMPLOYEES                                        2.8
1                      MANAGERS AND HIGHER INTELLECTUAL PROFESSIONS    52.4
                       INTERMEDIATE PROFESSIONS                        14.3
                       EMPLOYEES                                       11.9
                       CRAFTSMEN AND MERCHANTS                          7.1
                       NR                                               4.8
                       STUDENTS                                         4.8
                       UNEMPLOYED                                       2.4
                    

In [37]:
# Education level distribution (%) within each PTSD group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('full_and_partial_PTSD')['education_degree'].value_counts(normalize=True).mul(100).round(1)

full_and_partial_PTSD  education_degree   
0                      master or higher       61.1
                       bachelor               25.0
                       high school or less    13.9
1                      master or higher       50.0
                       bachelor               33.3
                       high school or less    16.7
2                      master or higher       62.9
                       bachelor               21.4
                       high school or less    12.9
                       other                   2.9
Name: proportion, dtype: float64

In [38]:
# Distribution (%) of the 0/1 `living_alone` flag within each PTSD group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('full_and_partial_PTSD')['living_alone'].value_counts(normalize=True).mul(100).round(1)

full_and_partial_PTSD  living_alone
0                      0               52.8
                       1               47.2
1                      0               69.0
                       1               31.0
2                      0               74.3
                       1               25.7
Name: proportion, dtype: float64

In [39]:
# Distribution (%) of the 0/1 `single` flag within each PTSD group.
# Scale to percent first, then round: rounding before `* 100` can leave float noise
# in the stored values (e.g. 0.57 * 100 == 56.99999999999999).
data.groupby('full_and_partial_PTSD')['single'].value_counts(normalize=True).mul(100).round(1)

full_and_partial_PTSD  single
0                      0         50.0
                       1         50.0
1                      1         66.7
                       0         33.3
2                      1         65.7
                       0         34.3
Name: proportion, dtype: float64

#### Statistical analysis

In [40]:
# Chi-square association between the PTSD label and each categorical covariate.
# `seuil` is French for "threshold"; 1 presumably keeps every row regardless of its
# p-value -- TODO confirm in src.stats_analysis.utils_statistics.
# NOTE(review): several `code_insee` categories are very sparse within a group, so the
# chi-square approximation may be unreliable for that row -- confirm.
compute_khi_2_table(
    data,
    ['full_and_partial_PTSD'],
    ['sexe', 'code_insee', 'education_degree', 'single', 'living_alone'],
    seuil=1,
)

Unnamed: 0,x,y,chi2,dof,pval,cramer,power
0,full_and_partial_PTSD,sexe,4.190325,2.0,0.12305,0.168265,0.432742
1,full_and_partial_PTSD,code_insee,25.030858,16.0,0.069283,0.290798,0.560784
2,full_and_partial_PTSD,education_degree,4.73948,6.0,0.577634,0.126538,0.167544
3,full_and_partial_PTSD,single,3.001804,2.0,0.222929,0.142417,0.321697
4,full_and_partial_PTSD,living_alone,5.077124,2.0,0.07898,0.185216,0.510165


In [41]:
# ANOVA of age across the PTSD groups.
# `seuil` is French for "threshold"; 1 presumably keeps every row regardless of its
# p-value -- TODO confirm in src.stats_analysis.utils_statistics.
compute_anova_table(
    data,
    ['full_and_partial_PTSD'],
    ['age'],
    seuil=1,
)

Unnamed: 0,x,y,p-unc,np2,power
0,full_and_partial_PTSD,age,0.015199,0.05611,0.997541


### Evaluation of the human expert

### Table 3. Evaluation of the blinded ratings of a human expert (clinical psychiatrist)

In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score,recall_score, precision_score, f1_score, classification_report, roc_auc_score, balanced_accuracy_score
import os

# Ratings given by the human expert, read from the file named in the config.
human_eval = pd.read_csv(
    os.path.join(config['data']['data_folder'], config['data']["human_evaluation_filename"])
)

# Default (inner) join on `uuid`: only participants present in both tables are evaluated.
merged = human_eval.merge(data, on='uuid')


def _as_binary_rating(ratings):
    """Convert a column of human ratings to integers.

    "?" (presumably an undecided rating) and missing values are counted as 0,
    i.e. as a negative rating.
    """
    # Blank out "?" explicitly, then parse. `.replace("?", 0)` on an object column
    # depends on pandas' deprecated implicit downcasting.
    decided = ratings.mask(ratings == "?")
    return pd.to_numeric(decided).fillna(0).astype('int')


def _rounded(score):
    """Round a metric to 2 decimals, whether it is a NumPy scalar or a plain float.

    A plain Python float has no `.round()` method, so the built-in `round` is used.
    """
    return round(float(score), 2)


# (label printed in the report, reference column, human-rating column).
# Spelled out explicitly instead of being derived from the length of the label.
rated_targets = [(crit, f"{crit}_probable", crit) for crit in ["CB", "CC", "CD", "CE", "CG"]]
rated_targets.append(("full_or_partial", "full_or_partial_PTSD", "full_or_partial_PTSD_human"))

for col, true_col, pred_col in rated_targets:
    true = merged[true_col]
    pred = _as_binary_rating(merged[pred_col])

    print("### Col : ", col)
    # With hard 0/1 ratings, ROC AUC equals the mean of recall and specificity.
    print(f"AUC Score, {_rounded(roc_auc_score(true, pred))}")
    # Label spelling (sic) kept as-is so the recorded output still matches.
    print(f"Precison Score, {_rounded(precision_score(true, pred))}")
    print(f"Recall Score, {_rounded(recall_score(true, pred))}")
    # Specificity is the recall of the negative class.
    print(f"Specificity Score, {_rounded(recall_score(true, pred, pos_label=0))}")

### Col :  CB
AUC Score, 0.74
Precison Score, 0.9
Recall Score, 0.71
Specificity Score, 0.76
### Col :  CC
AUC Score, 0.58
Precison Score, 0.56
Recall Score, 0.55
Specificity Score, 0.61
### Col :  CD
AUC Score, 0.73
Precison Score, 0.81
Recall Score, 0.77
Specificity Score, 0.68
### Col :  CE
AUC Score, 0.8
Precison Score, 0.9
Recall Score, 0.88
Specificity Score, 0.71
### Col :  CG
AUC Score, 0.73
Precison Score, 0.88
Recall Score, 0.64
Specificity Score, 0.82
### Col :  full_or_partial
AUC Score, 0.71
Precison Score, 0.78
Recall Score, 0.76
Specificity Score, 0.67
