In [2]:
import os
import sys
import numpy as np
import pandas as pd

# Name of the dataset folder inside TMDATA and of the Excel workbook
# (without extension) that this notebook loads.
dataset = "NDE"
database = "NDE"

print(f"Current working directory: {os.getcwd()}")
# The data is read from the Box folder stored under the current user's home directory.
BOX_DIR = os.path.join(os.path.expanduser("~"), "Library", "CloudStorage", "Box-Box", "TMDATA")
print(f"Retrieving data from BOX, locally stored at: {BOX_DIR}")
DATA_DIR = os.path.join(BOX_DIR, dataset)
print(f"Data directory: {DATA_DIR}")


# List the directory once and reuse the result, so the count and the names
# printed below always describe the same snapshot of the folder.
reports_file = os.listdir(DATA_DIR)
print(f"Files for {dataset} dataset (n={len(reports_file)}): {reports_file}")

# Load the workbook "<database>.xlsx" from the data directory into a DataFrame.
raw_file_path = os.path.join(DATA_DIR, database + ".xlsx")
database_file = pd.read_excel(raw_file_path)

Current working directory: /Users/rbeaute/Projects/MOSAIC
Retrieving data from BOX, locally stored at: /Users/rbeaute/Library/CloudStorage/Box-Box/TMDATA
Data directory: /Users/rbeaute/Library/CloudStorage/Box-Box/TMDATA/NDE
Files for NDE dataset (n=5): ['NDE under anesthesia.docx', 'NDE.xlsx', 'translations', 'anesthesia.xlsx', 'NDE_reports.csv']


In [3]:
# Print every column name of the loaded workbook as a plain Python list.
print(database_file.columns.to_list())

['ID', 'Age', 'Age at interview', 'Age at NDE', 'Age at interview (Greyson)', 'Culture of origin', 'Sex', 'Date of interview', 'Date of NDE', 'Educ level', 'Educ level at NDE', 'Nationality', 'Language', 'Etiology group', 'Etiology (Greyson)', 'ICU stay', 'Loss of consciousness', 'Coma', 'Precipitating factor', 'Time since NDE', 'Time since NDE (Greyson)', "Subject's origin", 'MEQ30 item1', 'MEQ30 item2', 'MEQ30 item3', 'MEQ30 item4', 'MEQ30 item5', 'MEQ30 item6', 'MEQ30 item7', 'MEQ30 item8', 'MEQ30 item9', 'MEQ30 item10', 'MEQ30 item11', 'MEQ30 item12', 'MEQ30 item13', 'MEQ30 item14', 'MEQ30 item15', 'MEQ30 item16', 'MEQ30 item17', 'MEQ30 item18', 'MEQ30 item19', 'MEQ30 item20', 'MEQ30 item21', 'MEQ30 item22', 'MEQ30 item23', 'MEQ30 item24', 'MEQ30 item25', 'MEQ30 item26', 'MEQ30 item27', 'MEQ30 item28', 'MEQ30 item29', 'MEQ30 item30', 'Total score MEQ30', 'Mystical subscore', 'Inefability', 'Positive mood subscore', 'Transcendence of time subscore', 'Complete mystical experience', '

In [4]:
# Summarise the 'Language' column: distinct values and how many rows carry each one.
language_column = database_file['Language']

num_languages = language_column.nunique(dropna=True)
print(f"Number of different languages: {num_languages}")
print("Languages:", language_column.dropna().unique())

# Row counts per language, with missing values excluded.
language_counts = language_column.value_counts(dropna=True)
print("Number of reports per language:")
print(language_counts)

Number of different languages: 3
Languages: ['French' 'Flemish/Dutch' 'English']
Number of reports per language:
Language
English          1023
French            529
Flemish/Dutch     121
Name: count, dtype: int64


In [5]:
# Show the loaded DataFrame as the cell output (pandas truncates rows/columns for display).
# NOTE(review): the rendered output includes participant IDs and narrative text —
# consider clearing this output before sharing the notebook.
database_file

Unnamed: 0,ID,Age,Age at interview,Age at NDE,Age at interview (Greyson),Culture of origin,Sex,Date of interview,Date of NDE,Educ level,...,Questionaires::MCQ8,Questionaires::MCQ9,Questionaires::MCQ10,Questionaires::MCQ11,Questionaires::MCQ12,Questionaires::MCQ13,Questionaires::MCQ14,Questionaires::MCQ15,Questionaires::MCQ16,NDE Narrative
0,DT07_fmp1,,43,27,,,1.0,2001-05-25,1986-01-01,,...,,,,,,,,,,Mon expérience se déroule en plusieurs expérie...
1,IG14_fmp2,,42,31,,,2.0,2002-01-23,1990-12-27,,...,,,,,,,,,,"Le vingt-sept décembre 1990, à huit heures, lo..."
2,JYB16_fmp3,,50,49,,,1.0,2003-03-15,2002-05-20,,...,,,,,,,,,,"Saint Laurent-Nouan, dans le Loir et Cher, le ..."
3,PR23_fmp4,,50,34,,,1.0,2009-05-06,1993-07-01,,...,,,,,,,,,,Circonstances: je déclare une varicelle (contr...
4,VV24_fmp5,,36,14,,,2.0,2009-08-05,1987-07-01,,...,,,,,,,,,,"Un soir de Juillet, dans notre maison de campa..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694,JH_fmp1695,,66,37,,,1.0,2021-01-28,1991-06-15,0.0,...,1.0,7.0,7.0,7.0,7.0,4.0,7.0,7.0,7.0,"Ik ben een gepensioneerde ontwikkelingswerker,..."
1695,DS_fmp1696,,54,8,,occidental,1.0,2021-01-29,1974-12-31,11.0,...,2.0,7.0,7.0,7.0,7.0,7.0,4.0,4.0,3.0,FAIT\nje revenais de chez des amis de mes pare...
1696,DJ_fmp1697,,73,62,,occidental,1.0,2021-04-17,2009-07-15,9.0,...,7.0,7.0,7.0,1.0,7.0,7.0,7.0,1.0,7.0,
1697,LF_fmp1698,,63,62,,oriental,2.0,2020-03-05,2019-03-01,17.0,...,7.0,7.0,7.0,7.0,7.0,1.0,2.0,7.0,2.0,


In [6]:
# Pull the free-text narrative column out of the loaded workbook.
reports = database_file["NDE Narrative"]

# A report is kept when it is not NaN and is not an empty / whitespace-only string.
has_value = reports.notna()
has_text = reports.str.strip() != ''
reports_nonempty = reports[has_value & has_text]
print(f"Number of non-empty reports: {len(reports_nonempty)}")

# Write the kept reports to "<database>_reports.csv" in the data directory,
# as a single "report" column and without the DataFrame index.
output_file_path = os.path.join(DATA_DIR, database + "_reports.csv")
reports_nonempty.to_csv(output_file_path, header=["report"], index=False)

Number of non-empty reports: 1457


In [7]:
# Save the non-empty reports along with their corresponding Language values to a CSV file
reports_with_language = database_file.loc[reports_nonempty.index, ["NDE Narrative", "Language"]]
output_file_path_with_lang = os.path.join(DATA_DIR, database + "_reports_with_language.csv")
reports_with_language.to_csv(output_file_path_with_lang, index=False, header=["report", "Language"])