In [3]:
import pandas as pd

# Data Structure Overview

This dataset is organized into three primary folders:

- `Biodiversity`
- `Caribbean`
- `Gulf OSW`

Each folder contains raw data related to Fuzzy Cognitive Maps (FCMs) generated from interviews.

---

## Folder Structure
**Note: the interview folder names below are placeholders; the actual folders use interview IDs (e.g. `BD001` in `Biodiversity`).**
```text
/data/
├── Biodiversity/
│   ├── Interview1_FCM/
│   │   └── raw_interview_notes.txt
│   ├── Interview2_FCM/
│   │   └── raw_interview_notes.txt
│   └── Biodiversity_FCMs.xlsx
│
├── Caribbean/
│   ├── InterviewA_FCM/
│   │   └── raw_interview_notes.txt
│   ├── InterviewB_FCM/
│   │   └── raw_interview_notes.txt
│   └── Caribbean_FCMs.xlsx
│
└── Gulf OSW/
    ├── InterviewX_FCM/
    │   └── raw_interview_notes.txt
    ├── InterviewY_FCM/
    │   └── raw_interview_notes.txt
    └── Gulf_OSW_FCMs.xlsx
```



---

## Excel File Structure

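Each top-level folder contains an Excel file named `<Folder>_FCMs.xlsx`, with spaces in the folder name replaced by underscores (e.g. `Biodiversity/Biodiversity_FCMs.xlsx`, `Gulf OSW/Gulf_OSW_FCMs.xlsx`). These Excel files contain the structured FCMs in matrix form.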

- Each sheet in the `.xlsx` file corresponds to a specific FCM. Pass `sheet_name=None` to `pd.read_excel` to load all sheets at once as a dictionary keyed by sheet name, as shown below.
- The name of the sheet matches the name of the folder containing the raw interview data.

### Matrix Format

Each sheet is formatted as a matrix:

- **Columns** represent **source nodes**
- **Rows** represent **target nodes**

---

## Summary

This structure supports:

- Easy traceability between raw interview data and the corresponding FCMs.
- Convenient access to the structured matrix form of each FCM for downstream processing and analysis.




In [4]:
## Example retrieval from Biodiversity

In [15]:
# sheet_name=None loads every sheet into a dict keyed by sheet name;
# index_col=0 uses each sheet's first column as the row labels.
bio_dict = pd.read_excel(
    'Biodiversity/Biodiversity_FCMs.xlsx',
    sheet_name=None,
    index_col=0,
)

In [16]:
# Show the sheet names (the dict keys); per the overview above, each sheet
# name matches the name of the folder holding that interview's raw data.
bio_dict.keys()

dict_keys(['BD001', 'BD002', 'BD003', 'BD004', 'BD006', 'BD007', 'BD008', 'BD009', 'BD010', 'BD011', 'BD012', 'BD013', 'BD014', 'BD015', 'BD016', 'BD017', 'BD019', 'BD020', 'BD021', 'BD022', 'BD023', 'BD024', 'BD025', 'BD026', 'BD027', 'BD028', 'BD029', 'BD030', 'BD031', 'BD032', 'BD033', 'BD034', 'BD035', 'BD036', 'BD037', 'BD038', 'BD039', 'BD040', 'BD041', 'BD042', 'BD043', 'BD044', 'BD045', 'BD046', 'BD049', 'BD050', 'BD051', 'BD053', 'BD054', 'BD056', 'BD057', 'BD058', 'BD059', 'BD060', 'BD061', 'BD063', 'BD064', 'BD068', 'BD070', 'BD072', 'BD074', 'BD075', 'BD076', 'BD077', 'BD078', 'BD079', 'BD080', 'BD081', 'BD082', 'BD083', 'BD084', 'BD085', 'BD086', 'BD087', 'BD088', 'BD089', 'BD090', 'BD091', 'BD092'])

In [19]:
# Look up a single FCM matrix by its sheet name.
bio_dict['BD001']

Unnamed: 0,species diversity,genetic diversity,community diversity,diversity of human communities,resilience,stressors,ecosystem function,human health/well-being,adaptation,quantifiable benefits,biodiversity data,baseline ecosystem data,societal preference (for target biodiversity) data,effective management policies,understanding of biodiversity impacts
species diversity,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
genetic diversity,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
community diversity,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
diversity of human communities,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
resilience,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
stressors,0,0,0,0,-1,0,-1,0,0,0,0,0,0,0,0
ecosystem function,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0
human health/well-being,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
adaptation,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
quantifiable benefits,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [9]:
import os
import pypandoc
from docx import Document

def read_all_text_files(folder_path, separator=None):
    """Read the text of every supported document found in a folder.

    Entries are visited in sorted filename order so the result is
    reproducible (``os.listdir`` itself returns names in arbitrary order).
    Handling depends on the file extension:

    - ``.txt``  -- read as UTF-8 text.
    - ``.docx`` -- paragraph texts joined with newlines (python-docx).
    - ``.doc``  -- converted to plain text with pypandoc; if the conversion
      raises, the error is printed and the file is skipped.

    Entries with any other extension are ignored.

    Parameters
    ----------
    folder_path : str
        Folder whose entries are read.
    separator : str, optional
        When given, the collected texts are joined with this string and a
        single ``str`` is returned. With the default ``None`` the texts are
        returned as a list, one item per file.

    Returns
    -------
    list of str, or str
        One text per readable file, or the joined text when ``separator``
        is provided.
    """
    contents = []

    # Sort so that indexing into the result (e.g. result[0]) is stable
    # across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        if filename.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                contents.append(f.read())

        elif filename.endswith('.docx'):
            doc = Document(file_path)
            contents.append("\n".join(para.text for para in doc.paragraphs))

        elif filename.endswith('.doc'):
            # NOTE(review): pandoc's documented input formats include docx but
            # apparently not the legacy binary .doc format -- confirm this
            # branch can succeed; failures are reported and skipped below.
            try:
                # Convert .doc to plain text using pandoc
                contents.append(pypandoc.convert_file(file_path, 'plain'))
            except Exception as e:
                print(f"❌ Could not read {filename}: {e}")

    # The original ended with `contents.join(...)`, which raises
    # AttributeError because lists have no `join`; joining is done with
    # `str.join` and only when the caller asks for a single string.
    if separator is None:
        return contents
    return separator.join(contents)


In [10]:
# Read the raw interview documents stored in the BD001 interview folder.
full_text = read_all_text_files('Biodiversity/BD001')

In [19]:
# Preview the first 100 characters of the first entry in full_text.
full_text[0][:100]

"Interviewer: Okay, so our first part will be just quantifying participant expertise. If it's okay wi"