## Identifying fields for processing the abstracts

### XML Field Path Extractor

First extracting all unique XML tag paths from the abstracts XML files within the directory.

In [18]:
import os
import xml.etree.ElementTree as ET

def get_all_paths(root, current_path):
    paths = set()
    for elem in list(root):
        path = f"{current_path}/{elem.tag}" if current_path else elem.tag
        paths.add(path)
        paths.update(get_all_paths(elem, path))
    return paths

def extract_all_field_paths(directory_path):
    all_paths = set()
    for xml_file in os.listdir(directory_path):
        if xml_file.endswith('.xml'):
            file_path = os.path.join(directory_path, xml_file)
            tree = ET.parse(file_path)
            root = tree.getroot()
            file_paths = get_all_paths(root, '')
            all_paths.update(file_paths)
    return all_paths

directory_path = "../abstracts_2020/"
all_field_paths = extract_all_field_paths(directory_path)
sorted_field_paths = sorted(list(all_field_paths))
# sorted_field_paths # Uncomment if you want to see the list :)

### Relevant fields for the model
Here are the fields from the list that are likely to be most relevant:

- 'Award/AbstractNarration': This contains the summary of the research which is central to understanding the topic of the paper.
- 'Award/AwardTitle': This is likely to contain keywords or phrases indicative of the research topic.
- 'Award/Organization/Directorate/LongName': The directorate name could indicate a broad area of research.
- 'Award/Organization/Division/LongName': Similar to the directorate, the division name could indicate a more specific area of research within the broader directorate.
- 'Award/ProgramElement/Text': If present, this might contain text about the program that funded the research, which could be related to the research topic.
- 'Award/ProgramReference/Text': This could also provide context about the research topic based on the program reference.

Other fields, particularly those containing names, addresses, numerical codes, and contact information, are less likely to be relevant for text-based topic clustering. They are typically used for administrative or identification purposes and do not contribute to the semantic content of the abstracts so for this case we will avoid them.

'Award/AwardTitle', 'Award/Organization/Directorate/LongName', 'Award/Organization/Division/LongName', 'Award/ProgramElement/Text', 'Award/ProgramReference/Text', 'Award/AbstractNarration'

### Preprocessing
- First parse the files in order to get a dictionary. Here we can define wich fields will be returned for making possible to get as minimum as possible.
- Second clean the text; by taking out the stop words (NLTK library) even if is needed to aggregate custom words that are unlikely to be useful; by lowering and keeping just characters between a-z and space; by finally lematizing (NTKL library).

In [19]:
import os
import re
import xml.etree.ElementTree as ET
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Ensure that the necessary NLTK data is available (uncomment if necessary)
# nltk.download('wordnet')
# nltk.download('stopwords')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def parse_xml_for_fields(file_path: str, fields) -> dict:
    """
    Function to parse the XML for specified fields and return a dictionary
    with field names and their corresponding text
    """
    tree = ET.parse(file_path)
    root = tree.getroot()
    field_texts = {}
    for field in fields:
        elements = root.findall('.//' + field.replace('/', '/'))
        combined_text = " ".join([elem.text.strip() for elem in elements if elem.text is not None])
        field_texts[field] = combined_text if combined_text else "N/A"
    return field_texts


def clean_and_lemmatize_text(text: str) -> str:
    """Function to clean and lemmatize text"""
    stop_words_set = set(stopwords.words('english'))
    # stop_words.update(stopwords.words('spanish')) ## Use this line (or more like it) for other languages
    # additional_words = ['customword1', 'customword2', 'customword3'] # some words to avoid (if known)
    # stop_words.update(additional_words)
    text_lower = text.lower()
    text_cleaned = re.sub(r'[^a-z\s]', '', text_lower)
    words = text_cleaned.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words_set]
    return ' '.join(lemmatized_words)


def process_files(directory_path, fields):
    """
    Function to process all XML files in a directory and extract text from specified fields
    """
    xml_files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.xml')]
    processed_data = []
    for file_path in xml_files:
        field_texts = parse_xml_for_fields(file_path, fields)
        for field, text in field_texts.items():
            field_texts[field] = clean_and_lemmatize_text(text)
        processed_data.append(field_texts)
    return processed_data

In [20]:
# Fields to be extracted from the XML file
fields = [
    'Award/AwardTitle',
    'Organization/Directorate/LongName',
    'Organization/Division/LongName',
    'ProgramElement/Text',
    'ProgramReference/Text',
    'AbstractNarration'
]

# Directory path where XML files are located
directory_path = "../abstracts_2020/"

# Process the XML files and extract the data
processed_data = process_files(directory_path, fields)

### Getting some info about the processed files and words 

In [21]:
print('files processed', len(processed_data))
# processed_data[0:4]

# Function to count the total words for each field
def count_total_words(processed_data, fields):
    word_counts = {field: 0 for field in fields}
    for data in processed_data:
        for field in fields:
            word_counts[field] += len(data[field].split()) if data[field] != "N/A" else 0
    return word_counts

# Count the words for each field in processed_data
word_counts = count_total_words(processed_data, fields)

# Print out the total words for each field
for field, count in word_counts.items():
    print(f"Total words in {field}: {count}")


files processed 13300
Total words in Award/AwardTitle: 123835
Total words in Organization/Directorate/LongName: 45857
Total words in Organization/Division/LongName: 47358
Total words in ProgramElement/Text: 43556
Total words in ProgramReference/Text: 86616
Total words in AbstractNarration: 3396389


### Looking some of the data for specific abstracts 

In [22]:
abstract_number_to_look = 42
for i in range(0,6):
    print(processed_data[abstract_number_to_look][fields[i]])

supporting early learning computational thinking using mixed reality technology
directorate stem education
division research learning
itestinov tech exp stu teac
covid impact existing activity broaden particip stem resrch ehr cl opportunity nsf
project introduce young child computational thinking involves breaking complex problem manageable piece identifying step sequence solve problem generalizing solution solve similar problem problemsolving skill foundational cut across science technology engineering mathematics stem domain discipline project research develop innovative mixed reality mr learning environment combining visual display robot programmable movement child grade k program robot solve pathfinding problem hint obstacle problem scenario presented computer tablet project environment designed mainly oneonone use wirelessly flexibly either formal informal setting participating child teacher recruited partnership school community traditionally underrepresented stem ltbrgtltbrgtthe

## EDA step to analyze word distribution and visualize data

- Analyzing the distribution of words and phrases using a word frequency count.
- Visualizing the data using word clouds and possibly bar charts to display the most common words.

In [23]:
from collections import Counter

# Function to calculate word frequency in given fields of a single processed_data object or all objects
def get_word_frequency(processed_data, fields_to_frequency_count, single_object_index=None):
    word_freq = Counter()
    
    # If a single_object_index is provided, work with a single object; otherwise, use all objects
    data_to_process = [processed_data[single_object_index]] if single_object_index is not None else processed_data
    
    for data in data_to_process:
        for field in fields_to_frequency_count:
            text = data[field] if field in data else ""
            word_freq.update(text.split())
    
    return word_freq


In [24]:
# Fields to calculate word frequencies
fields_to_frequency_count_total = ['Award/AwardTitle', 'Organization/Directorate/LongName',
                             'Organization/Division/LongName', 'ProgramElement/Text',
                             'ProgramReference/Text', 'AbstractNarration']

fields_to_frequency_count = ['Award/AwardTitle', 'Organization/Directorate/LongName',
                             'Organization/Division/LongName', 'ProgramElement/Text',
                             'ProgramReference/Text']

# To get word frequency for all objects
all_data_word_freq_all_fields = get_word_frequency(processed_data, fields_to_frequency_count_total)
all_data_word_freq_no_narration = get_word_frequency(processed_data, fields_to_frequency_count)

In [25]:
all_data_word_freq_no_narration

Counter({'research': 9008,
         'science': 7235,
         'division': 7081,
         'direct': 7078,
         'education': 5201,
         'directorate': 5031,
         'div': 3832,
         'system': 3745,
         'collaborative': 3728,
         'mathematical': 3529,
         'engineering': 3195,
         'scie': 3159,
         'exp': 3152,
         'computer': 2943,
         'physical': 2812,
         're': 2557,
         'covid': 2525,
         'stem': 2505,
         'undergraduate': 2493,
         'scien': 2459,
         'career': 2384,
         'tech': 2332,
         'info': 2282,
         'biological': 2227,
         'social': 2087,
         'na': 2001,
         'enginr': 1991,
         'rapid': 1874,
         'geosciences': 1823,
         'prog': 1810,
         'material': 1759,
         'economic': 1702,
         'comp': 1654,
         'stim': 1644,
         'sci': 1608,
         'impact': 1569,
         'learning': 1555,
         'innovation': 1552,
         'phase': 1528,

In [26]:
# To get word frequency for the a single object (experimental)
index_value = 12030
single_data_word_freq = get_word_frequency(processed_data, fields_to_frequency_count, single_object_index=index_value)

single_data_word_freq

Counter({'behavior': 2,
         'involvement': 2,
         'eavesdropping': 1,
         'sound': 1,
         'localization': 1,
         'lesson': 1,
         'small': 1,
         'auditory': 1,
         'specialist': 1,
         'direct': 1,
         'biological': 1,
         'science': 1,
         'division': 1,
         'integrative': 1,
         'organismal': 1,
         'system': 1,
         'animal': 1,
         'minority': 1,
         'bio': 1,
         'undergraduate': 1,
         'education': 1,
         'graduate': 1,
         'reu': 1,
         'suppres': 1,
         'exp': 1,
         'ugrd': 1,
         'supp': 1})