importing libraries

In [None]:
# Import library to extract data from XML file
import xml.etree.ElementTree as ET
import os
import pandas as pd

import xml.etree.ElementTree as ET

extract_data function definition

In [None]:

"""
    Extracts data from an XML file and returns it as a dictionary.

    Args:
        file (str): The path to the XML file.

    Returns:
        dict: A dictionary containing the extracted data.
"""
def extract_data(file):
    # Create a dictionary to store the data
    data = {}
    # Parse the XML file
    tree = ET.parse(file)
    # Get the root of the XML file
    root = tree.getroot()

    # Initialize abstract data
    data['abstract'] = {}

    # Initialize body data
    data['body'] = []

    # Extract title and abstract
    article_meta = root.find('.//article-meta')
    if article_meta is not None:
        title_group = article_meta.find('title-group')
        data['title'] = title_group.find('article-title').text if title_group is not None else None

        abstract_section = article_meta.find('abstract')
        if abstract_section is not None:
            for section in abstract_section.findall('sec'):
                section_title = section.find('title').text if section.find('title') is not None else ''
                section_text = section.find('p').text if section.find('p') is not None else ''
                if 'simple summary' in section_title.lower():
                    data['abstract']['simple_summary'] = section_text
                elif 'abstract' in section_title.lower():
                    data['abstract']['abstract'] = section_text

    # Extract body sections
    body_section = root.find('body')
    if body_section is not None:
        for sec in body_section.findall('sec'):
            section_data = {
                'title': sec.find('title').text if sec.find('title') is not None else None,
                'content': [p.text for p in sec.findall('p') if p.text]
            }
            data['body'].append(section_data)

    # Extract references
    references_section = root.find('.//ref-list')
    if references_section is not None:
        data['references'] = [ET.tostring(reference, encoding='unicode') for reference in references_section.findall('ref')]

    # Return the extracted data
    return data

creating the dataframe

In [None]:
# Extract data from every XML file in the data directory
# List that collects one extracted record per file
data = []
# Directory that holds the XML files
path = './data'

# Sorted so the record order is reproducible; os.listdir order is arbitrary
files = sorted(os.listdir(path))

# Loop through the directory entries
for file in files:
    # Skip entries that are not XML files (sub-directories, hidden or
    # auxiliary files); the XML parser would fail on them.
    if not file.lower().endswith('.xml'):
        continue
    # Extract data from the XML file
    data.append(extract_data(os.path.join(path, file)))

In [None]:
# Build a DataFrame with one row per extracted article record
df = pd.DataFrame(data=data)
# Write the rows to 'data.json' as a JSON array of record objects
df.to_json(path_or_buf='data.json', orient='records')

importing libraries to preprocess data

In [None]:
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Load the spaCy 'en_core_web_sm' English pipeline
nlp = spacy.load('en_core_web_sm')

defining preprocessing functions

In [None]:
def tokenize(text):
    """Split *text* into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def normalize(text):
    """Lowercase *text* and keep only letters and whitespace.

    Every other character (punctuation, digits, symbols) is removed without
    inserting a replacement, so ``"well-known"`` becomes ``"wellknown"``.
    """
    kept = []
    for ch in text.lower():
        # Drop anything that is neither alphabetic nor whitespace
        if not (ch.isalpha() or ch.isspace()):
            continue
        kept.append(ch)
    return ''.join(kept)

def remove_stopwords(tokens):
    """Return the tokens that are not NLTK English stopwords, in order.

    The comparison is case-sensitive.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def stem(tokens):
    """Return the Porter stem of every token, preserving order."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

def lemmatize(text):
    """Run *text* through the spaCy pipeline and return each token's lemma."""
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return lemmas

actual preprocessing

In [None]:
# Print the title, abstract and body of every extracted article
for article in data:
    # .get() so a record without a 'title' key prints None instead of
    # raising KeyError
    print(article.get('title'))
    # print the abstract of the article
    print(article['abstract'])
    # print the body of the article
    print(article['body'])

In [None]:
def _preprocess_text(text):
    """Run the preprocessing pipeline on raw text and return its tokens.

    Steps, in order: normalize, tokenize, remove stopwords, stem, lemmatize.
    """
    tokens = tokenize(normalize(text))
    tokens = stem(remove_stopwords(tokens))
    # lemmatize() takes a string, so the stems are re-joined with spaces
    return lemmatize(' '.join(tokens))


pre_processed_data = []

for article in data:
    # The abstract is a dictionary with up to two keys: simple_summary and
    # abstract. Use whichever are present and non-empty.
    abstract_parts = [
        article['abstract'].get(key)
        for key in ('simple_summary', 'abstract')
    ]
    # Join with a space so the last word of the summary does not fuse with
    # the first word of the abstract.
    abstract = ' '.join(part for part in abstract_parts if part)

    # If the abstract is empty, skip the article
    if not abstract:
        continue

    abstract = _preprocess_text(abstract)

    # Join the paragraphs of every body section, with a space between
    # sections too, so words at section boundaries stay separate.
    body = ' '.join(
        ' '.join(section['content']) for section in article['body']
    )
    body = _preprocess_text(body)

    # Create a dictionary to store the data
    article_data = {
        'abstract': abstract,
        'body': body,
    }

    # Append the dictionary to the list
    pre_processed_data.append(article_data)