# TL;DR – Too Long, Doctor

TL;DR is an ML model designed to summarize and cluster scientific papers. Tailored for both students and researchers seeking to optimize their study time, TL;DR provides a tool to quickly grasp the essence of complex scientific material. It also caters to those who want a concise summary or a preliminary overview of a paper before delving into a detailed reading.

# Importing libraries

In [1]:
# Import library to extract data from XML file
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os
import json

In [2]:
import torch
from transformers import BertTokenizer

# Dataset Generation

In [4]:
def extract_information_from_xml_general(xml_path):
    """
    Extract Title, Abstract, Body, and Keyword Group from a scientific paper in XML format.

    Elements are looked up anywhere in the document by their un-namespaced tag
    names (article-title, abstract, body, kwd-group/kwd). For every element the
    text of the element itself and of all its descendants is concatenated, so
    inline markup (e.g. <italic>, <sup>) does not truncate the extracted text.

    Parameters:
    xml_path (str): Path to the XML file.

    Returns:
    dict | None: A dictionary with the keys 'Title', 'Abstract', 'Body' (strings,
        empty when the element is missing) and 'Keyword Group' (list of strings).
        None is returned when the file cannot be read or parsed; the error is
        printed in that case.
    """
    try:
        # Parse the XML file
        root = ET.parse(xml_path).getroot()

        # Initialize a dictionary to hold the extracted information
        extracted_info = {'Title': '', 'Abstract': '', 'Body': '', 'Keyword Group': []}

        # Prefer the title located inside <title-group>, then fall back to the
        # first <article-title> anywhere in the document. The comparison must be
        # an explicit "is None": an Element without child elements is falsy, so
        # chaining the lookups with "or" would discard a plain-text title.
        title_element = root.find('.//title-group/article-title')
        if title_element is None:
            title_element = root.find('.//article-title')
        if title_element is not None:
            extracted_info['Title'] = ''.join(title_element.itertext())

        # Abstract and Body: first matching element, full text content
        for key, tag_path in (('Abstract', './/abstract'), ('Body', './/body')):
            element = root.find(tag_path)
            if element is not None:
                extracted_info[key] = ''.join(element.itertext())

        # Keywords: gather the full text of every <kwd> in every <kwd-group>.
        # itertext() is used instead of .text, which is None (or truncated) when
        # the keyword starts with or contains inline markup.
        for kwd_group in root.findall('.//kwd-group'):
            for kwd in kwd_group.findall('.//kwd'):
                keyword = ''.join(kwd.itertext())
                # Skip empty keywords so the list never contains None or blanks
                if keyword.strip():
                    extracted_info['Keyword Group'].append(keyword)

        return extracted_info
    except ET.ParseError as e:
        print(f"XML parsing error: {e}")
        return None
    except Exception as e:
        # Best-effort extraction: report any other failure (e.g. a missing file)
        # and signal it to the caller with None instead of raising.
        print(f"Unexpected error: {e}")
        return None

In [5]:
def extract_information_and_write_to_json_file(xml_path, target_directory='./data/json/'):
    """
    Extracts Title, Abstract, Body, and Keyword Group from a scientific paper in XML format and writes them to a JSON file.
    The JSON file has the same base name as the XML file, with a .json extension, and is placed in target_directory.

    Parameters:
    xml_path (str): Path to the XML file.
    target_directory (str): Directory that receives the JSON file; created if it does not exist.
        Defaults to './data/json/'.

    Returns:
    str: A status message, either the path of the written JSON file or a description of the error.
        No exception is propagated to the caller.
    """
    try:
        # Use the previously defined function to extract information
        extracted_info = extract_information_from_xml_general(xml_path)

        # The extractor signals failure with None; do not write a "null" JSON
        # file and report success in that case.
        if extracted_info is None:
            return f"Error during JSON file writing: no information could be extracted from {xml_path}"

        os.makedirs(target_directory, exist_ok=True)

        # Generate the output JSON file path by changing the extension
        base_filename = os.path.basename(os.path.splitext(xml_path)[0])
        output_json_path = os.path.join(target_directory, base_filename + '.json')

        # Write the dictionary to a JSON file (UTF-8, non-ASCII characters kept as-is)
        with open(output_json_path, 'w', encoding='utf-8') as json_file:
            json.dump(extracted_info, json_file, ensure_ascii=False, indent=4)

        return f"JSON file has been written to {output_json_path}"
    except Exception as e:
        return f"Error during JSON file writing: {e}"

xml_directory = "./data/"

# Keep only the directory entries whose name ends with the .xml extension
xml_files = list(filter(lambda name: name.endswith('.xml'), os.listdir(xml_directory)))

# Convert every XML document found above into its JSON counterpart
for xml_file in xml_files:
    extract_information_and_write_to_json_file(os.path.join(xml_directory, xml_file))

In [3]:
# def extract_data_to_json(file):
#     # Create a dictionary to store the data
#     data = {}
#     # Parse the XML file
#     tree = ET.parse(file)
#     # Get the root of the XML file
#     root = tree.getroot()

#     # Initialize abstract data
#     data['abstract'] = {}

#     # Initialize body data
#     data['body'] = []

#     # Initialize keywords data
#     data['keywords'] = []

#     # Extract title and abstract
#     article_meta = root.find('.//article-meta')
#     if article_meta is not None:
#         title_group = article_meta.find('title-group')
#         data['title'] = title_group.find('article-title').text if title_group is not None else None

#         abstract_section = article_meta.find('abstract')
#         if abstract_section is not None:
#             for section in abstract_section.findall('sec'):
#                 section_title = section.find('title').text if section.find('title') is not None else ''
#                 section_text = section.find('p').text if section.find('p') is not None else ''
#                 if 'simple summary' in section_title.lower():
#                     data['abstract']['simple_summary'] = section_text
#                 elif 'abstract' in section_title.lower():
#                     data['abstract']['abstract'] = section_text

#         # Extract keywords
#         kwd_group = article_meta.find('kwd-group')
#         if kwd_group is not None:
#             data['keywords'] = [kwd.text for kwd in kwd_group.findall('kwd') if kwd.text]

#     # Extract body sections
#     body_section = root.find('body')
#     if body_section is not None:
#         for sec in body_section.findall('sec'):
#             section_data = {
#                 'title': sec.find('title').text if sec.find('title') is not None else None,
#                 'content': [p.text for p in sec.findall('p') if p.text]
#             }
#             data['body'].append(section_data)

#     # Convert the extracted data to a JSON string
#     json_data = json.dumps(data, ensure_ascii=False, indent=4)

#     # Return the JSON-formatted string
#     return json_data

In [None]:
# xml_directory = "./data/"
# json_directory = "./data/json"

# # Ensure JSON directory exists
# os.makedirs(json_directory, exist_ok=True)

# # List all the XML files in the directory
# xml_files = [f for f in os.listdir(xml_directory) if f.endswith('.xml')]

# # Loop through each file and process it
# for xml_file in xml_files:
#     json_data = extract_data_to_json(xml_directory+xml_file)
#     if json_data is not None:
#         json_file_name = xml_file.replace('.xml', '.json')
#         json_output_path = os.path.join(json_directory, json_file_name)

#         with open(json_output_path, 'w', encoding='utf-8') as json_file:
#             json_file.write(json_data)
#         print(f"Wrote data to {json_output_path}")
#     else:
#         print(f"Skipped incomplete article: {xml_file}")


# Text Classification

For the Text Classification task we will use BERT.

## Pre-Processing

Preparing keywords column

Preparing body column

Let's check the dataframe to make sure there are no anomalies.

## Modelling