# Prepare training data

This notebook creates a dataframe of training data for the taxonomy described in the paper "Evaluation methods and replicability of software architecture research objects". The authors provide a BibTeX file of annotated papers; the data in that file needs to be transformed into a dataframe. This notebook also downloads the paper abstracts.

In [11]:
import os

# Folder that holds every artefact produced by this notebook.
data_dir = os.path.join('data', 'software_architecture')
# Create it up front (no error if it already exists) so the export at the end has a target.
os.makedirs(data_dir, exist_ok=True)

# CSV file the finished dataframe is written to.
output_file = os.path.join(data_dir, 'bib-text.csv')

## Load and parse the labeled BibTeX file

In [12]:
import urllib.request

bibtex_url = "https://gitlab.com/SoftwareArchitectureResearch/StateOfPractice/-/raw/main/Investigated%20Papers.bib"

# Download the annotated bibliography. The context manager closes the HTTP
# connection as soon as the body has been read, and the timeout keeps the cell
# from hanging indefinitely on a stalled connection.
with urllib.request.urlopen(bibtex_url, timeout=30) as response:
    contents = response.read()  # raw bytes, handed to the parser unchanged

In [13]:
import bibtexparser

# Parse the downloaded BibTeX source; the parsed records are read from `library.entries` further down.
library = bibtexparser.loads(contents)

## Create and fill the dataframe with BibTeX data

In [14]:
import pandas as pd

# One (initially empty) value list per dataframe column; every column name is
# also the name of the bib entry field it is filled from.
df_input = {
    field: []
    for field in ('author', 'title', 'abstract', 'doi', 'classes', 'url')
}

# Walk the entries once and extend each column; a field that an entry does not
# define is recorded as None so all columns keep the same length.
for entry in library.entries:
    for field, values in df_input.items():
        values.append(entry[field] if field in entry else None)

df = pd.DataFrame(df_input)

## Scrape paper abstracts

In [15]:
import re
import json
import requests


def _load_embedded_json(pattern: str, page: str, flags: int = 0):
    """
    Decode the JSON text captured by group 1 of the first match of a pattern
    :param pattern: Regular expression with the JSON text in capture group 1
    :param page: Text to search
    :param flags: Flags passed to re.search
    :return: Decoded JSON value, or None if nothing matched or the capture is not valid JSON
    """
    match = re.search(pattern, page, flags)
    if match is None:
        return None

    try:
        return json.loads(match.group(1))
    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
        return None


def get_abstract(url: str):
    """
    Scrape paper abstracts from URL

    Best effort: every failure (no URL, network error, non-OK response,
    unrecognised or malformed page) yields None instead of an exception, so a
    single bad page cannot abort a long scraping run.

    :param url: Source URL (None is accepted and yields None)
    :return: Paper abstract, or None if it could not be retrieved
    """

    if url is None:
        return None

    # The https://doi.org website returns a 418 if the user agent is not set
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.114 Safari/537.36'
    }

    # Drop the backslashes present in some source URLs before requesting them
    url = url.replace('\\', '')
    try:
        response = requests.get(url, headers=headers, timeout=5, allow_redirects=True)
    except requests.RequestException:
        # Timeouts, connection errors, invalid URLs, ... are treated like a failed download
        return None
    if not response.ok:
        return None

    page = response.text
    if 'xplGlobal.document.metadata' in page:
        # Metadata is a JavaScript assignment terminated by ';' at the end of a line
        metadata = _load_embedded_json(
            r'xplGlobal\.document\.metadata=(.+?);$', page, re.MULTILINE)
        key = 'abstract'
    elif '<script type="application/ld+json">' in page:
        # JSON-LD blocks may span several lines, so '.' must also match newlines
        metadata = _load_embedded_json(
            r'<script type="application/ld\+json">(.+?)</script>', page, re.MULTILINE | re.DOTALL)
        key = 'description'
    else:
        return None

    # The embedded JSON is not guaranteed to be an object or to carry the key
    if not isinstance(metadata, dict):
        return None
    return metadata.get(key)

In [16]:
from tqdm import tqdm

# Fetch the abstract of every paper and store it in the 'abstract' column.
for inx, row in tqdm(df.iterrows(), total=len(df)):
    source = row['url']

    # Entries without a URL (value is not a str) are resolved through their DOI instead
    if not isinstance(source, str):
        doi = row['doi']
        # Without a DOI there is nothing to resolve; get_abstract(None) returns None
        source = f'https://doi.org/{doi}' if isinstance(doi, str) else None

    abstract = get_abstract(source)
    if not abstract:
        print(f"Failed to get abstract for {source}")

    # Assign through a single indexer: the chained form df['abstract'][inx] = ...
    # may write to a temporary copy and leave df unchanged (always so under
    # pandas Copy-on-Write).
    df.at[inx, 'abstract'] = abstract

100%|██████████| 153/153 [02:36<00:00,  1.02s/it]


In [17]:
# Persist the dataframe to the output path; index=False leaves the row index out of the CSV.
df.to_csv(output_file, index=False)
# A bare trailing expression makes the notebook render the dataframe as the cell output.
df

Unnamed: 0,author,title,abstract,doi,classes,url
0,Alessio Bucaioni and Patrizio Pelliccione and ...,Aligning Architecture with Business Goals in t...,When designing complex automotive systems in p...,10.1109/ICSA51549.2021.00020,"Meta Data{Research Level{Primary Research}, Ki...",https://doi.org/10.1109/ICSA51549.2021.00020
1,H{\'{e}}ctor Cadavid and Vasilios Andrikopoulo...,System- and Software-level Architecting Harmon...,The problems caused by the gap between system-...,10.1109/ICSA51549.2021.00010,"Meta Data{Kind{full}, Paper class{Evaluation R...",https://doi.org/10.1109/ICSA51549.2021.00010
2,Joshua Garcia and Mehdi Mirakhorli and Lu Xiao...,Constructing a Shared Infrastructure for Softw...,Over the past three decades software engineeri...,10.1109/ICSA51549.2021.00022,"Meta Data{Paper class{Evaluation Research}, Re...",https://doi.org/10.1109/ICSA51549.2021.00022
3,Holger Knoche and Wilhelm Hasselbring,Continuous {API} Evolution in Heterogenous Ent...,The ability to independently deploy parts of a...,10.1109/ICSA51549.2021.00014,"Meta Data{Research Level{Primary Research}, Ki...",https://doi.org/10.1109/ICSA51549.2021.00014
4,Duc Minh Le and Suhrid Karthik and Marcelo Sch...,Architectural Decay as Predictor of Issue- and...,Architectural decay imposes real costs in term...,10.1109/ICSA51549.2021.00017,"Meta Data{Paper class{Evaluation Research}, Re...",https://doi.org/10.1109/ICSA51549.2021.00017
...,...,...,...,...,...,...
148,"Keim, Jan and Schulz, Sophie and Fuch{\ss}, Do...",Trace {Link} {Recovery} for {Software} {Archit...,Software Architecture Documentation often cons...,10.1007/978-3-030-86044-8_7,"Meta Data{Research Level{Primary Research}, Ki...",
149,"Shabelnyk, Oleksandr and Frangoudis, Pantelis ...",Updating {Service}-{Based} {Software} {Systems...,Contemporary component-based systems often man...,10.1007/978-3-030-86044-8_10,"Meta Data{Paper class{Proposal of Solution}, R...",
150,Stefan Kugele and David Hettler and Jan Peter,Data-Centric Communication and Containerizatio...,Context: The functional interconnection and da...,10.1109/ICSA.2018.00016,"Meta Data{Kind{full}, Research Level{Primary R...",https://doi.org/10.1109/ICSA.2018.00016
151,Banani Roy and Amit Kumar Mondal and Chanchal ...,Towards a Reference Architecture for Cloud-Bas...,The domain of plant genotyping and phenotyping...,10.1109/ICSA.2017.42,"Meta Data{Paper class{Validation Research, Eva...",https://doi.org/10.1109/ICSA.2017.42
