In [215]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import re
import os

In [216]:
# Extract metadata from the raw text files

metadata = []
keys = ['paper', 'title', 'author', 'journal', 'year', 'abstract']

parent_folder_path = './abstracts'

# Iterate over all folders
for root, dirs, files in os.walk(parent_folder_path):

    subfolder = os.path.relpath(root, parent_folder_path)
    
    # Iterate over all files
    for filename in files:
        file_path = os.path.join(root, filename)
        # Check if it is a regular file
        if os.path.isfile(file_path):
            # Read file
            with open(file_path, encoding='utf-8') as f:
                block = f.readlines()

                # Dictionary to store extracted data
                data = dict.fromkeys(keys)

                # Parse each line in the block to extract data
                for line in block:
                    x = re.search(r"Paper: hep-th/(.+)\n", line)
                    if x and not data['paper']: data['paper'] = x.group(1)
                    
                    x = re.search(r"Title: (.+)\n", line)
                    if x and not data['title']: data['title'] = x.group(1)

                    x = re.search(r"Authors: (.+)\n|Author: (.+)\n", line)
                    if x and not data['author']: data['author'] = x.group(1)

                    x = re.search(r"Journal-ref: (.+)\n", line)
                    if x and not data['journal']: data['journal'] = x.group(1)

                # Add year data
                data['year'] = subfolder

                # Extract abstract that spans over multiple lines
                first_index = block.index("\\\\\n")
                second_index = block.index("\\\\\n", first_index+1)
                third_index = block.index("\\\\\n", second_index+1)

                filtered = block[second_index+1:third_index]
                merged = ''
                for l in filtered: merged = merged + l
                data['abstract'] = merged.replace('\n', ' ').strip()

                metadata.append(data)

            f.close()

In [218]:
# Convert into dataframe
df = pd.DataFrame(metadata, columns=keys)

In [219]:
# Extract individual author names from string into a list
def get_authors(authors):
    if authors:
        author_list = authors.replace(' and ',',').split(',')
        return [author.strip() for author in author_list]
    return None

df['author'] = df['author'].apply(get_authors)

In [220]:
# Extract journal name from journal column

def get_journal_name(journal):
    if journal:
        pattern = r'(.+\.)'
        match = re.match(pattern, journal)
        if match:
            journal_name = match.group(1).replace(' ', '')
            return journal_name
    return None

    df['journal'] = df['journal'].apply(get_journal_name)

In [221]:
# Data is ready
df.head(10)

Unnamed: 0,paper,title,author,journal,year,abstract
0,9201001,Combinatorics of the Modular Group II: the Kon...,"[C. Itzykson, J.-B. Zuber]",Int.J.Mod.Phys. A7 (1992) 5661-5705,1992,We study algebraic aspects of Kontsevich integ...
1,9201002,Inomogeneous Quantum Groups as Symmetries of P...,"[F.Bonechi, E.Celeghini, R.Giachetti, E.Sorace...",Phys.Rev.Lett. 68 (1992) 3718-3720,1992,The quantum deformed (1+1) Poincare' algebra i...
2,9201003,"Intersection Theory, Integrable Hierarchies an...",[Robbert Dijkgraaf],,1992,In these lecture notes we review the various r...
3,9201004,The Heterotic Green-Schwarz Superstring on an ...,[Nathan Berkovits],Nucl.Phys. B379 (1992) 96-120,1992,By defining the heterotic Green-Schwarz supers...
4,9201005,Ward Identities in Two-Dimensional String Theory,[Igor R. Klebanov],Mod. Phys. Lett. A7 (1992) 723-732,1992,I study the Ward identities of the $w_\infty$ ...
5,9201006,On Symmetries of Some Massless 2D Field Theories,[Denis Bernard],Phys.Lett. B279 (1992) 78-86,1992,We describe few aspects of the quantum symmetr...
6,9201007,Static Domain Walls in N=1 Supergravity,"[Mirjam Cvetic, Stephen Griffies, Soo-Jong Rey]",Nucl.Phys. B381 (1992) 301-328,1992,We study supersymmetric domain walls in N=1 su...
7,9201008,Coulomb Gas Representations and Screening Oper...,[Satoshi Matsuda],Phys.Lett. B282 (1992) 56-62,1992,The Coulomb gas representations are presented ...
8,9201009,Large-Small Equivalence in String Theory,[Eva Silverstein],Phys. Lett. B278 (1992) 111-118,1992,The simplest toroidally compactified string th...
9,9201010,From Virasoro Constraints in Kontsevich's Mode...,"[A.Marshakov, A.Mironov, A.Morozov]",Mod. Phys. Lett. A7 (1992) 1345-1360,1992,The Ward identities in Kontsevich-like 1-matri...


In [222]:
# Save as csv
df.to_csv('data/metadata.csv')