In [1]:
import os, re
import regex, nltk, json
import pandas as pd
import tensorflow_datasets as tfds
import pydetex.pipelines as pi

## Download and setup data
This is going to be janky because tensorflow won't let you customize what features to load

In [2]:
# WARNING: This dataset is 12GB!!!
# Download the dataset. You can't change the features that are loaded though so we need to load it a different way
builder = tfds.builder('scientific_papers')
builder.download_and_prepare()

## Load the Dataset

In [3]:
# Find the path to the downloaded arxiv dataset
path = str(builder.data_path)+'\\..\\..\\..\\downloads\\extracted'
path = os.path.abspath(path)
for root, dirs, files in os.walk(path):
    if 'arxiv-dataset' in dirs:
        path = os.path.join(root, 'arxiv-dataset')
        break
else:
    raise Exception('Could not find the arxiv dataset')
print(path)

C:\Users\luttredn\tensorflow_datasets\downloads\extracted\ZIP.ucid_1b3rmCSIoh6VhD4H-cSwcwbeC_export_downloadgu0w3Xxmpkl-6z18MJDCdOnjLAEkOPjguzzOPmwfyto\arxiv-dataset


In [4]:
data_path = os.path.join(path, 'test.txt')
features = ["article_id", "article_text", "abstract_text"]
df = pd.read_json(data_path, lines=True)[features]
df = df.rename(columns={"article_text": "article", "abstract_text": "abstract"})
df = df.sample(n=500, random_state=1, ignore_index=True)

### Playing with the data

In [5]:
print(df.shape)
df.head()

(500, 3)


Unnamed: 0,article_id,article,abstract
0,gr-qc0101015,[there is considerable current interest in stu...,[<S> in this paper we consider the collision o...
1,0803.1640,[the first data system requiring dark energy (...,[<S> upcoming weak lensing ( wl ) surveys can ...
2,1510.01821,[quantum key distribution ( qkd ) is the first...,[<S> the fully symmetric gaussian tripartite e...
3,1105.2448,[the active galactic nucleus ( agn ) unificati...,[<S> x - ray unabsorbed seyfert 2 galaxies app...
4,1602.04433,[deep neural networks have significantly impro...,[<S> the recent success of deep neural network...


In [6]:
# Pick a specific or random example
# example = df.iloc[442]
example = df.sample(n=1).iloc[0]
print(f"idx: {example.name}")
print(f"id: {example['article_id']}")

idx: 376
id: hep-th0204258


#### Not LaTeX stuff

In [7]:
example["abstract"]

['<S> a systematic approach to the description of gauge invariant charges is presented and applied to the construction of both the static colour charge configuration in qcd and the monopole solution in pure su(2 ) . </S>',
 '<S> the gauge invariant non - abelian monopole offers a new style of order parameter for monopole condensation . </S>']

In [8]:
example["article"][:5]

['it is a fact of life , and one that has been exploited again and again at this meeting , that even gauge invariant objects are more transparent in a specific gauge .',
 'for example , the static interquark potential , which has a well known gauge invariant definition , is often best treated in coulomb gauge ( see , for example , @xcite and more recently  @xcite ) as this gauge is , in some way , closely adapted to that physical system .',
 'usually such a choice of gauge is a simple pragmatic decision based on the simplicity of the resulting calculation .',
 'however , what we d like to argue is that the connection between such an adapted gauge and the correct gauge invariant description , often goes much deeper  @xcite .',
 'we will see that such appropriate gauge fixings , once recognised , can lead to an understanding of the dominant contribution to the gauge invariant description of the relevant physical degrees of freedom . for monopoles and vortices , where gauge invariant form

In [9]:
# convience function to print a set
def print_set(s, joiner=' ', sort=True):
    if sort:
        s = sorted(s)
    if joiner is None:
        return s
    if len(s)!=0 and s[0] is not str:
        s = [str(x) for x in s]
    return joiner.join(s)

In [10]:
# See what kind of non alpha numeric characters are left over
abstract_chars = set(re.findall(r"[^a-zA-Z0-9 ]", ' '.join(example["abstract"])))
article_char = set(re.findall(r"[^a-zA-Z0-9 ]", ' '.join(example["article"])))
total_chars = abstract_chars.union(article_char)
abstract_chars = total_chars - article_char
article_char = total_chars - abstract_chars
print(f"All Characters: {print_set(total_chars)}")
print(f"Abstract Exclusive Characters: {print_set(abstract_chars)}")
print(f"Article Exclusive Characters: {print_set(article_char)}")

All Characters: $ ( ) , - . / : < = > @ [ \ ] _
Abstract Exclusive Characters: / < >
Article Exclusive Characters: $ ( ) , - . : = @ [ \ ] _


In [11]:
# Dataset used tags for various things. Check if there are any tags still left
abstract_tags = set(re.findall(r"<[/]?\w+[/]?>", ' '.join(example["abstract"])))
article_tags = set(re.findall(r"<[/]?\w+[/]?>", ' '.join(example["article"])))
total_tags = abstract_tags.union(article_tags)
abstract_tags = total_tags - article_tags
article_tags = total_tags - abstract_tags
print(f"All Tags: {print_set(total_tags)}")
print(f"Abstract Exclusive Tags: {print_set(abstract_tags)}")
print(f"Article Exclusive Tags: {print_set(article_tags)}")

All Tags: </S> <S>
Abstract Exclusive Tags: </S> <S>
Article Exclusive Tags: 


In [12]:
# Are there any indices with multiple sentences?
abstract_sent_counts = [len(nltk.tokenize.sent_tokenize(sent)) for sent in example["abstract"]]
article_sent_counts = [len(nltk.tokenize.sent_tokenize(sent)) for sent in example["article"]]

print(f"Abstract Indices with multiple sentences: {  sum([1 for count in abstract_sent_counts if count > 1])  }/{  len(example.abstract)  }")
abstract_sent_counts = [(i, abstract_sent_counts.count(i)) for i in set(abstract_sent_counts)]
print("\t" + print_set(abstract_sent_counts))

print(f"Article Indices with multiple sentences: {  sum([1 for count in article_sent_counts if count > 1])  }/{  len(example.article)  }")
article_sent_counts = [(i, article_sent_counts.count(i)) for i in set(article_sent_counts)]
print("\t" + print_set(article_sent_counts))

Abstract Indices with multiple sentences: 2/2
	(2, 2)
Article Indices with multiple sentences: 28/81
	(1, 53) (2, 27) (3, 1)


In [13]:
# The document also has some weird tags
weird_tags = set(re.findall(r"\( \[ [^\(\[\)\]]+ \] \)", ' '.join(example["article"])))
print(print_set(weird_tags))

( [ abeliang ] ) ( [ ced ] ) ( [ df ] ) ( [ dm ] ) ( [ etp ] ) ( [ gid ] ) ( [ qedmin ] ) ( [ r1b ] ) ( [ sd ] ) ( [ su2 dm ] )


In [14]:
# There are these other weird tags, but I think I'm going to try keeping them
weird_tags = set(re.findall(r"@x[a-z]+", ' '.join(example["article"])))
print(print_set(weird_tags))

@xcite @xmath


#### Figuring out how to deal with LaTex
The dataset has most latex parsed out but there is still a decent amount remaining... and due to the poor attempt made previously, it's really hard to parse out the stragglers

In [15]:
# Removing weird tags for now
example_article = ' '.join(example.article)
example_article = re.sub(r"\( \[ [^\(\[\)\]]+ \] \)", "", example_article)
example_article = re.sub(r"\n", "", example_article)

In [16]:
# Find groups of LaTeX (if any \ is within 100 characters of another \ then it is a group)
math_groups = [x.group() for x in re.finditer(r"([^\\]{0,100}\\[^\\]{0,100})+", example_article)]
print("\n\n".join(math_groups))

lence of the construction . ] and its equal - time commutator with the potential is @xmath16=f_i(x)m\,.\ ] ] so we can interpret @xmath17 as a monopole creation operator for the pure abelian theory . alth


In [17]:
# A common pattern I want to remove is \blah \blah blah ] ]
math_groups = [regex.sub(r"(\\((?>[^\\\]]+|(?R))*+)])", "", math_group) for math_group in math_groups]
print("\n\n".join(math_groups))

lence of the construction . ] and its equal - time commutator with the potential is @xmath16=f_i(x)m so we can interpret @xmath17 as a monopole creation operator for the pure abelian theory . alth


In [18]:
# Try removing the latex expressions (this is really slow and annoying to work with after)
# latex_removed = [pi.strict(sent) for sent in math_groups]

# Alternatively, we can try removing words that start with a backslash and have curly brace groups afterwards
latex_removed = [regex.sub(r"((?>\\[^\s{]*+)({(?>[^{}]+|(?2))*+})*)", "", sent) for sent in math_groups]

print("\n\n".join(latex_removed))

lence of the construction . ] and its equal - time commutator with the potential is @xmath16=f_i(x)m so we can interpret @xmath17 as a monopole creation operator for the pure abelian theory . alth


In [19]:
# Remove the groups of non standard characters
reg_exp = r"((?![a-zA-Z] )(?:[a-zA-Z ]{0,2}[^a-zA-Z0-9.,;:@\s][a-zA-Z ]{0,2})+(?<! [a-zA-Z]))"
special_chars_removed = [re.sub(reg_exp, " ", sent) for sent in latex_removed]
print("\n\n".join(special_chars_removed))

lence of the construction . and its equal time commutator with the potential is @xmath16 so we can interpret @xmath17 as a monopole creation operator for the pure abelian theory . alth


In [20]:
compress_spaces = [re.sub(r"\s+", " ", sent) for sent in special_chars_removed]
print("\n\n".join(compress_spaces))

lence of the construction . and its equal time commutator with the potential is @xmath16 so we can interpret @xmath17 as a monopole creation operator for the pure abelian theory . alth


This was a pain in the ass but I think it works pretty well

#### The whole latex removal pipeline

In [22]:
# Removing weird tags for now
example_article = ' '.join(example.article)
example_article = re.sub(r"\( \[ [^\(\[\)\]]+ \] \)", "", example_article)
example_article = re.sub(r"\n", "", example_article)

# REMOVING LATEX
# Remove \blah \blah blah ] ]
example_article = regex.sub(r"(\\((?>[^\\\]]+|(?R))*+)])", "", example_article)

# Remove words that start with a backslash and have curly brace groups
example_article = regex.sub(r"((?>\\[^\s{]*+)({(?>[^{}]+|(?2))*+})*)", "", example_article)

# Remove the groups of non standard characters
reg_exp = r"((?![a-zA-Z] )(?:[a-zA-Z ]{0,2}[^a-zA-Z0-9.,:@\s][a-zA-Z ]{0,2})+(?<! [a-zA-Z]))"
example_article = re.sub(reg_exp, " ", example_article)

# Compress whitespace
example_article = re.sub(r"\s+", " ", example_article)

# Sentence tokenize
example_article = nltk.tokenize.sent_tokenize(example_article)

# Remove sentences that are too short (less than 5 words, excluding punctuation and numbers)
example_article = list(filter(lambda x: len(list(filter(lambda x: x.isalpha(), x.split()))) > 5, example_article))

print('\n'.join(example_article))

it is a fact of life , and one that has been exploited again and again at this meeting , that even gauge invariant objects are more transparent in a specific gauge .
for example , the static interquark potential , which has a well known gauge invariant definition , is often best treated in coulomb gauge see , for example , @xcite and more recently @xcite as this gauge is , in some way , closely adapted to that physical system .
usually such a choice of gauge is a simple pragmatic decision based on the simplicity of the resulting calculation .
however , what we d like to argue is that the connection between such an adapted gauge and the correct gauge invariant description , often goes much deeper @xcite .
we will see that such appropriate gauge fixings , once recognised , can lead to an understanding of the dominant contribution to the gauge invariant description of the relevant physical degrees of freedom .
for monopoles and vortices , where gauge invariant formulation do not yet exist

## Cleaning the Data

#### Cleaning

In [23]:
def remove_latex(doc):
    # Remove \blah \blah blah ] ]
    doc = regex.sub(r"(\\((?>[^\\\]]+|(?R))*+)])", "", doc)

    # Remove words that start with a backslash and have curly brace groups
    doc = regex.sub(r"((?>\\[^\s{]*+)({(?>[^{}]+|(?2))*+})*)", "", doc)

    # Remove the groups of non standard characters
    reg_exp = r"((?![a-zA-Z] )(?:[a-zA-Z ]{0,2}[^a-zA-Z0-9.,:@\s][a-zA-Z ]{0,2})+(?<! [a-zA-Z]))"
    doc = re.sub(reg_exp, " ", doc)

    # Compress whitespace
    doc = re.sub(r"\s+", " ", doc)
    return doc

def normalize_document(doc):
    # undo bad sent tokenization
    doc = ' '.join(doc)

    # remove \n
    doc = re.sub(r'\n', ' ', doc)

    # remove sentence tags from abstract
    doc = re.sub(r'<S>|</S>', '', doc)

    # remove weird tags: ( [ blah ] )
    doc = re.sub(r"\( \[ [^\(\[\)\]]+ \] \)", "", doc)

    # remove latex
    doc = remove_latex(doc)

    # Sentence tokenize
    doc = nltk.tokenize.sent_tokenize(doc)

    # Strip each sentence
    doc = [sent.strip() for sent in doc]

    # Remove sentences that are too short (less than 5 words, excluding punctuation and numbers)
    doc = list(filter(lambda x: len(list(filter(lambda x: x.isalpha(), x.split()))) > 5, doc))
    return doc

def normalize_corpus(corpus):
    return corpus.map(lambda x: normalize_document(x))

In [24]:
cleaned_df = pd.DataFrame(columns=["article_id", "article", "abstract"])
cleaned_df["article_id"] = df["article_id"]
print("Normalizing article text...")
cleaned_df["article"] = normalize_corpus(df["article"])
print("Normalizing abstract text...")
cleaned_df["abstract"] = normalize_corpus(df["abstract"])

Normalizing article text...
Normalizing abstract text...


#### Looking at the results

In [25]:
cleaned_df.head()

Unnamed: 0,article_id,article,abstract
0,gr-qc0101015,[there is considerable current interest in stu...,[in this paper we consider the collision of sp...
1,0803.1640,[the first data system requiring dark energy c...,[upcoming weak lensing surveys can be used to ...
2,1510.01821,[quantum key distribution qkd is the first mat...,[the fully symmetric gaussian tripartite entan...
3,1105.2448,[the active galactic nucleus agn unification s...,[x ray unabsorbed seyfert 2 galaxies appear to...
4,1602.04433,[deep neural networks have significantly impro...,[the recent success of deep neural networks re...


In [26]:
# Pick a specific or random example
# example = cleaned_df.iloc[442]
example = cleaned_df.sample(n=1).iloc[0]
print(f"idx: {example.name}")
print(f"id: {example['article_id']}")

idx: 53
id: 1702.06415


In [27]:
# The abstract
print('\n'.join(example.abstract))

using the experimental data from the alice program on the centrality dependence of the transverse momentum @xmath0 spectra in collisions at @xmath1 tev , we show that the double tsallis distribution and the generalized fokker plank solution can not describe the spectra of pions , kaons and protons from central to peripheral collisions in the entire @xmath0 region , simultaneously .
hence , a new two component distribution , which is a hydrodynamic extension of the generalized fp solution accounting for the collective motion effect in heavy ion collisions , is proposed in order to reproduce all the particle spectra .
our results suggest that the particle production dynamics may be different for different particles , especially at very low @xmath0 region .


In [28]:
# The article
print('\n'.join(example.article))

the advent of a new generation of high energy collider experiments , such as relativistic heavy ion collider rhic and large hadron collider lhc , has launched a new era in the study of the hadron production .
plenty of pp , pa and aa collisions data have been accumulated , which allow us to study the nature of the final particle production .
the transverse momentum spectra carry important information about the dynamics of particle production and evolution process of interacting system formed in high energy nuclear collisions .
in the past decade , the attempt to understand the particle production mechanism by different theoretical and phenomenological approaches has been a great success @xcite .
generally , the theoretical investigation of hadron production in heavy ion collisions is operated into different camps , characterized by the regions of transverse momenta @xmath0 of the produced hadrons .
at low @xmath0 statistical hadronization and hydrodynamical models are generally adopted

#### Saving the cleaned data

In [29]:
cleaned_df.to_csv("data/arxiv_cleaned.csv", index=False)