In [11]:
import os, re
import regex, nltk, json
import pandas as pd
import tensorflow_datasets as tfds
import pydetex.pipelines as pi
import numpy as np

## Download and setup data
This is going to be janky because tensorflow won't let you customize what features to load

In [2]:
# WARNING: This dataset is 12GB!!!
# Download the dataset. You can't change the features that are loaded though so we need to load it a different way
builder = tfds.builder('scientific_papers')
builder.download_and_prepare()

## Load the Dataset

In [3]:
# Find the path to the downloaded arxiv dataset
path = str(builder.data_path)+'\\..\\..\\..\\downloads\\extracted'
path = os.path.abspath(path)
for root, dirs, files in os.walk(path):
    if 'arxiv-dataset' in dirs:
        path = os.path.join(root, 'arxiv-dataset')
        break
else:
    raise Exception('Could not find the arxiv dataset')
print(path)

C:\Users\luttredn\tensorflow_datasets\downloads\extracted\ZIP.ucid_1b3rmCSIoh6VhD4H-cSwcwbeC_export_downloadgu0w3Xxmpkl-6z18MJDCdOnjLAEkOPjguzzOPmwfyto\arxiv-dataset


In [4]:
data_path = os.path.join(path, 'test.txt')
features = ["article_id", "article_text", "abstract_text"]
df = pd.read_json(data_path, lines=True)[features]
df = df.rename(columns={"article_text": "article", "abstract_text": "abstract"})
df = df.sample(n=500, random_state=1, ignore_index=True)

### Playing with the data

In [5]:
print(df.shape)
df.head()

(500, 3)


Unnamed: 0,article_id,article,abstract
0,gr-qc0101015,[there is considerable current interest in stu...,[<S> in this paper we consider the collision o...
1,0803.1640,[the first data system requiring dark energy (...,[<S> upcoming weak lensing ( wl ) surveys can ...
2,1510.01821,[quantum key distribution ( qkd ) is the first...,[<S> the fully symmetric gaussian tripartite e...
3,1105.2448,[the active galactic nucleus ( agn ) unificati...,[<S> x - ray unabsorbed seyfert 2 galaxies app...
4,1602.04433,[deep neural networks have significantly impro...,[<S> the recent success of deep neural network...


In [31]:
# Pick a specific or random example
example = df.iloc[442]
# example = df.sample(n=1).iloc[0]
print(f"idx: {example.name}")
print(f"id: {example['article_id']}")

idx: 442
id: 1502.07553


#### Not LaTeX stuff

In [32]:
example["abstract"]

['<S> the opening of a gap in single - layer graphene is often ascribed to the breaking of the equivalence between the two carbon sublattices . </S>',
 "<S> we show by angle - resolved photoemission spectroscopy that ir- and na - modified graphene grown on the ir(111 ) surface presents a very large unconventional gap that can be described in terms of a phenomenological `` massless '' dirac model . </S>",
 '<S> we discuss the consequences and differences of this model in comparison of the standard massive gap model , and we investigate the conditions under which such anomalous gap can arise from a spontaneous symmetry breaking . </S>',
 '<S> keywords : graphene , bandgap , dirac cone , angle - resolved - photoemission </S>']

In [33]:
example["article"][:5]

['the isolation of single - layer and few - layer graphene has triggered a huge burst of interest , mainly motivated by the observation of unconventional electronic properties , which stem from the dirac - like low - energy electronic structure of graphene characterized by a gapless conical dispersion [ ] .',
 'the huge electronic mobility of free - standing graphene originates from its chiral properties , tightly linked with the lack of electron backscattering phenomena near the fermi level [ ] .',
 'a drawback of this characteristic band structure is the absence of an energy gap between the dirac cones , which would be highly desirable for the exploitation of graphene in device applications .',
 'so far , however , the effective employment of graphene - based materials in low - energy electronics has been hindered by the difficulty of opening a bandgap without affecting the electronic mobility .',
 'understanding the fundamental mechanisms responsible of gap opening in graphene is th

In [34]:
# convience function to print a set
def print_set(s, joiner=' ', sort=True):
    if sort:
        s = sorted(s)
    if joiner is None:
        return s
    if len(s)!=0 and s[0] is not str:
        s = [str(x) for x in s]
    return joiner.join(s)

In [35]:
# See what kind of non alpha numeric characters are left over
abstract_chars = set(re.findall(r"[^a-zA-Z0-9 ]", ' '.join(example["abstract"])))
article_char = set(re.findall(r"[^a-zA-Z0-9 ]", ' '.join(example["article"])))
total_chars = abstract_chars.union(article_char)
abstract_chars = total_chars - article_char
article_char = total_chars - abstract_chars
print(f"All Characters: {print_set(total_chars)}")
print(f"Abstract Exclusive Characters: {print_set(abstract_chars)}")
print(f"Article Exclusive Characters: {print_set(article_char)}")

All Characters: $ & ' ( ) * + , - . / : ; < = > @ [ \ ] _ ` { | }
Abstract Exclusive Characters: < >
Article Exclusive Characters: $ & ' ( ) * + , - . / : ; = @ [ \ ] _ ` { | }


In [36]:
# Dataset used tags for various things. Check if there are any tags still left
abstract_tags = set(re.findall(r"<[/]?\w+[/]?>", ' '.join(example["abstract"])))
article_tags = set(re.findall(r"<[/]?\w+[/]?>", ' '.join(example["article"])))
total_tags = abstract_tags.union(article_tags)
abstract_tags = total_tags - article_tags
article_tags = total_tags - abstract_tags
print(f"All Tags: {print_set(total_tags)}")
print(f"Abstract Exclusive Tags: {print_set(abstract_tags)}")
print(f"Article Exclusive Tags: {print_set(article_tags)}")

All Tags: </S> <S>
Abstract Exclusive Tags: </S> <S>
Article Exclusive Tags: 


In [37]:
# Are there any indices with multiple sentences?
abstract_sent_counts = [len(nltk.tokenize.sent_tokenize(sent)) for sent in example["abstract"]]
article_sent_counts = [len(nltk.tokenize.sent_tokenize(sent)) for sent in example["article"]]

print(f"Abstract Indices with multiple sentences: {  sum([1 for count in abstract_sent_counts if count > 1])  }/{  len(example.abstract)  }")
abstract_sent_counts = [(i, abstract_sent_counts.count(i)) for i in set(abstract_sent_counts)]
print("\t" + print_set(abstract_sent_counts))

print(f"Article Indices with multiple sentences: {  sum([1 for count in article_sent_counts if count > 1])  }/{  len(example.article)  }")
article_sent_counts = [(i, article_sent_counts.count(i)) for i in set(article_sent_counts)]
print("\t" + print_set(article_sent_counts))

Abstract Indices with multiple sentences: 3/4
	(1, 1) (2, 3)
Article Indices with multiple sentences: 60/158
	(1, 98) (2, 53) (3, 6) (4, 1)


In [38]:
# The document also has some weird tags
weird_tags = set(re.findall(r"\( \[ [^\(\[\)\]]+ \] \)", ' '.join(example["article"])))
print(print_set(weird_tags))

( [ coulomb ] ) ( [ emish ] ) ( [ fock ] ) ( [ gap ] ) ( [ h0 ] ) ( [ heff ] ) ( [ hgap ] ) ( [ inst ] ) ( [ lastgap ] ) ( [ less ] )


In [39]:
# There are these other weird tags, but I think I'm going to try keeping them
weird_tags = set(re.findall(r"@x[a-z]+", ' '.join(example["article"])))
print(print_set(weird_tags))

@xmath


#### Figuring out how to deal with LaTex
The dataset has most latex parsed out but there is still a decent amount remaining... and due to the poor attempt made previously, it's really hard to parse out the stragglers

In [40]:
# Removing weird tags for now
example_article = ' '.join(example.article)
example_article = re.sub(r"\( \[ [^\(\[\)\]]+ \] \)", "", example_article)
example_article = re.sub(r"\n", "", example_article)

In [41]:
# Find groups of LaTeX (if any \ is within 100 characters of another \ then it is a group)
math_groups = [x.group() for x in re.finditer(r"([^\\]{0,100}\\[^\\]{0,100})+", example_article)]
print("\n\n".join(math_groups))

ces a and b. this corresponds to include a @xmath0 term in the dirac - like hamiltonian : @xmath1 + \frac{\delta}{2}\hat{\sigma}_z , \label{hgap}\end{aligned}\ ] ] where @xmath2 is the dirac velocity , @xmath3 is the momentum relative to the k point , and whe

ies . as mentioned above , the characteristic energy - momentum dispersion results to be @xmath34 . \label{eless}\end{aligned}\ ] ] note that this models accounts in a simple way for the anomalous features described in the intr

xmath73 , @xmath74 ) , @xmath75 is the spinor in the sublattice basis @xmath76 , and where @xmath77.\end{aligned}\ ] ]    we assume a long - range coulomb interaction which can we written in the momentum space as : @xmath78 where @xmath79 , and where @xmath80/\epsilon_0 \kappa   |{\bf q}|$ ] . the term @xmath81 $ ] in @xmath82 takes into account the subtraction of the positive cha

.  , or , equivalently , neglecting @xmath97 in the right side term of eq .  , we obtain : @xmath98 \label{alpha}\end{aligned

In [42]:
# A common pattern I want to remove is \blah \blah blah ] ]
math_groups = [regex.sub(r"(\\((?>[^\\\]]+|(?R))*+)])", "", math_group) for math_group in math_groups]
print("\n\n".join(math_groups))

ces a and b. this corresponds to include a @xmath0 term in the dirac - like hamiltonian : @xmath1 + \frac{\delta}{2}\hat{\sigma}_z , \label{hgap} where @xmath2 is the dirac velocity , @xmath3 is the momentum relative to the k point , and whe

ies . as mentioned above , the characteristic energy - momentum dispersion results to be @xmath34 . \label{eless} note that this models accounts in a simple way for the anomalous features described in the intr

xmath73 , @xmath74 ) , @xmath75 is the spinor in the sublattice basis @xmath76 , and where @xmath77.    we assume a long - range coulomb interaction which can we written in the momentum space as : @xmath78 where @xmath79 , and where @xmath80/\epsilon_0  in @xmath82 takes into account the subtraction of the positive cha

.  , or , equivalently , neglecting @xmath97 in the right side term of eq .  , we obtain : @xmath98 \label{alpha} where @xmath99 is the coupling constant for suspended graphene , @xmath100 is a high - momentum

der phase tra

In [43]:
# Try removing the latex expressions (this is really slow and annoying to work with after)
# latex_removed = [pi.strict(sent) for sent in math_groups]

# Alternatively, we can try removing words that start with a backslash and have curly brace groups afterwards
latex_removed = [regex.sub(r"((?>\\[^\s{]*+)({(?>[^{}]+|(?2))*+})*)", "", sent) for sent in math_groups]

print("\n\n".join(latex_removed))

ces a and b. this corresponds to include a @xmath0 term in the dirac - like hamiltonian : @xmath1 + _z ,  where @xmath2 is the dirac velocity , @xmath3 is the momentum relative to the k point , and whe

ies . as mentioned above , the characteristic energy - momentum dispersion results to be @xmath34 .  note that this models accounts in a simple way for the anomalous features described in the intr

xmath73 , @xmath74 ) , @xmath75 is the spinor in the sublattice basis @xmath76 , and where @xmath77.    we assume a long - range coulomb interaction which can we written in the momentum space as : @xmath78 where @xmath79 , and where @xmath80/  in @xmath82 takes into account the subtraction of the positive cha

.  , or , equivalently , neglecting @xmath97 in the right side term of eq .  , we obtain : @xmath98  where @xmath99 is the coupling constant for suspended graphene , @xmath100 is a high - momentum

der phase transition . in order to address this possibility , we rewrite eq .  as : @xmat

In [44]:
# Remove the groups of non standard characters
reg_exp = r"((?![a-zA-Z] )(?:[a-zA-Z ]{0,2}[^a-zA-Z0-9.,;:@\s][a-zA-Z ]{0,2})+(?<! [a-zA-Z]))"
special_chars_removed = [re.sub(reg_exp, " ", sent) for sent in latex_removed]
print("\n\n".join(special_chars_removed))

ces a and b. this corresponds to include a @xmath0 term in the dirac like hamiltonian : @xmath1 ,  where @xmath2 is the dirac velocity , @xmath3 is the momentum relative to the k point , and whe

ies . as mentioned above , the characteristic energy momentum dispersion results to be @xmath34 .  note that this models accounts in a simple way for the anomalous features described in the intr

xmath73 , @xmath74 , @xmath75 is the spinor in the sublattice basis @xmath76 , and where @xmath77.    we assume a long range coulomb interaction which can we written in the momentum space as : @xmath78 where @xmath79 , and where @xmath80 in @xmath82 takes into account the subtraction of the positive cha

.  , or , equivalently , neglecting @xmath97 in the right side term of eq .  , we obtain : @xmath98  where @xmath99 is the coupling constant for suspended graphene , @xmath100 is a high momentum

der phase transition . in order to address this possibility , we rewrite eq .  as : @xmath105 0 ,  is the 

In [45]:
compress_spaces = [re.sub(r"\s+", " ", sent) for sent in special_chars_removed]
print("\n\n".join(compress_spaces))

ces a and b. this corresponds to include a @xmath0 term in the dirac like hamiltonian : @xmath1 , where @xmath2 is the dirac velocity , @xmath3 is the momentum relative to the k point , and whe

ies . as mentioned above , the characteristic energy momentum dispersion results to be @xmath34 . note that this models accounts in a simple way for the anomalous features described in the intr

xmath73 , @xmath74 , @xmath75 is the spinor in the sublattice basis @xmath76 , and where @xmath77. we assume a long range coulomb interaction which can we written in the momentum space as : @xmath78 where @xmath79 , and where @xmath80 in @xmath82 takes into account the subtraction of the positive cha

. , or , equivalently , neglecting @xmath97 in the right side term of eq . , we obtain : @xmath98 where @xmath99 is the coupling constant for suspended graphene , @xmath100 is a high momentum

der phase transition . in order to address this possibility , we rewrite eq . as : @xmath105 0 , is the functional

This was a pain in the ass but I think it works pretty well

#### The whole latex removal pipeline

In [22]:
# Removing weird tags for now
example_article = ' '.join(example.article)
example_article = re.sub(r"\( \[ [^\(\[\)\]]+ \] \)", "", example_article)
example_article = re.sub(r"\n", "", example_article)

# REMOVING LATEX
# Remove \blah \blah blah ] ]
example_article = regex.sub(r"(\\((?>[^\\\]]+|(?R))*+)])", "", example_article)

# Remove words that start with a backslash and have curly brace groups
example_article = regex.sub(r"((?>\\[^\s{]*+)({(?>[^{}]+|(?2))*+})*)", "", example_article)

# Remove the groups of non standard characters
reg_exp = r"((?![a-zA-Z] )(?:[a-zA-Z ]{0,2}[^a-zA-Z0-9.,:@\s][a-zA-Z ]{0,2})+(?<! [a-zA-Z]))"
example_article = re.sub(reg_exp, " ", example_article)

# Compress whitespace
example_article = re.sub(r"\s+", " ", example_article)

# Sentence tokenize
example_article = nltk.tokenize.sent_tokenize(example_article)

# Remove sentences that are too short (less than 5 words, excluding punctuation and numbers)
example_article = list(filter(lambda x: len(list(filter(lambda x: x.isalpha(), x.split()))) > 5, example_article))

print('\n'.join(example_article))

it is a fact of life , and one that has been exploited again and again at this meeting , that even gauge invariant objects are more transparent in a specific gauge .
for example , the static interquark potential , which has a well known gauge invariant definition , is often best treated in coulomb gauge see , for example , @xcite and more recently @xcite as this gauge is , in some way , closely adapted to that physical system .
usually such a choice of gauge is a simple pragmatic decision based on the simplicity of the resulting calculation .
however , what we d like to argue is that the connection between such an adapted gauge and the correct gauge invariant description , often goes much deeper @xcite .
we will see that such appropriate gauge fixings , once recognised , can lead to an understanding of the dominant contribution to the gauge invariant description of the relevant physical degrees of freedom .
for monopoles and vortices , where gauge invariant formulation do not yet exist

## Cleaning the Data

#### Cleaning

In [5]:
def remove_latex(doc):
    # Remove \blah \blah blah ] ]
    doc = regex.sub(r"(\\((?>[^\\\]]+|(?R))*+)])", "", doc)

    # Remove words that start with a backslash and have curly brace groups
    doc = regex.sub(r"((?>\\[^\s{]*+)({(?>[^{}]+|(?2))*+})*)", "", doc)

    # Remove the groups of non standard characters
    reg_exp = r"((?![a-zA-Z] )(?:[a-zA-Z ]{0,2}[^a-zA-Z0-9.,:@\s][a-zA-Z ]{0,2})+(?<! [a-zA-Z]))"
    doc = re.sub(reg_exp, " ", doc)

    # Compress whitespace
    doc = re.sub(r"\s+", " ", doc)
    return doc

def normalize_document(doc):
    # undo bad sent tokenization
    doc = ' '.join(doc)

    # remove \n
    doc = re.sub(r'\n', ' ', doc)

    # remove sentence tags from abstract
    doc = re.sub(r'<S>|</S>', '', doc)

    # remove weird tags: ( [ blah ] )
    doc = re.sub(r"\( \[ [^\(\[\)\]]+ \] \)", "", doc)

    # remove latex
    doc = remove_latex(doc)

    # Sentence tokenize
    doc = nltk.tokenize.sent_tokenize(doc)

    # Strip each sentence
    doc = [sent.strip() for sent in doc]

    # Remove sentences that are too short (less than 5 words, excluding punctuation and numbers)
    doc = list(filter(lambda x: len(list(filter(lambda x: x.isalpha(), x.split()))) > 5, doc))
    return doc

def normalize_corpus(corpus):
    return corpus.map(lambda x: normalize_document(x))

In [6]:
cleaned_df = pd.DataFrame(columns=["article_id", "article", "abstract"])
cleaned_df["article_id"] = df["article_id"]
print("Normalizing article text...")
cleaned_df["article"] = normalize_corpus(df["article"])
print("Normalizing abstract text...")
cleaned_df["abstract"] = normalize_corpus(df["abstract"])

Normalizing article text...
Normalizing abstract text...


#### Looking at the results

In [7]:
cleaned_df.head()

Unnamed: 0,article_id,article,abstract
0,gr-qc0101015,[there is considerable current interest in stu...,[in this paper we consider the collision of sp...
1,0803.1640,[the first data system requiring dark energy c...,[upcoming weak lensing surveys can be used to ...
2,1510.01821,[quantum key distribution qkd is the first mat...,[the fully symmetric gaussian tripartite entan...
3,1105.2448,[the active galactic nucleus agn unification s...,[x ray unabsorbed seyfert 2 galaxies appear to...
4,1602.04433,[deep neural networks have significantly impro...,[the recent success of deep neural networks re...


In [8]:
# Pick a specific or random example
# example = cleaned_df.iloc[442]
example = cleaned_df.sample(n=1).iloc[0]
print(f"idx: {example.name}")
print(f"id: {example['article_id']}")

idx: 203
id: 1007.2979


In [9]:
# The abstract
print('\n'.join(example.abstract))

recent results are reviewed on galaxy dynamics , bar evolution , destruction and re formation , cold gas accretion , gas radial flows and agn fueling , minor mergers .
some problems of galaxy evolution are discussed in particular , exchange of angular momentum , radial migration through resonant scattering , and consequences on abundance gradients , the frequency of bulgeless galaxies , and the relative role of secular evolution and hierarchical formation .


In [10]:
# The article
print('\n'.join(example.article))

there are at least three main scenarios invoked for galaxy formation : the first one , inspired by eggen et al 1962 is the monolithic collapse , where the initial gas clouds collapse gravitationally while forming quickly stars before the end of their collapse , so that they end up with a stellar spheroid .
disks must then form later on , through slower gas accretion .
the second is the hierarchical scenario , where the first systems to form are disks the star formation time scale being longer than the collapse time .
then the interaction and merger of two disks lead to the formation of a spheroid .
the third scenario , developped in this talk , considers that disks form first , as in the second scenario .
however , disks then may evolve without any merger with other galaxies .
their own internal evolution could also produce a spheroid at the center , through secular evolution .
this scenario assumes external gas accretion from filaments of the cosmic web e.g.
spontaneously , a disk evo

In [27]:
avg_abstract_len = df['abstract'].apply(lambda x: sum([len(words.split(" ")) for words in x])).mean()
avg_abstract_filtered_len = df['abstract'].apply(lambda x: sum([len(list(filter(lambda word: word.isalpha(), words.split(" ")))) for words in x])).mean()
print(f"Average Abstract Length: {avg_abstract_len}")
print(f"Average Abstract Filtered Length: {avg_abstract_filtered_len}")

avg_article_len = df['article'].apply(lambda x: sum([len(words.split(" ")) for words in x])).mean()
avg_article_filtered_len = df['article'].apply(lambda x: sum([len(list(filter(lambda word: word.isalpha(), words.split(" ")))) for words in x])).mean()
# avg_article_filtered_len = df['article'].apply(lambda x: np.mean([len(filter(lambda word: word.isalpha(), words.split(" "))) for words in x])).mean()
print(f"Average Article Length: {avg_article_len}")
print(f"Average Article Filtered Length: {avg_article_filtered_len}")

Average Abstract Length: 176.496
Average Abstract Filtered Length: 137.264
Average Article Length: 5990.734
Average Article Filtered Length: 4284.148


#### Saving the cleaned data

In [52]:
cleaned_df.to_csv("data/arxiv_cleaned.csv", index=False)