In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pydetex.pipelines as pi
import numpy as np
import re
import nltk
import json
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luttredn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
path = "C:/Users/luttredn/tensorflow_datasets/downloads/extracted/ZIP.ucid_1b3rmCSIoh6VhD4H-cSwcwbeC_export_downloadgu0w3Xxmpkl-6z18MJDCdOnjLAEkOPjguzzOPmwfyto/arxiv-dataset/"
path += "train.txt"
df = pd.DataFrame(columns=["article_id", "article", "abstract"])
with open(path, 'r') as f:
    for line in f:
        data = json.loads(line)
        df.loc[len(df.index)] = {"article_id": data["article_id"], "article": data["article_text"], "abstract": data["abstract_text"]}
df = df.sample(n=1000)

In [None]:
# df = df.sample(n=100)

In [4]:
df.reset_index(inplace=True, drop=True)
print(df.shape)
df.head()

(1000, 3)


Unnamed: 0,article_id,article,abstract
0,cond-mat9902107,[in the hope to get a better understanding of ...,[<S> we applied the recurrent variational appr...
1,1308.2865,"[consider a network @xmath0 , where @xmath1 de...","[<S> in this paper , a hub refers to a non - t..."
2,1208.1580,[the magnetism of fermi gases has always recei...,[<S> magnetic properties of a charged spin-1 b...
3,astro-ph0108136,[this paper tries to understand whether concen...,[<S> we examine the question of how well the p...
4,0805.4263,"[in the past decade and a half , over two hund...",[<S> the perturbation caused by planet - moon ...


In [6]:
df.loc[0, "abstract"]

['<S> we applied the recurrent variational approach to the two - leg hubbard ladder . at half - filling , </S>',
 '<S> our variational ansatz was a generalization of the resonating valence bond state . at finite doping </S>',
 '<S> , hole pairs were allowed to move in the resonating valence bond background . </S>',
 '<S> the results obtained by the recurrent variational approach were compared with results from density matrix renormalization group .    2 </S>']

In [7]:
stop_words = nltk.corpus.stopwords.words('english')
def normalize_document(doc):
    # undo bad sent tokenization
    doc = ' '.join(doc)
    # remove latex
    # print("removing latex")
    # doc = pi.strict_eqn(doc)
    # removing @xmath and @xcite and <S> and </S>
    doc = re.sub(r'@xmath', '', doc)
    doc = re.sub(r'\([^()]*?@xcite.*?\)|@xcite', '', doc)
    doc = re.sub(r'<S>|</S>', '', doc)
    # sent tokenize document
    doc = nltk.sent_tokenize(doc)
    for idx, sent in enumerate(doc):
        # lower case and remove special characters\whitespaces
        # doc = re.sub(r'[^a-zA-Z0-9.\s]', ' ', doc, re.I|re.A)
        sent = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", sent)
        sent = re.sub(r'[ ]{2,}', ' ', sent)
        sent = sent.lower()
        sent = sent.strip()
        # tokenize document
        tokens = nltk.word_tokenize(sent)
        # filter stopwords out of document
        filtered_tokens = [token for token in tokens if token not in stop_words]
        # re-create document from filtered tokens
        sent = ' '.join(filtered_tokens)
        doc[idx] = sent
    
    return doc

def normalize_corpus(corpus):
    normalized_corpus = []
    for doc_idx in tqdm(range(len(corpus))):
        normalized_corpus.append(normalize_document(corpus[doc_idx]))
    return normalized_corpus

In [12]:
cleaned_df = pd.DataFrame(columns=["article_id", "article", "abstract"])
cleaned_df["article_id"] = df["article_id"]
print("Normalizing article text...")
cleaned_df["article"] = normalize_corpus(df["article"])
print("Normalizing abstract text...")
cleaned_df["abstract"] = normalize_corpus(df["abstract"])

Normalizing article text...


  0%|          | 0/1000 [00:00<?, ?it/s]

Normalizing abstract text...


  0%|          | 0/1000 [00:00<?, ?it/s]

In [13]:
cleaned_df.head()

Unnamed: 0,article_id,article,abstract
0,cond-mat9902107,[hope get better understanding strongly intera...,[applied recurrent variational approach two le...
1,1308.2865,[consider network 0 1 denotes set vertices 2 3...,[paper hub refers non terminal vertex degree l...
2,1208.1580,[magnetism fermi gases always received conside...,[magnetic properties charged spin 1 bose gas f...
3,astro-ph0108136,[paper tries understand whether concentrations...,[examine question well physical properties clu...
4,0805.4263,[past decade half two hundred fifty extra sola...,[perturbation caused planet moon binarity time...


In [14]:
cleaned_df.to_csv("arxiv_cleaned.csv", index=False)

In [11]:
print(df.loc[0]["article_id"])
print(normalize_document(df.loc[0]["abstract"]))
normalize_document(df.loc[0]["article"])

cond-mat9902107
['applied recurrent variational approach two leg hubbard ladder', 'half filling variational ansatz generalization resonating valence bond state', 'finite doping hole pairs allowed move resonating valence bond background', 'results obtained recurrent variational approach compared results density matrix renormalization group', '2']


['hope get better understanding strongly interacting systems considerable interest ladder systems',
 'ladder systems proven theoretical wonderland analytically numerically',
 'however much analytic work done ladders weak coupling perturbatively parameter namely analytic methods strong coupling',
 'exact diagonalization monte carlo density matrix renormalization group methods primary tools studying systems strong coupling',
 'methods strengths weaknesses considering lattice sizes temperatures couplings consider',
 'ability fabricate materials ladders theoretical playground',
 'example 0 well separated two leg ladders composed 1',
 'also cuprate like material 2 consists weakly coupled 3 two leg ladders material 4 consists weakly coupled 3 three leg ladders',
 'recently powerful analytic method developed deal strongly coupled quasi one dimensional systems recurrent variational approach rva',
 'method similar spirit wilson numerical renormalization group white density matrix renormalizatio