In [None]:
# Download arXiv paper metadata (title, abstract, publication date, categories, authors)

In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def get_arxiv(topic, number, timeout=30):
    """Query the arXiv export API and return the matching entries as a DataFrame.

    Parameters
    ----------
    topic : str
        Free-text search term; it is sent as ``search_query=all:<topic>``.
    number : int
        Value sent as the ``max_results`` query parameter.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30). Without a timeout a stalled connection would block
        the kernel indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        One row per Atom ``<entry>`` with the columns ``Title``, ``Summary``,
        ``Published Date``, ``Categories`` (list of ``term`` attributes) and
        ``Authors`` (list of author names). A text field whose element is
        absent from the entry is stored as ``None``. When the HTTP status is
        not 200 an error message is printed and ``None`` is returned.
    """
    atom = "{http://www.w3.org/2005/Atom}"
    columns = ["Title", "Summary", "Published Date", "Categories", "Authors"]

    def _child_text(parent, tag):
        # Text of the first matching child, or None when the child is absent,
        # so one incomplete entry does not abort the whole download.
        node = parent.find(atom + tag)
        return node.text if node is not None else None

    # Let requests build the query string so that spaces, '&', '#', '+' ...
    # inside `topic` are percent-encoded instead of corrupting the URL.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={"search_query": "all:%s" % topic, "max_results": int(number)},
        timeout=timeout,
    )

    if response.status_code != 200:
        print("Error: Unable to fetch content from the URL.")
        return None

    # Parse the Atom feed and collect one record per <entry>.
    root = ET.fromstring(response.content)
    records = []
    for entry in root.findall(atom + "entry"):
        records.append({
            "Title": _child_text(entry, "title"),
            "Summary": _child_text(entry, "summary"),
            "Published Date": _child_text(entry, "published"),
            "Categories": [
                category.attrib["term"]
                for category in entry.findall(atom + "category")
            ],
            "Authors": [
                _child_text(author, "name")
                for author in entry.findall(atom + "author")
            ],
        })

    # Passing `columns` keeps the column set/order stable even for zero entries.
    return pd.DataFrame(records, columns=columns)

In [2]:
# Request up to 1000 arXiv entries for the query, report how many rows came
# back, and preview the first few of them.
data = get_arxiv(topic="my favourite topic", number=1000)
print("Dataset size:", len(data))
data.head()

Dataset size: 1000


Unnamed: 0,Title,Summary,Published Date,Categories,Authors
0,Astroparticle physics - A Personal Outlook,"At the request of the organizers, this talk ...",1996-02-15T17:41:36Z,[astro-ph],[John Ellis]
1,Une liste de problèmes,This is a structured compilation of some of ...,2022-12-12T09:34:12Z,"[math.AG, 14-01 14-02 14D10 14E08 14G12]",[Jean-Louis Colliot-Thélène]
2,Noetherianity up to symmetry,These lecture notes for the 2013 CIME/CIRM s...,2013-10-07T09:00:53Z,[math.AG],[Jan Draisma]
3,Multi-boson correlations using wave-packets II,We investigate the analytically solvable pio...,2007-09-24T21:23:17Z,[nucl-th],"[M. I. Nagy, T. Csorgo]"
4,Topological quantum field theories,Following my plenary lecture on ICMP2000 I r...,2000-11-29T00:29:54Z,[hep-th],[Albert Schwarz]


In [3]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import make_pipeline
import nltk
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import ColumnSelector
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [7]:
# pip install spacy

In [8]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS


In [9]:
# train = data["Summary"]

In [17]:
# train.head()

In [13]:
# !python -m spacy download en_core_web_sm


In [14]:
# Load the spaCy pipeline named "en_core_web_sm" and keep it in the global
# `nlp`, which preprocess_text calls for every document.
# The model package must already be installed
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")


In [19]:
# Function to preprocess text
def preprocess_text(text):
    """Reduce a text to its lower-cased content-word lemmas.

    The text is run through the global spaCy pipeline ``nlp``. Only tokens
    tagged as proper noun, noun or adjective are kept; each is replaced by
    its lower-cased lemma, and lemmas found in ``STOP_WORDS`` are dropped.

    Returns the surviving lemmas joined by single spaces (an empty string
    when nothing survives).
    """
    wanted_pos = ("PROPN", "NOUN", "ADJ")
    kept_lemmas = []

    for tok in nlp(text):
        # Skip everything that is not a (proper) noun or adjective.
        if tok.pos_ not in wanted_pos:
            continue
        lemma = tok.lemma_.lower()
        # Stop words are compared against the lower-cased lemma.
        if lemma in STOP_WORDS:
            continue
        kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)

# Apply preprocess_text to every value of the 'Summary' column and store the
# result in a new 'processed_text' column (the column is added to `data` itself).
data['processed_text'] = data['Summary'].apply(preprocess_text)


In [21]:
 # Show each raw 'Summary' next to its 'processed_text' to eyeball the preprocessing result.
 data.loc[:, ["Summary", "processed_text"]]

Unnamed: 0,Summary,processed_text
0,"At the request of the organizers, this talk ...",request organizer talk survey hot topic meetin...
1,This is a structured compilation of some of ...,structured compilation favourite open problem
2,These lecture notes for the 2013 CIME/CIRM s...,lecture note cime cirm summer school combinato...
3,We investigate the analytically solvable pio...,solvable pion laser model generalization arbit...
4,Following my plenary lecture on ICMP2000 I r...,plenary lecture icmp2000 result topic topologi...
...,...,...
995,Due to some results by John P. D Angelo and ...,result john p. d angelo dusty grundmeier cr- m...
996,We study the shifted convolution sum of the ...,convolution sum divisor function d_3 ramanujan...
997,This is the introductory part of my Ph.D the...,introductory ph . d thesis faculty science tec...
998,This is a draft of my brief note on the earl...,draft brief note early history n\bar{n}$ oscil...
