In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Download the NLTK resources this notebook relies on for tokenization,
# stopword removal and lemmatization.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Target URL (arXiv listing used as the example source)
URL = "https://arxiv.org/list/cs.LG/recent"
headers = {"User-Agent": "Mozilla/5.0"}
# Seconds to wait for the server; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Request the page from the website
response = requests.get(URL, headers=headers, timeout=REQUEST_TIMEOUT)
if response.status_code != 200:
    # Raise instead of calling exit(): in a notebook, exit() tears down the
    # kernel session, whereas an exception stops "Run All" at this cell with
    # a readable message and prevents later cells from parsing a bad page.
    raise RuntimeError(
        f"Failed to retrieve data from {URL} (HTTP {response.status_code})"
    )

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Parse the downloaded HTML with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the text of every <div class="list-title mathjax">, removing any
# leading "title" label (case-insensitive) together with the colons, hyphens
# and whitespace that follow it.
title_nodes = soup.find_all('div', class_='list-title mathjax')
titles = [
    re.sub(r'(?i)^title[:\-\s]*', '', node.get_text(strip=True))
    for node in title_nodes
]

# Clean the data with NLTK
def clean_text(text):
    """Normalize a title into a space-separated string of lemmatized keywords.

    Steps: lowercase the text, tokenize it with ``word_tokenize``, keep only
    tokens made of word characters and hyphens that contain at least one
    alphanumeric character, drop English stopwords, then lemmatize each
    remaining token with ``WordNetLemmatizer`` (called without a POS tag).

    Args:
        text: The raw title string.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string if no token survives.
    """
    # Load the stopword list once per call and keep it in a set, instead of
    # re-reading the corpus and scanning a list for every single token.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token in word_tokenize(text.lower()):
        # Keep hyphenated words such as "test-time" ...
        if not re.fullmatch(r'[\w-]+', token):
            continue
        # ... but reject tokens that are nothing but hyphens/underscores
        # (e.g. the "-" in "a - b"), which the character class alone accepts.
        if not any(char.isalnum() for char in token):
            continue
        if token in english_stopwords:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(cleaned_tokens)

# Run the cleaner over every scraped title, preserving order
cleaned_titles = list(map(clean_text, titles))

# Example: print the cleaned form of one sample title
text = "Multi-Agent Verification: Scaling Test-Time Compute with Multiple Verifiers."
cleaned_example = clean_text(text)
print(cleaned_example)

multi-agent verification scaling test-time compute multiple verifier


In [None]:
# Save the results to CSV: one row per title, original next to cleaned
columns = {
    'Original Title': titles,
    'Cleaned Title': cleaned_titles,
}
df = pd.DataFrame(columns)
# index=False leaves the DataFrame's row index out of the file
df.to_csv('scraped_titles.csv', encoding='utf-8', index=False)

print("Scraping dan pembersihan data selesai! File disimpan sebagai scraped_titles.csv")

Scraping dan pembersihan data selesai! File disimpan sebagai scraped_titles.csv
