In [2]:
import re

def print_text_with_line_breaks(text, line_length=100):
    for i in range(0, len(text), line_length):
        print(text[i:i+line_length])

def save_string_to_txt(string_content, txt_file):
    with open(txt_file, 'w', encoding='utf-8') as file:
        file.write(string_content)

def remove_text_in_square_brackets(input_string):
    pattern = r"\[[^\]]*\]"
    result = re.sub(pattern, "", input_string)

    return result


#### Wikipedia Extract Parser

In [14]:
import requests
import datetime

current_datetime = datetime.datetime.now()
title="Guernica (Picasso)"

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': f'{title}',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }).json()
page = next(iter(response['query']['pages'].values()))
text_string = page['extract']

save_string_to_txt(string_content=text_string, txt_file=f"{title}_{current_datetime}.txt")

Pablo Ruiz Picasso (25 October 1881 – 8 April 1973) was a Spanish painter, sculptor, printmaker, cer
amicist and theatre designer who spent most of his adult life in France. One of the most influential
 artists of the 20th century, he is known for co-founding the Cubist movement, the invention of cons
tructed sculpture, the co-invention of collage, and for the wide variety of styles that he helped de
velop and explore. Among his most famous works are the proto-Cubist Les Demoiselles d'Avignon (1907)
, and the anti-war painting Guernica (1937), a dramatic portrayal of the bombing of Guernica by Germ
an and Italian air forces during the Spanish Civil War.
Picasso demonstrated extraordinary artistic 
talent in his early years, painting in a naturalistic manner through his childhood and adolescence. 
During the first decade of the 20th century, his style changed as he experimented with different the
ories, techniques, and ideas. After 1906, the Fauvist work of the older artist Henri Matiss

#### Wikipedia Full Text Parser

In [4]:
import requests
from lxml import html
import datetime

current_datetime = datetime.datetime.now()
title="Pablo Picasso"

response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'parse',
        'page': f'{title}',
        'format': 'json',
    }).json()
raw_html = response['parse']['text']['*']
document = html.document_fromstring(raw_html)
p_nodes = document.xpath('//p')

text_string = " ".join(list(map(lambda x: x.text_content(), p_nodes)))
text_string = remove_text_in_square_brackets(text_string)

save_string_to_txt(string_content=text_string, txt_file=f"{title}_{current_datetime}.txt")

#### Recursive Wikipedia Full Text Parser

In [9]:
import wikipediaapi
import datetime

current_datetime = datetime.datetime.now()
wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (hyd6623@naver.com)', 'en')

def get_sub_linked_titles(wiki_title):
    page = wiki_wiki.page(wiki_title)
    if not page.exists():
        print(f"The page '{wiki_title}' does not exist on Wikipedia.")
        return []

    sub_linked_titles = []
    for link in page.links:
        sub_linked_titles.append(link)

    return sub_linked_titles

def get_page_content(wiki_title):
    page = wiki_wiki.page(wiki_title)
    if not page.exists():
        print(f"The page '{wiki_title}' does not exist on Wikipedia.")
        return ""

    return page.text

def save_wikipedia_to_text(title: str):
    current_datetime = datetime.datetime.now()
    text_string = get_page_content(title)
    save_string_to_txt(string_content=text_string, txt_file=f"wikipedia_{title}_{current_datetime}.txt")

title = "Pablo Picasso"
sub_linked_titles = get_sub_linked_titles(title)
sub_linked_titles.append(title)

for title_item in sub_linked_titles:
    save_wikipedia_to_text(title_item)

The page 'Eva Gouel' does not exist on Wikipedia.
The page 'Harlequin and His Companion (The Saltimbanque)' does not exist on Wikipedia.
The page 'Henry Valensi' does not exist on Wikipedia.
The page 'House in the Garden (House and Trees)' does not exist on Wikipedia.
The page 'Jeffrey B. Jackson' does not exist on Wikipedia.
The page 'Richard Lacayo' does not exist on Wikipedia.
The page 'File:10 PABLO PICASO.ogg' does not exist on Wikipedia.


FileNotFoundError: [Errno 2] No such file or directory: 'wikipedia_Template:Marriage/doc_2023-07-23 20:38:01.686399.txt'

#### Webpage Parser

In [51]:
import requests
from bs4 import BeautifulSoup
import datetime

current_datetime = datetime.datetime.now()
webpage_url = "https://www.museoreinasofia.es/en/collection/artwork/guernica"

def extract_text_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # Remove all script and style tags to exclude code
        for script in soup(['script', 'style', 'button', 'img', 'span', 'input', ]):
            script.extract()
        # Get the text content from the remaining HTML
        text = soup.get_text()
        # Normalize whitespace and remove leading/trailing spaces
        text = ' '.join(text.split())
        return text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

text_string = extract_text_from_url(webpage_url)
save_string_to_txt(string_content=text_string, txt_file=f"{'webpage_url'}_{current_datetime}.txt")
print_text_with_line_breaks(text_string)

Pablo Picasso (Pablo Ruiz Picasso) - Guernica What's on at the Museo Tickets Informative guide Openi
ng hours & prices EN ES EN PressRestorationLibrary and Documentation CentreEducationStudy CentreSupp
ort VisitPublicationsCollectionExhibitionsActivities Visit Hours and TicketsAccessGuided ToursGroup 
visitServicesAccessibilityPublicationsCollection Episode 1Episode 2Episode 3Episode 4Artworks in pub
lic placesBrowse all artworksRestorationExhibitionsActivities Seminars and LecturesFilm and VideoMus
ic, Dance and PerformanceAssociated activitiesForce LinesTIZs What's on at the Museo PressRestoratio
nLibrary and Documentation CentreEducationStudy CentreSupport Tickets Opening hours & prices Buscar 
en Entire website Colección Actividades Exposiciones Educación Visitas comentadas Multimedia Prensa 
Palabra clave ES EN Buscar Entire websiteCollectionActivitiesExhibitionsEducationGallery conversatio
nsMultimediaPressPublications Palabra clave Guernica Pablo Picasso (Pablo Ruiz Picasso) Mal