In [7]:
import sys
import pandas as pd
import numpy as np
import re
import pywikibot
import requests
from bs4 import BeautifulSoup
import datetime


In [8]:
catalog_df = pd.read_csv("public_domain_dump-master/pseudocatalogue.csv")
i = 6
i

6

In [9]:
ONLY_WORDS_AND_DIGITS_REGEX = re.compile(r'[\w\dא-ת]+')

def get_only_words_and_digits(text: str):
    return ONLY_WORDS_AND_DIGITS_REGEX.findall(text)

def invert_words(words: list):
    return [w[::-1] for w in words]

def remove_last_line_from_string(s):
    s = s.split("את הטקסט לעיל הפיקו מתנדבי פרויקט בן־יהודה באינטרנט.  הוא זמין תמיד בכתובת הבאה")
    return s[0]


def remove_author_title(s, author, title):
    s = s.replace(title, "", 1)
    return s.replace(author, "")

def normalize_text(s):
    s = re.sub(r'[\u0591-\u05BD\u05BF-\u05C2\u05C4-\u05C7]', '', s)
    return s.strip()

def parse_place(place):
    place = place.replace("\n", "$")
    place = place.split("$$$$$$$$$$$$$$$$")[1]
    return place.split("$$$$$$$$$")[0]

def parse_date(date):
    date = date.replace("\n", "$")
    date = date.split("$$$$$$$$$$$$$$$$")[1]
    date = date.split("$$$$$$$$$")[0]
    date = date.split("Gregorian")[0]

    if len(date) == 4:
        return datetime.datetime.strptime(date, '%Y')
    return  datetime.datetime.strptime(date, '%d %B %Y')


In [15]:
# i = 0
# content = []
# for index, row in catalog_df.iterrows():
#     print(index)
#     path = row["path"]
#     title = normalize_text(row["title"])
#     author = row["authors"]
#     path = "public_domain_dump-master/txt_stripped" + path + ".txt"
#     with open(path, "r", encoding="utf-8") as f:
#         text = f.read()
#         text = remove_last_line_from_string(text)
#         text = remove_author_title(text, author, title)
#         text = get_only_words_and_digits(text)
#         content.append(text)
#     i += 1
# catalog_df["text"] = content

In [10]:
def parse_writer(writer, id):
    site = pywikibot.Site("he", "wikipedia")
    page = pywikibot.Page(site, writer)
    try:
        item = pywikibot.ItemPage.fromPage(page)
        page_id = item.id
        URL = f"https://www.wikidata.org/wiki/{page_id}"
    except:

        URL = f"https://he.wikipedia.org/?curid={page.pageid}"
        response = requests.get(URL)
        soup = BeautifulSoup(response.text, 'html')
        URL = soup.find('a', {'title' : 'קישור לפריט המשויך במאגר הנתונים [g]'})['href']


    response = requests.get(URL)
    print(response, writer)

    soup = BeautifulSoup(response.text, 'html')


    try:
        sex = soup.find('a', {'title' : 'Q6581097'}).string
    except:
        sex = "female"


    place_of_birth = soup.find("div", {"id": "P19"}).text
    try:
        place_of_birth = parse_place(place_of_birth)
    except:
        print("p birth - ", sys.exc_info())

    place_of_death = soup.find("div", {"id": "P20"}).text
    try:
        place_of_death = parse_place(place_of_death)
    except:
        print("p death - ", sys.exc_info())


    date_of_birth = soup.find("div", {"id": "P569"}).text
    try:
        date_of_birth = parse_date(date_of_birth)
    except:
        print("d birth - ", sys.exc_info())

    date_of_death = soup.find("div", {"id": "P570"}).text
    try:
        date_of_death = parse_date(date_of_death)
    except:
        print("d death - ", sys.exc_info())

    return [id, writer, sex, place_of_birth, place_of_death, date_of_birth, date_of_death]



writers = catalog_df["authors"].unique()
authors_df = pd.DataFrame(columns=["id", "name", "sex", "p_birth", "p_death", "d_birth", "d_death"], )
for i, writer in enumerate(writers):
    try:
        authors_df.loc[i] = parse_writer(writer, i)
    except Exception:
        authors_df.loc[i] = [i, writer, None, None, None, None, None]
        print(sys.exc_info())
        print("couldn't complete for", writer, i)
print(authors_df)
authors_df.to_csv("authors", index=False)


<Response [200]> אחד העם
<Response [200]> אירה יאן
<Response [200]> יהודה ליב גורדון
(<class 'TypeError'>, TypeError("'NoneType' object is not subscriptable"), <traceback object at 0x0000019D4F7D5880>)
couldn't complete for עזריאל נתן פרנק 3
<Response [200]> יוסף חיים ברנר
d birth -  (<class 'ValueError'>, ValueError('unconverted data remains: Julian$$$$$$$$stated as$$$$$י"ז אלול תרמ"א'), <traceback object at 0x0000019D501D4500>)
<Response [200]> רחל בלובשטיין
<Response [200]> חיים נחמן ביאליק
d birth -  (<class 'ValueError'>, ValueError('unconverted data remains: Julian'), <traceback object at 0x0000019D51516440>)
<Response [200]> קדיש יהודה סילמן
<Response [200]> אלחנן ליב לוינסקי
<Response [200]> שמריהו לוין
<Response [200]> אברהם שטרן
<Response [200]> משה אבן עזרא
<Response [200]> אברהם שלום פרידברג
<Response [200]> אהרן אברהם קבק
(<class 'TypeError'>, TypeError("'NoneType' object is not subscriptable"), <traceback object at 0x0000019D54571380>)
couldn't complete for זלמן דוד בן יה