In [2]:
import sys
import pandas as pd
import numpy as np
import re
import pywikibot
import requests
from bs4 import BeautifulSoup
import datetime


In [None]:
# Load the catalogue CSV; later cells read its "path", "title" and "authors" columns.
catalog_df = pd.read_csv("public_domain_dump-master/pseudocatalogue.csv")
# NOTE(review): looks like leftover scratch — this assigns 6 to the notebook-global `i`
# and displays it; every later visible cell that uses `i` reassigns it first.
i = 6
i

In [None]:
# Matches maximal runs of word characters, digits, or Hebrew letters alef..tav.
ONLY_WORDS_AND_DIGITS_REGEX = re.compile(r'[\w\dא-ת]+')


def get_only_words_and_digits(text: str):
    """Split ``text`` into the tokens matched by ``ONLY_WORDS_AND_DIGITS_REGEX``.

    Everything between matches (punctuation, whitespace, ...) is discarded.

    Args:
        text: The string to tokenise.

    Returns:
        A list with the matched substrings, in order of appearance; empty
        when nothing matches.
    """
    return [match.group() for match in ONLY_WORDS_AND_DIGITS_REGEX.finditer(text)]

def invert_words(words: list):
    """Return a new list in which each word is reversed character by character.

    Args:
        words: Sequence of sliceable items (strings in this notebook).

    Returns:
        A list of the same length and order, every item reversed with ``[::-1]``.
    """
    reversed_words = []
    for word in words:
        reversed_words.append(word[::-1])
    return reversed_words

def remove_last_line_from_string(s):
    """Cut the volunteer-credit footer, and everything after it, from ``s``.

    The footer is recognised by its fixed Hebrew opening sentence. When the
    sentence does not occur, ``s`` is returned unchanged.

    Args:
        s: Full text of a work.

    Returns:
        The part of ``s`` that precedes the first occurrence of the footer.
    """
    footer_marker = "את הטקסט לעיל הפיקו מתנדבי פרויקט בן־יהודה באינטרנט.  הוא זמין תמיד בכתובת הבאה"
    before_footer, _, _ = s.partition(footer_marker)
    return before_footer


def remove_author_title(s, author, title):
    """Strip the work's title and its author's name out of the text ``s``.

    Only the first occurrence of ``title`` is removed, whereas every
    occurrence of ``author`` is removed.

    Args:
        s: Text of the work.
        author: Author name to delete wherever it appears.
        title: Title to delete once.

    Returns:
        The text with those substrings removed.
    """
    without_title = s.replace(title, "", 1)
    without_author = without_title.replace(author, "")
    return without_author

def normalize_text(s):
    """Remove Hebrew pointing from ``s`` and trim surrounding whitespace.

    Characters in the code-point ranges U+0591-U+05BD, U+05BF-U+05C2 and
    U+05C4-U+05C7 are deleted; U+05BE (maqaf) and U+05C3 (sof pasuq) fall
    outside those ranges and are kept.

    Args:
        s: Text that may contain Hebrew vowel points / cantillation marks.

    Returns:
        The cleaned, stripped string.
    """
    pointing = re.compile(r'[\u0591-\u05BD\u05BF-\u05C2\u05C4-\u05C7]')
    unpointed = pointing.sub('', s)
    return unpointed.strip()

def parse_place(place):
    """Pull the place name out of the text of a scraped property block.

    Newlines are first turned into ``$`` placeholders. The value is then the
    text that follows the first run of 16 placeholders and precedes the next
    run of 9 placeholders.

    Args:
        place: Raw ``.text`` of the property block.

    Returns:
        The extracted place string.

    Raises:
        IndexError: If the 16-placeholder separator does not occur.
    """
    header_gap = "$$$$$$$$$$$$$$$$"
    value_gap = "$$$$$$$$$"
    flattened = "$".join(place.split("\n"))
    after_header = flattened.split(header_gap)[1]
    value, *_ = after_header.split(value_gap)
    return value

def parse_date(date):
    """Parse the date shown in the text of a scraped property block.

    The value is located the same way ``parse_place`` does it: newlines become
    ``$`` placeholders, and the value is the text after the first run of 16
    placeholders and before the next run of 9. Anything from the word
    "Gregorian" onwards is dropped.

    Whitespace and leftover placeholders around the value are stripped before
    parsing, so a stray space or newline next to an otherwise valid date no
    longer makes ``strptime`` fail.

    Args:
        date: Raw ``.text`` of the property block.

    Returns:
        A ``datetime.datetime``. A bare four-character year yields 1 January
        of that year; otherwise the value must look like ``14 March 1879``.

    Raises:
        IndexError: If the 16-placeholder separator does not occur.
        ValueError: If the extracted value matches neither supported format.
    """
    date = date.replace("\n", "$")
    date = date.split("$$$$$$$$$$$$$$$$")[1]
    date = date.split("$$$$$$$$$")[0]
    date = date.split("Gregorian")[0]
    # Newlines were turned into "$" above, so trim those together with whitespace.
    date = date.strip("$ \t\r\f\v")

    if len(date) == 4:
        return datetime.datetime.strptime(date, '%Y')
    # NOTE: %B matches month names of the current locale (English by default).
    return datetime.datetime.strptime(date, '%d %B %Y')


In [None]:
# Read every catalogued work from disk, strip the footer plus the title/author strings,
# tokenise what is left, and store the token lists in a new "text" column.
# NOTE(review): `i` is incremented once per row but never read in the visible cells.
i = 0
content = []
for index, row in catalog_df.iterrows():
    print(index)
    path = row["path"]
    # Only the title is normalised; the author string is used exactly as stored.
    title = normalize_text(row["title"])
    author = row["authors"]
    # The catalogue path is appended straight onto the directory name, so it presumably
    # already begins with a path separator — TODO confirm against the CSV.
    path = "public_domain_dump-master/txt_stripped" + path + ".txt"
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
        text = remove_last_line_from_string(text)
        text = remove_author_title(text, author, title)
        text = get_only_words_and_digits(text)
        content.append(text)
    i += 1
# `content` holds one token list per row, in iteration order.
catalog_df["text"] = content

In [None]:
# Wikidata item ids used as link titles for the two sex values this notebook records.
_WIKIDATA_MALE_QID = "Q6581097"
_WIKIDATA_FEMALE_QID = "Q6581072"
# Upper bound, in seconds, for every HTTP request so one stalled connection cannot hang the run.
_REQUEST_TIMEOUT_SECONDS = 30


def _extract_property(soup, property_id, parse, label):
    """Return the parsed text of the ``<div id=property_id>`` block.

    Returns ``None`` when the page has no such block, and the raw block text
    when ``parse`` raises (the failure is printed, prefixed with ``label``).
    """
    block = soup.find("div", {"id": property_id})
    if block is None:
        print(label, "property", property_id, "not found on page")
        return None
    raw_text = block.text
    try:
        return parse(raw_text)
    except Exception:
        print(label, sys.exc_info())
        return raw_text


def _detect_sex(soup):
    """Return the male link's text, ``"female"``, or ``None`` when neither link is on the page."""
    male_link = soup.find('a', {'title': _WIKIDATA_MALE_QID})
    if male_link is not None:
        return male_link.string
    if soup.find('a', {'title': _WIKIDATA_FEMALE_QID}) is not None:
        return "female"
    # Previously every author without a male link was labelled "female", which also
    # mislabelled authors whose page carries no sex information at all.
    return None


def parse_writer(writer, id):
    """Scrape sex, birth/death place and birth/death date for one author.

    The author's Hebrew Wikipedia page is resolved to its Wikidata item via
    pywikibot. If that fails, the Wikidata link is scraped from the Wikipedia
    page itself. The Wikidata HTML page is then fetched and parsed.

    A property that is absent from the page (for example no date of death)
    yields ``None`` for that field only, instead of aborting the whole author.
    A property whose text cannot be parsed keeps its raw text, as before.

    Args:
        writer: Author name, used as the Hebrew Wikipedia page title.
        id: Identifier to store in the first column of the result.

    Returns:
        ``[id, writer, sex, place_of_birth, place_of_death, date_of_birth,
        date_of_death]``.

    Raises:
        Exception: Network errors, HTTP error statuses and lookup failures in
            the fallback path propagate to the caller.
    """
    site = pywikibot.Site("he", "wikipedia")
    page = pywikibot.Page(site, writer)
    try:
        item = pywikibot.ItemPage.fromPage(page)
        url = f"https://www.wikidata.org/wiki/{item.id}"
    except Exception:
        # Fallback: follow the "linked data item" sidebar link of the Wikipedia page.
        wiki_url = f"https://he.wikipedia.org/?curid={page.pageid}"
        wiki_response = requests.get(wiki_url, timeout=_REQUEST_TIMEOUT_SECONDS)
        wiki_soup = BeautifulSoup(wiki_response.text, 'html')
        url = wiki_soup.find('a', {'title' : 'קישור לפריט המשויך במאגר הנתונים [g]'})['href']

    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    print(response, writer)
    # Fail loudly on HTTP errors rather than treating an error page as "no data".
    response.raise_for_status()

    # The 'html' feature string is kept as-is: parse_place/parse_date depend on the
    # exact whitespace layout of the text this parser choice produces.
    soup = BeautifulSoup(response.text, 'html')

    sex = _detect_sex(soup)
    place_of_birth = _extract_property(soup, "P19", parse_place, "p birth - ")
    place_of_death = _extract_property(soup, "P20", parse_place, "p death - ")
    date_of_birth = _extract_property(soup, "P569", parse_date, "d birth - ")
    date_of_death = _extract_property(soup, "P570", parse_date, "d death - ")

    return [id, writer, sex, place_of_birth, place_of_death, date_of_birth, date_of_death]

In [None]:
# Look up every distinct catalogue author and collect one metadata row per author.
writers = catalog_df["authors"].unique()
authors_df = pd.DataFrame(columns=["id", "name", "sex", "p_birth", "p_death", "d_birth", "d_death"], )
for i, writer in enumerate(writers):
    try:
        authors_df.loc[i] = parse_writer(writer, i)
    except Exception:
        # Keep a placeholder row so the author still appears in the output, then log the failure.
        authors_df.loc[i] = [i, writer, None, None, None, None, None]
        print(sys.exc_info())
        print("couldn't complete for", writer, i)
print(authors_df)
# NOTE(review): this writes a file named "authors" (no extension), while a later cell reads
# 'authors.csv' and expects a "Hebrew Speaker" column that is not produced here — presumably
# the file is renamed and annotated by hand in between; confirm before changing either name.
authors_df.to_csv("authors", index=False)

In [10]:
from dateutil import parser

def get_year(year_str):
    """Extract the year from a date string.

    Args:
        year_str: Date text, or a missing value (``None``, ``""``, NaN).

    Returns:
        ``0`` when the text contains ``'0000'``, the year found by
        ``dateutil``'s parser otherwise, and ``None`` when the value is
        missing, is not a string, or cannot be parsed.
    """
    # Missing cells arrive as None, "" or a float NaN; none is a non-empty string.
    # (An identity test against np.nan is unreliable: not every NaN is that object.)
    if not isinstance(year_str, str) or not year_str:
        return None
    if '0000' in year_str:
        return 0
    try:
        return parser.parse(year_str).year
    except Exception:
        return None


def _era_flags(year):
    """Return the six era indicators for ``year``, oldest era first; exactly one is true."""
    return [
        year <= 500,           # ancient
        500 < year <= 1400,    # spain
        1400 < year <= 1800,   # renaissance
        1800 < year <= 1900,   # 19th century
        1900 < year <= 2000,   # 20th century
        2000 < year,           # modern
    ]


def _is_hebrew_speaker(flag):
    """Interpret a "Hebrew Speaker" cell; only an explicit false value means non-speaker."""
    if isinstance(flag, str):
        return flag.strip().upper() != "FALSE"
    # pandas turns a TRUE/FALSE column into real booleans, which never equal "FALSE".
    if isinstance(flag, (bool, np.bool_)):
        return bool(flag)
    # Anything else (e.g. a missing value) keeps the permissive default.
    return True


def parse_authors(df, cities):
    """Turn author metadata rows into rows of boolean features.

    Rows whose birth or death year cannot be determined are skipped.

    Args:
        df: Frame with the columns ``id``, ``sex``, ``d_birth``, ``d_death``,
            ``p_birth``, ``p_death`` and ``Hebrew Speaker``.
        cities: Collection of place names that count as being in Israel.

    Returns:
        A list of 20-element lists: id, male, female, six birth-era flags,
        six death-era flags, hebrew_speaker, p_birth_israel,
        p_birth_not_israel, p_death_israel, p_death_not_israel.
    """
    data = []
    for _, row in df.iterrows():
        birth_year = get_year(row["d_birth"])
        death_year = get_year(row["d_death"])
        if birth_year is None or death_year is None:
            continue

        # A missing place is never "in cities", so it is counted as not-Israel.
        born_in_israel = row["p_birth"] in cities
        died_in_israel = row["p_death"] in cities

        data.append(
            [row["id"], row["sex"] == "male", row["sex"] == "female"]
            + _era_flags(birth_year)
            + _era_flags(death_year)
            + [
                _is_hebrew_speaker(row["Hebrew Speaker"]),
                born_in_israel,
                not born_in_israel,
                died_in_israel,
                not died_in_israel,
            ]
        )
    return data





# Load the author metadata and turn it into one row of boolean features per author.
# NOTE(review): the visible scraping cell saves its result as "authors" (no extension) and
# without a "Hebrew Speaker" column, which parse_authors reads — presumably 'authors.csv'
# is a separately prepared/annotated copy; confirm its provenance.
authors_df = pd.read_csv('authors.csv')
# Place names treated as "in Israel" when classifying birth and death places.
israeli_cities = ["Tel Aviv", "Jaffa", "Jerusalem", "Haifa", "Nahalal", "Acre", "Ramat Gan", "Givat Hashlosha",
                  "Rishon LeZion", "Petah Tikva", "Zikhron Ya'akov", "Israel", "Mandatory Palestine"]
parsed_authors_data = parse_authors(authors_df, israeli_cities)
# Column order must match the order in which parse_authors appends values to each row.
parsed_authors_df = pd.DataFrame(parsed_authors_data, columns=["id", "male", "female", "b_ancient", "b_spain", "b_renaissance", "b_19ct",
                     "b_20ct", "b_modern", "d_ancient", "d_spain", "d_renaissance", "d_19ct",
                     "d_20ct", "d_modern", "hebrew_speaker", "p_birth_israel", "p_birth_not_israel",
                     "p_death_israel", "p_death_not_israel"])

In [11]:
# Persist the feature table. No `index=False` is passed here (unlike the earlier
# `to_csv("authors", index=False)`), so the DataFrame index is written as the first column.
parsed_authors_df.to_csv("authors_parsed.csv")