In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

# my stuff
import LoadSamples
from SeferNames import all_sfarim, shas, add_alt_chkchk
from SeferNames import cheleks, simans, siifs, sifkatans

In [2]:
long_mekor_str=40

### Names of notes seperators

In [3]:
simanim = np.asarray([[f'I{x}.', f'{x}.', f'{x}I.', f'{x}II.', f'{x}III.'] 
                      for x in ['I', 'V', 'X', 'XV']]).flatten().tolist()
simanim.append('XIV.')
simanim.insert(4, 'נספח')

sifim = [f'{x}.' for x in 'אבגדהוזחטי']
sifim.extend([f'י{x}.' for x in 'אבגדזחט'])
sifim.extend([f'ט{x}.' for x in 'וז'])
sifim.extend([f'כ{x}.' for x in ' אבגדהוזחט'])

klalim = [f'כלל {כ}' for כ in 'אבגדהוזחטי']
klalim.extend([f'כלל י{כ}' for כ in 'אבגד'])
klalim.extend([f'כלל ט{כ}' for כ in 'וז'])
klalim.extend([f'כלל י{כ}' for כ in 'זחט'])
klalim.extend([f'כלל כ{כ}' for כ in ' אבגדהוזחט'])

while len(simanim) < len(klalim):
    simanim.append(simanim[-1])
simanim = np.asarray(simanim)
simanim = simanim[np.argsort([len(siman) for siman in simanim])]

### Things to deal with chuck_chucks and numbers

In [4]:
def remove_chucks(hebrew_word):
    for chuck in ['״', '׳', '"', "'"]:
        while chuck in hebrew_word:
            hebrew_word = (hebrew_word[:hebrew_word.index(chuck)] 
                           + hebrew_word[hebrew_word.index(chuck)+1:])
    return hebrew_word

In [5]:
def chuck_with(word, prefix):
    word = remove_chucks(word)
    word = prefix + word
    word = word[:-1] + '״' + word[-1]
    return word

In [6]:
def has_chuck(word):
    return ("'" in word
            or '"' in word
            or '״' in word)

In [7]:
def can_be_prefix(letter, place):
    first_only = 'בכמלו'
    can_be_seconds = 'השד'
    if place <=2 and letter in can_be_seconds:
        return True
    elif place == 1 and letter in first_only:
        return True
    return False

In [8]:
def is_numeric(hebrew_word, allow_prefix=False, fifty_cap=False,
               allow_hundreds=True):
    ones = 'אבגדהוזחט'
    tens = 'יכלמנסעפצ'
    hundreds = 'קרשת'
    if fifty_cap:
        tens = 'יכלמנ'
        allow_hundreds=False
    hebrew_word = remove_chucks(hebrew_word)
    current_place = 100
    prev_letter = 'ת'
    for index, letter in enumerate(hebrew_word):
        if allow_prefix and can_be_prefix(letter, index+1):
            continue
        elif letter in hundreds:
            if not allow_hundreds:
                return False
            elif current_place <= 100:
                return False
            elif hundreds.index(prev_letter) < hundreds.index(letter):
                return False
            elif (hundreds.index(prev_letter) == hundreds.index(letter)
                  and letter != 'ת'):
                return False
        elif letter in tens:
            if current_place <= 10:
                return False
            current_place = 10
        elif letter in ones:
            if current_place <= 1:
                return False
            current_place = 1
        else:
            return False
        prev_letter = letter
    return True

In [9]:
def remove_pref(h_str, allowed='מבו'):
    if h_str[0] == 'ו':
        h_str = h_str[1:]
    if h_str[0] in allowed:
        h_str = h_str[1:]
    return h_str

### On the daf identifiers

In [10]:
def get_amud_str(amud):
    if amud == 'א':
        return '.'
    elif amud == 'ב':
        return ':'
    return ''

In [11]:
def get_dafs(mekor_str):
    words = mekor_str.split(' ')
    dafs = []
    pairs = []
    if 'דף' in words:
        # prob form of דף ה עמוד ב, etc
        where = words.index('דף')
        
    for index, word in enumerate(words):
        if word == 'דף':
            dafs.append(remove_chucks(words[index+1]))
        elif word == 'עמוד':
            amud = get_amud_str(words[index+1])
            if dafs and amud != '':
                pairs.append(dafs.pop()+amud)
        elif 'ע״' in word or ('ע' + '"') in word:
            amud = get_amud_str(word[-1])
            if amud == '':
                continue
            daf = words[index-1]
            if 'ד' in daf and ('״' in daf or '"' in daf):
                daf = daf[1:]
            pairs.append(remove_chucks(daf)+amud)
    for lonely_daf in dafs:
        pairs.append(lonely_daf)
    return pairs

### Rambam identifiers

In [12]:
def rambam_perk_halacha(mekor_str):
    words = mekor_str.split(' ')
    pereks = []
    pairs = []
    
    for index, word in enumerate(words):
        if ('״' in word or '"' in word):
            cut = remove_pref(word, allowed='מבו')
            if cut[0] == 'פ':
                cut = remove_pref(cut, allowed='פ')
                if is_numeric(cut, fifty_cap=True):
                    pereks.append(chuck_with(cut, 'פ'))
            if cut[0] == 'ה': 
                cut = remove_pref(cut, allowed='ה')
                if is_numeric(cut, fifty_cap=True):
                    halacha = chuck_with(cut, 'ה')
                    if pereks:
                        pairs.append(f'{pereks.pop()} {halacha}')
        elif word == 'פרק':
            pereks.append(words[index+1])
        elif word == 'הלכה':
            halacha = words[index+1]
            if pereks:
                pairs.append(f'{pereks.pop()}:{halacha}')
        
    for lonely_perek in pereks:
        pairs.append(lonely_perek)
    return pairs

### Shutim identifiers

In [13]:
def shutim_section(mekor_str):
    
    if len(mekor_str) > long_mekor_str:
        return '?'
    
    sect_str = ''
    words = mekor_str.split(' ')
    for index, word in enumerate(words):
        for sign in [*cheleks, *simans, *siifs, *sifkatans]:
            if sign in word:
                if sign == 'סעיף' and words[index+1] == 'קטן':
                    word = 'סעיף קטן'
                    index += 1
                if sect_str:
                    sect_str += ' '
                sect_str += word
                if not has_chuck(word) or sign in sifkatans:
                    sect_str += ' '
                    sect_str += words[index+1]
    return sect_str

### Turim identifiers

In [14]:
def tur_is_really_a_shut(mekor_str, tur_str):
    for shut in all_sfarim['shutim']:
        if (shut in mekor_str and np.abs(
             mekor_str.index(shut) -  mekor_str.index(tur_str)) < 30):
            return True
    return False

### All identifier

In [15]:
def get_sefer_name(mekor_str):
    
    # this is a basic inconsistency i have that should be fixed
    mekor_str = mekor_str.replace('השגת', 'השגות')
    
    for label, sefer_names in all_sfarim.items():
        for sefer in sefer_names:
             if sefer in mekor_str:
                if label == 'sfarim_chizonim':
                    return sefer
                elif label == 'turim':
                    if tur_is_really_a_shut(mekor_str, sefer):
                        continue
                    return mekor_str[:mekor_str.index(sefer)]
                elif label == 'shutim':
                    for sectioner in [*cheleks, *simans]:
                        if sectioner in mekor_str:
                            return mekor_str[:mekor_str.index(sectioner)]
                elif label == 'rambam':
                    for halacha_str in [' הלכות ',"הל' ", 'הל׳',]:
                        if halacha_str in mekor_str:
                            mekor_str = mekor_str.replace(halacha_str, '')
                    return mekor_str[:mekor_str.index(sefer)]
                elif label == 'mesechtot':
                    if 'מסכת' in mekor_str:
                        name = mekor_str[:mekor_str.index('מסכת')]
                    elif sefer == mekor_str.split(' ')[0]:
                        name = 'תלמוד בבלי'
                    else:
                        name = mekor_str[:mekor_str.index(sefer)]
                    if 'תלמוד בבלי' in name:
                        name = 'גמרא'
                    if 'חידושי' in name:
                        name = name.replace('חידושי ה', '')
                    return name
    return '?'

### Sorters

In [16]:
def gematria(letters_str):
    value = 0
    ones = 'אבגדהוזחט'
    for ot in ones:
        if ot in letters_str:
            value += (ones.index(ot)+1)
    tens = 'יכלמנסעפצ'
    for ot in tens:
        if ot in letters_str:
            value += (tens.index(ot)+1) * 10
    hunds = 'קרשת'
    for ot in hunds:
        if ot in letters_str:
            value += (hunds.index(ot)+1) * 100
    return value

In [17]:
def daf_numerical(daf_str):
    value = gematria(daf_str)
    if ':' in daf_str:
        value += .5
    return value

In [18]:
def perek_halacha_numerical(ph_str):
    if ':' in ph_str:
        perek, halacha = ph_str.split(':') 
    else:
        perek = ph_str
        halacha = ''
    perek_value = gematria(perek)
    halacha_value = gematria(halacha)
    total_value = perek_value + halacha_value/100.
    return total_value

In [19]:
def reorder(these_sfarim, priorities):

    order = np.argsort(priorities)
    these_sfarim = these_sfarim.iloc[order]
    
    # put ordering into big df
    indicies = these_sfarim.index
    these_sfarim.index = range(len(these_sfarim))
    return these_sfarim

In [20]:
def get_summed_priorities(*priorities):
    summed_priority = np.zeros(len(priorities[0]))
    multiplier = 100 ** (len(priorities)-1)
    for priority in priorities:
        priority = np.asarray(priority)
        summed_priority += priority*multiplier
        multiplier /= 100
    return summed_priority

In [21]:
def klal_siman_sif_priority(mekorot_df):
    
    klalim_order = np.array(
        [klalim.index(klal) for klal in mesechets['klal']])
    
    simanim_order = np.array(
        [klalim.index(siman) for siman in mesechets['siman']])
    
    sifim_order = np.array(
        [klalim.index(siman) for siman in mesechets['sif']])
    
    return (klalim_order, simanim_order, sifim_order)

In [22]:
def sort_shas(mekorot_df):
    # get the mesechets out of the big df
    mesechets = mekorot_df[mekorot_df['type'] == 'mesechtot']
    
    # order by order of shas
    mesechets_order = np.array(
        [shas.index(section) for section in mesechets['section']])
    
    # put the commentaries on the rif in the back
    rif_commentaries = ['ר״ן על', 'ספר הזכות', 'מלחמת', 'כתוב שם', 'המאור', 'רי״ף']
    add_alt_chkchk(rif_commentaries)
    rif_priority = np.array(
        [len(rif_commentaries) -
          [*[rishon in sefer for rishon in rif_commentaries], 
            True].index(True)
         for sefer in mesechets['sefer']])
    
    # order by daf
    daf_order = np.array(
        [daf_numerical(daf_str) for daf_str in mesechets['where']])
    
    # order gemara first then commentaries
    gemara_first_priority = np.array(
        [sefer != 'גמרא' for sefer in mesechets['sefer']]).astype(int)
    
    # order some of the main commentaries first
    main_rishonim = ['רש״י', 'תוס', 'רמב״ן', 'רשב״א', 'ריטב״א']
    add_alt_chkchk(main_rishonim)
    rishonim_priority = np.array(
        [[*[rishon in sefer for rishon in main_rishonim], 
            True].index(True)
         for sefer in mesechets['sefer']])
    
    # alphabetize to group commentaries with same name
    # -> helpful for sorting תוס ראש  and other תוס and such
    a_sorted = np.sort(mesechets['sefer']).tolist()
    alphabetical_priority = np.array(
                [a_sorted.index(sefer) for sefer in mesechets['sefer']])
    
    
    summed_priorities = get_summed_priorities(
        mesechets_order, rif_priority,
        daf_order, gemara_first_priority,
        rishonim_priority, alphabetical_priority
    )
    
    return reorder(mesechets, summed_priorities)

In [23]:
def sort_rambam(mekorot_df):
    
    rambams = mekorot_df[mekorot_df['type'] == 'rambam']
    
    
    # order by section
    halachas_order = np.array(
        [all_sfarim['rambam'].index(section) for section in rambams['section']]) * 100
    
    # order by perek halacha
    halacha_priority = np.array(
                [perek_halacha_numerical(where) for where in rambams['where']]) * 100
    
    # known ordering of sfarim
    known_sfarim = ['רמב״ם', 'השגות הראב״ד',
                    'מגיד משנה', 'מ״מ', 'כסף משנה', 'כס״מ',
                    'משנה למלך',
                   ]
    add_alt_chkchk(known_sfarim)
    known_priority = np.array(
        [[*[sf in sefer for sf in known_sfarim], 
            True].index(True)
         for sefer in rambams['sefer']])
    
    # alphabetize to group commentaries with same name
    a_sorted = np.sort(rambams['sefer']).tolist()
    alphabetical_priority = np.array(
                [a_sorted.index(sefer) for sefer in rambams['sefer']])
     
    summed_priorities = get_summed_priorities(
        halachas_order, halacha_priority, known_priority, alphabetical_priority
    )
    
    return reorder(rambams, summed_priorities)

In [24]:
def sort_section(mekorot_df, typ):
    if typ == 'mesechtot':
        return sort_shas(mekorot_df)
    elif typ == 'rambam':
        return sort_rambam(mekorot_df)
    # get the section out of the big df
    these_sfarim = mekorot_df[mekorot_df['type'] == typ]
    
    # alphabetize to group commentaries with same name
    a_sorted = np.sort(these_sfarim['sefer']).tolist()
    alphabetical_priority = np.array(
                [a_sorted.index(sefer) for sefer in these_sfarim['sefer']])
    
    return reorder(these_sfarim, alphabetical_priority)

### Mafteach Builder

In [25]:
def build_mafteach(text):
    """ Builds a Mafteach based on the given hebrew text.
    
        Args:
            text (list of str): List of strings where each
                item is a line of text.
        
        Returns:
            list of pd.DataFrames, each including a different
            kind of mafteach.
    """
    #------Variables for the tracking
    this_klal = None
    this_siman = None
    this_sif = None
    mekorot = []
    #-------Find the mekorot
    for line in text:
        #------Check if it is a new klal, siman or sif
        for klal in klalim:
            if klal in line:
                if (len(line) <= line.index(klal)+len(klal)
                        or line[line.index(klal)+len(klal)] == '.'):
                    this_klal = klal
        for siman in simanim:
            if siman in line:
                if '.' in siman:
                    siman = siman[:-1]
                this_siman = siman # take off the period
                this_sif = '-'
                
        for sif in sifim:
            if sif in line and len(line)<50:
                this_sif = sif[:-1] # take off the period
                
        if len(line) > long_mekor_str:
            continue
        #------Check if there is a mekor here
        for label, sefer_names in all_sfarim.items():
            # ^ "sefer_names" is not a super accurate variable descript
            #   unfortunately, because for example חידושי הריטב״א סוכה is 
            #   being identified under the "sefer_name" סוכה with this alg.
            for sefer in sefer_names:
                if sefer in line:
                    
                    where = ['']
                    section = []
                    
                    if label == 'mesechtot':
                        where = get_dafs(line)
                        if where == ['']:
                            # This is probably not a real mekor, just the notes
                            # mention something general about מסכת מעילה etc.
                            continue
                        section = sefer#sefer[sefer.index(' ')+1:]
                    if label == 'rambam':
                        where = rambam_perk_halacha(line)
                        section = sefer
                    elif label == 'turim':
                        if tur_is_really_a_shut(line, sefer):
                            continue
                    elif label == 'shutim':
                        where = shutim_section(line)
                    elif label == 'sfarim_chizonim':
                        shut_sect = shutim_section(line)
                        if shut_sect:
                            where = shut_sect
                        else:
                            where = line[line.index(sefer)+len(sefer)+1:]
                    
                    text_start = line.index(sefer) - 15
                    if text_start < 0: text_start = 0
                    text = line[text_start:line.index(sefer) + 35]
                    
                    sefer_name = get_sefer_name(line).replace('״', '"')
                    while len(sefer_name)>1 and sefer_name[-1] == ' ':
                        sefer_name = sefer_name[:-1]
                    
                    if type(where) != list: where = [where]
                    if type(section) != list: section = [section]
                    while len(section) < len(where):
                        section.append('')
                    
                    for w, s in zip(where, section):
                        mekor = pd.DataFrame({
                                    'text': [text], 'type': [label],
                                    'sefer': [sefer_name],
                                    'section': [s],
                                    'klal': [this_klal], 'siman': [this_siman],
                                    'sif': [this_sif],
                                    'where': [w]})
                        mekorot.append(mekor)
    #-------Turn it into a DataFrame
    key = pd.concat(mekorot)
    key.index = range(len(key))
    #-------Sort
    type_sort = np.argsort([list(all_sfarim.keys()).index(typ) for typ in key['type']])
    key = key.loc[type_sort]
    key.index = range(len(key))
    
    keys = []
    for typ in key['type'].unique():
        keys.append(sort_section(key, typ))
    
    for key_num, key in enumerate(keys):
        keys[key_num] = key[['sif', 'siman', 'klal', 'sefer', 'where', 'section', 'type']]
        # blank repeated refrences
        for rownum, (index, row) in enumerate(keys[key_num].iterrows()):
            row = row[['klal', 'siman', 'sif', 'sefer', 'where', 'section']]
            for label, item in row.iteritems():
                go_backs = 1
                while rownum-go_backs>0 and keys[key_num].iloc[rownum-go_backs][label] == '':
                    go_backs += 1
                
                if (rownum-go_backs>=0 
                    and (item == keys[key_num].iloc[rownum-go_backs][label])
                    and item != '-'):
                    # dont blank repeat siman/sif unless 
                    # its a repeat klal etc
                    if label == 'siman' or label == 'sif':
                        if not keys[key_num].loc[index, 'klal'] == '':
                            continue
                    if label == 'sif':
                        if not keys[key_num].loc[index, 'siman'] == '':
                            continue
                    # do the blanking
                    keys[key_num].loc[index, label] = ''      

    return keys

In [26]:
def save_mafteach(keys, savename='mafteach.xlsx'):
    """" Saves the outputted Mafteach from build_mafteach to an xcel sheet."""
    with pd.ExcelWriter('mafteach.xlsx') as writer:
        for key in keys:
            typ = key.pop('type').unique()[0]
            key.to_excel(excel_writer=writer, sheet_name=typ)

## Test Function

In [27]:
if __name__ == '__main__':
    
    text = LoadSamples.load_sample(1)
    keys = build_mafteach(text)
    save_mafteach(keys)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
