<a href="https://colab.research.google.com/github/roni762583/bible-data-science.github.io/blob/master/CleanBible.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

#clone data into datafile
! git clone https://github.com/roni762583/bible-data-science.github.io.git
# copy my datafile
! cp /content/bible-data-science.github.io/data/t3utf.dat datafile
# delete unused parts
! rm -rf /content/bible-data-science.github.io/
! ls -la

# Define the string that marks the end of the Torah
torah_end_marker = "אשרעשהמשהלעיניכלישראל"

# Initialize variables
s = ''  # to store the clean Hebrew text
hebrew_letters = {'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ך', 'ל', 'מ', 'ם', 'נ', 'ן', 'ס', 'ע', 'פ', 'ף', 'צ', 'ץ', 'ק', 'ר', 'ש', 'ת'}

# Open the data file and process line by line
with open('datafile', 'r') as file:
    for line in file:
        # Extract the text part after the last '|' and remove sof-pasuk colon
        line = line[line.rfind('|')+1:].replace('׃', '').strip().replace(" ", "")

        # Append only valid Hebrew characters to the string 's'
        s += ''.join([c for c in line if c in hebrew_letters])

        # Check if the end of the Torah has been reached
        if torah_end_marker in s:
            # Trim the string to end at the Torah marker
            s = s[:s.index(torah_end_marker) + len(torah_end_marker)]
            break

# Save the entire cleaned text up to and including the Torah end marker to a file
with open('preprocessed_torah.txt', 'w') as output_file:
    output_file.write(s)

# Optionally, print the last few characters to verify the Torah end marker
print("Last few characters in the processed text:")
print(s[-len(torah_end_marker):])  # Should print the Torah end marker







Cloning into 'bible-data-science.github.io'...
remote: Enumerating objects: 291, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (95/95), done.[K
remote: Total 291 (delta 75), reused 80 (delta 42), pack-reused 152 (from 1)[K
Receiving objects: 100% (291/291), 10.59 MiB | 15.53 MiB/s, done.
Resolving deltas: 100% (152/152), done.
total 5652
drwxr-xr-x 1 root root    4096 Aug 27 10:12 .
drwxr-xr-x 1 root root    4096 Aug 27 08:47 ..
drwxr-xr-x 4 root root    4096 Aug 23 13:20 .config
-rw-r--r-- 1 root root 5763655 Aug 27 10:12 datafile
drwxr-xr-x 2 root root    4096 Aug 27 09:24 .ipynb_checkpoints
drwxr-xr-x 1 root root    4096 Aug 23 13:20 sample_data
Last few characters in the processed text:
אשרעשהמשהלעיניכלישראל


In [None]:
# Pull off els substrings, offset strings = els

def findTermIndeces(text, term):
  # print('hello from findTermIndeces()')
  indxLst = list()
  indx = text.find(term) # index of first occurance of term
  while indx != -1: # while term is found in text
    indxLst.append(indx) # add index of found term to index list
    indx = text.find(term,indx+1) # find next occurance of search term after index of prev. find
  return indxLst

def searchELS(text, term, minELS, maxELS):
  # print('hello from searchELS()')
  # initially address positive ELS values only
  # ternary-operator
  minELS = minELS if minELS>0 else 1
  foundTermIndeces = list() # empty list to hold indeces of found search term
  results = dict()
  elsRng = range(minELS, maxELS)
  for els in elsRng:
    # print('searching ELS = ', els)
    # get ELS substring with offset from 0 to els (current els being examined)
    # n = ELS substrings required to cover search space
    for offset in range(0, els): # get all offsets from 0 to els
      shiftedString = text[offset::els] # get offset string
      #foundTermIndeces = foundTermIndeces + findTermIndeces(shiftedString, term) # append new list of indeces to exisiting built list
      foundTermIndeces = findTermIndeces(shiftedString, term) # append new list of indeces to exisiting built list
      if not (foundTermIndeces): # don't store empty lists
        continue
      t = tuple([offset, foundTermIndeces]) # store offset to reconstrct indeces, result indeces are of shifted string, not of orig. text
      if(els>0): # els 0 is non-sensical, open text is els = 1 here
        results[els] = t # add found indeces to dictionary keyed by els
      foundTermIndeces = list() # clean out old list
  return results
# ex:
# print(searchELS(s,'עדנה', 0, 17))

In [None]:
def findTermIndecesWithOffset(text, term, offset):
  # print('hello from findTermIndecesWithOffset()')
  indxLst = list()
  indx = text.find(term) # index of first occurance of term
  while indx != -1: # while term is found in text
    offset = offset if offset!=0 else 1
    indxLst.append(indx*offset) # add index of found term to index list, THE INDEX IN ORIGINAL TEXT
    indx = text.find(term,indx+1) # find next occurance of search term after index of prev. find
  return indxLst



def searchELSmod(text, term, minELS, maxELS):
  # print('hello from searchELS()')
  # initially address positive ELS values only
  # ternary-operator
  minELS = minELS if minELS>0 else 1
  foundTermIndeces = list() # empty list to hold indeces of found search term
  results = dict()
  elsRng = range(minELS, maxELS)
  for els in elsRng:
    # print('searching ELS = ', els)
    # get ELS substring with offset from 0 to els (current els being examined)
    # n = ELS substrings required to cover search space
    for offset in range(0, els): # get all offsets from 0 to els
      shiftedString = text[offset::els] # get offset string
      #foundTermIndeces = foundTermIndeces + findTermIndeces(shiftedString, term) # append new list of indeces to exisiting built list
      foundTermIndeces = findTermIndecesWithOffset(shiftedString, term, offset) # append new list of indeces to exisiting built list
      if not (foundTermIndeces): # don't store empty lists
        continue
      t = tuple([offset, foundTermIndeces]) # store offset to reconstrct indeces, result indeces are of shifted string, not of orig. text
      if(els>0): # els 0 is non-sensical, open text is els = 1 here
        results[els] = foundTermIndeces # these indeces are supposed to be equal to the indeces in the original text
      foundTermIndeces = list() # clean out old list
  return results
# ex:
# print(searchELS(s,'עדנה', 0, 17))

In [None]:
# get user input for search term
term1 = input ("Enter Primary Search Term:")
print(term1)


Enter Primary Search Term:עדנה
עדנה


In [None]:
print(searchELSmod(s, term1, -10, 5))

{1: [21078, 108600, 574590, 835684, 1019725, 1169894], 2: [226804, 435273], 3: [763544], 4: [171879, 173530]}


In [None]:
# This section deals with reconstructing ELS codes for validation
# need to make use of offset value to reconstruct properly
# ex:
maxELS = 5
# get results dict
results = searchELSmod(s, term1, 0, maxELS)
# helper var
builtString = ''
# loop over results
for k in results: # for each key in results dictionary
  for idx in results[k]: # for each index in list
    # build print string
    for i in range(0, len(term1)):
      builtString += s[idx+i*k]
    print(builtString)
    builtString = ''

עדנה
עדנה
עדנה
עדנה
עדנה
עדנה
יוול
לוקנ
קייר
מםיא
םדנו


In [None]:
# write no spaces string to file - only run once
text_file = open("NoSpacesTanach.dat", "w")
text_file.write(s)
text_file.close()

# delete intermediary datafile
! rm datafile

#see
! ls -la

total 2368
drwxr-xr-x 1 root root    4096 Jan 17 05:33 .
drwxr-xr-x 1 root root    4096 Jan 17 02:30 ..
drwxr-xr-x 1 root root    4096 Jan  8 17:11 .config
-rw-r--r-- 1 root root 2405776 Jan 17 05:33 NoSpacesTanach.dat


In [None]:
# download result file - only run once
from google.colab import files
files.download('NoSpacesTanach.dat')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>