Skip to content

Commit

Permalink
Merge pull request #1635 from Sefaria/normalizeddates
Browse files Browse the repository at this point in the history
Normalized dates
  • Loading branch information
akiva10b committed Sep 20, 2023
2 parents d866edb + fb3a85a commit 9f82072
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 202 deletions.
11 changes: 2 additions & 9 deletions sefaria/client/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,9 @@ def format_link_object_for_client(link, with_text, ref, pos=None):
com["sourceVersion"] = {"title": link.versions[linkPos]["title"], "language": link.versions[linkPos].get("language", None)}
com["displayedText"] = link.displayedText[linkPos] # we only want source displayedText

compDate = getattr(linkRef.index, "compDate", None)
compDate = getattr(linkRef.index, "compDate", None) # default comp date to in the future
if compDate:
try:
com["compDate"] = 3000
except ValueError:
com["compDate"] = 3000 # default comp date to in the future
try:
com["errorMargin"] = int(getattr(linkRef.index, "errorMargin", 0))
except ValueError:
com["errorMargin"] = 0
com["compDate"] = compDate

# Pad out the sections list, so that comparison between comment numbers are apples-to-apples
lsections = linkRef.sections[:] + [0] * (linkRef.index_node.depth - len(linkRef.sections))
Expand Down
28 changes: 10 additions & 18 deletions sefaria/model/garden.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,24 +441,16 @@ def _derive_metadata(self):
# Time
# This is similar to logic on Index.composition_time_period() refactor
if getattr(self, "start", None) is None or getattr(self, "end", None) is None:
if getattr(i, "compDate", None):
errorMargin = int(getattr(i, "errorMargin", 0))
self.startIsApprox = self.endIsApprox = errorMargin > 0
self.start = self.end = 3000
# try:
# year = int(getattr(i, "compDate"))
# self.start = year - errorMargin
# self.end = year + errorMargin
# except ValueError as e:
# years = getattr(i, "compDate").split("-")
# if years[0] == "" and len(years) == 3: #Fix for first value being negative
# years[0] = -int(years[1])
# years[1] = int(years[2])
# self.start = int(years[0]) - errorMargin
# self.end = int(years[1]) + errorMargin

elif author and author.mostAccurateTimePeriod():
tp = author.mostAccurateTimePeriod()
years = getattr(i, 'compDate', [])
if years and len(years) > 0:
self.startIsApprox = self.endIsApprox = getattr(i, "hasErrorMargin", False)
if len(years) > 1:
self.start = years[0]
self.end = years[1]
else:
self.start = self.end = years[0]
elif author and author.most_accurate_time_period():
tp = author.most_accurate_time_period()
self.start = tp.start
self.end = tp.end
self.startIsApprox = tp.startIsApprox
Expand Down
5 changes: 5 additions & 0 deletions sefaria/model/tests/text_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,11 @@ def test_invalid_index_save_no_category():
assert "You must create category Mishnah/Commentary/Bartenura/Gargamel before adding texts to it." in str(e_info.value)
assert model.IndexSet({"title": title}).count() == 0

def test_best_time_period():
    """best_time_period() prefers compDate and falls back to the author's time period."""
    i = model.library.get_index("Rashi on Genesis")
    # NOTE(review): the library presumably hands back a shared/cached Index, so the
    # mutation below is undone in `finally` to avoid leaking into other tests.
    original_comp_date = getattr(i, "compDate", None)
    try:
        assert i.best_time_period().period_string('en') == ' (c.1075 - c.1105 CE)'
        i.compDate = None
        # now that compDate is None, period_string should return Rashi's birth to death years
        assert i.best_time_period().period_string('en') == ' (1040 - 1105 CE)'
    finally:
        i.compDate = original_comp_date

def test_invalid_index_save_no_hebrew_collective_title():
title = 'Bartenura (The Next Generation)'
Expand Down
147 changes: 40 additions & 107 deletions sefaria/model/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,10 @@ class Index(abst.AbstractMongoRecord, AbstractIndex):
"enShortDesc",
"heShortDesc",
"pubDate",
"hasErrorMargin", # (bool) whether or not compDate is exact. used to be 'errorMargin' which was an integer amount that compDate was off by
"compDate",
"compPlace",
"pubPlace",
"errorMargin",
"era",
"dependence", # (str) Values: "Commentary" or "Targum" - to denote commentaries and other potential not standalone texts
"base_text_titles", # (list) the base book(s) this one is dependant on
Expand Down Expand Up @@ -307,30 +307,20 @@ def expand_metadata_on_contents(self, contents):
contents["base_text_titles"] = [{"en": btitle, "he": hebrew_term(btitle)} for btitle in self.base_text_titles]

contents["heCategories"] = list(map(hebrew_term, self.categories))
contents = self.time_period_and_place_contents(contents)
return contents


composition_time_period = self.composition_time_period()
if composition_time_period:
contents["compDateString"] = {
"en": composition_time_period.period_string("en"),
"he": composition_time_period.period_string("he"),
}


composition_place = self.composition_place()
if composition_place:
contents["compPlaceString"] = {
"en": composition_place.primary_name("en"),
"he": composition_place.primary_name("he"),
}

pub_place = self.publication_place()
if pub_place:
contents["pubPlaceString"] = {
"en": pub_place.primary_name("en"),
"he": pub_place.primary_name("he"),
}

def time_period_and_place_contents(self, contents):
    """
    Used to expand contents for date and time info.

    Adds a ``{"en": ..., "he": ...}`` entry to `contents` for each of
    "compDateString", "pubDateString", "compPlaceString" and "pubPlaceString"
    whose corresponding getter returns a truthy value; keys whose getter
    returns nothing are left untouched.

    :param contents: dict to extend; it is modified in place
    :return: the same `contents` dict
    """
    time_period_getters = [
        ("compDateString", self.composition_time_period),
        ("pubDateString", self.publication_time_period),
    ]
    for key, getter in time_period_getters:
        time_period = getter()
        if time_period:
            contents[key] = {"en": time_period.period_string('en'), 'he': time_period.period_string('he')}

    place_getters = [
        ("compPlaceString", self.composition_place),
        ("pubPlaceString", self.publication_place),
    ]
    for key, getter in place_getters:
        place = getter()
        if place:
            contents[key] = {"en": place.primary_name('en'), 'he': place.primary_name('he')}
    return contents

def _saveable_attrs(self):
Expand Down Expand Up @@ -444,93 +434,47 @@ def publication_place(self):

# This is similar to logic on GardenStop
def composition_time_period(self):
    """
    :return: the result of `_get_time_period` for `compDate`, with `hasErrorMargin`
    named as the field that marks the date as approximate.
    """
    return self._get_time_period("compDate", margin_field="hasErrorMargin")

def publication_time_period(self):
    """
    :return: the result of `_get_time_period` for `pubDate`; no margin field is
    passed, so `_get_time_period` is called with its default `margin_field`.
    """
    return self._get_time_period("pubDate")

def best_time_period(self):
    """
    :return: TimePeriod: First tries to return `compDate`.
    If no compDate or compDate is an empty list, _get_time_period returns None and it then looks at author info
    (the first entry of `author_objects()`); returns None when there is no author either.
    """
    comp_date_period = self._get_time_period('compDate', margin_field="hasErrorMargin")
    if comp_date_period:
        return comp_date_period

    # Fall back to the first listed author; fetch the author list once.
    authors = self.author_objects()
    author = authors[0] if len(authors) > 0 else None
    return author and author.most_accurate_time_period()

def _get_time_period(self, date_field, margin_field=""):
    """
    Build a TimePeriod from the years stored on `date_field`.

    Assumes that value of `date_field` ('pubDate' or 'compDate') is a list of integers.
    A one-item list is used as both start and end; a two-item list is [start, end].

    :param date_field: name of the attribute holding the list of years
    :param margin_field: optional name of the attribute flagging the date as approximate;
        its value is passed through as both `startIsApprox` and `endIsApprox`.
        When empty, or when the attribute is absent, the date is treated as exact.
    :return: a TimePeriod, or None when the date attribute is missing, None or empty
    :raises ValueError: if the list holds more than two entries (unpacking fails)
    """
    years = getattr(self, date_field, None)
    if not years:
        return None

    # Imported lazily, and only once there is actually a period to build.
    from . import timeperiod

    is_approx = getattr(self, margin_field, False) if margin_field else False
    if len(years) > 1:
        start, end = years
    else:
        start = end = years[0]
    return timeperiod.TimePeriod({
        "start": start,
        "startIsApprox": is_approx,
        "end": end,
        "endIsApprox": is_approx
    })

# Index changes behavior of load_from_dict, so this circumvents that changed behavior to call load_from_dict on the abstract superclass
def update_from_dict(self, d):
Expand Down Expand Up @@ -692,12 +636,6 @@ def _normalize(self):
for attr in deprecated_attrs:
if getattr(self, attr, None):
delattr(self, attr)
try:
error_margin_value = getattr(self, "errorMargin", 0)
int(error_margin_value)
except ValueError:
logger.warning("Index record '{}' has invalid 'errorMargin': {} field, removing".format(self.title, error_margin_value))
delattr(self, "errorMargin")

def _update_alt_structs_on_title_change(self):
old_title = self.pkeys_orig_values["title"]
Expand Down Expand Up @@ -758,11 +696,6 @@ def _validate(self):
if getattr(self, "collective_title", None) and not hebrew_term(getattr(self, "collective_title", None)):
raise InputError("You must add a hebrew translation Term for any new Collective Title: {}.".format(self.collective_title))

try:
int(getattr(self, "errorMargin", 0))
except (ValueError):
raise InputError("composition date error margin must be an integer")

#complex style records- all records should now conform to this
if self.nodes:
# Make sure that all primary titles match
Expand Down
28 changes: 16 additions & 12 deletions sefaria/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,17 @@ def remove_footnotes(cls, content):
content = AbstractTextRecord.strip_itags(content)
return content

@classmethod
def modify_text_in_doc(cls, content):
    """
    Clean up a segment's text before it is put in a search document.

    In order: passes `content` through `AbstractTextRecord.strip_imgs` and
    `cls.remove_footnotes`, calls `strip_cantillation` with `strip_vowels=False`
    and trims the ends, replaces every HTML tag and every parenthesized span
    with a space, and finally collapses runs of spaces into a single space.

    :param content: text of the segment
    :return: the cleaned text (may be an empty string)
    """
    content = AbstractTextRecord.strip_imgs(content)
    content = cls.remove_footnotes(content)
    content = strip_cantillation(content, strip_vowels=False).strip()
    content = re.sub(r'<[^>]+>', ' ', content)  # replace HTML tags with a space so that adjacent words are not run together
    content = re.sub(r'\([^)]+\)', ' ', content)  # remove all parenthesized text
    # Collapse runs of spaces in one pass. The previous loop tested for and replaced
    # a single space with a single space, so it could never terminate on text
    # containing a space.
    content = re.sub(r' {2,}', ' ', content)
    return content

@classmethod
def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories, hebrew_version_title):
"""
Expand All @@ -621,15 +632,8 @@ def make_text_index_document(cls, tref, heTref, version, lang, version_priority,
# Don't bother indexing if there's no content
if not content:
return False
content = AbstractTextRecord.strip_imgs(content)
content = cls.remove_footnotes(content)
content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
content_wo_cant = re.sub(r'<[^>]+>', ' ', content_wo_cant) # replace HTML tags with space so that words dont get smushed together
content_wo_cant = re.sub(r'\([^)]+\)', ' ', content_wo_cant) # remove all parens
while " " in content_wo_cant: # make sure there are not many spaces in a row
content_wo_cant = content_wo_cant.replace(" ", " ")

if len(content_wo_cant) == 0:
content = cls.modify_text_in_doc(content)
if len(content) == 0:
return False

oref = Ref(tref)
Expand Down Expand Up @@ -657,9 +661,9 @@ def make_text_index_document(cls, tref, heTref, version, lang, version_priority,
"path": "/".join(indexed_categories + [cls.curr_index.title]),
"pagesheetrank": pagesheetrank,
"comp_date": comp_start_date,
#"hebmorph_semi_exact": content_wo_cant,
"exact": content_wo_cant,
"naive_lemmatizer": content_wo_cant,
#"hebmorph_semi_exact": content,
"exact": content,
"naive_lemmatizer": content,
'hebrew_version_title': hebrew_version_title,
}

Expand Down
9 changes: 3 additions & 6 deletions sefaria/tests/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ def test_make_text_index_document():

ref_data = RefData().load({"ref": tref})
pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGESHEETRANK
content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
content_wo_cant = re.sub(r'<[^>]+>', '', content_wo_cant)
content_wo_cant = re.sub(r'\([^)]+\)', '', content_wo_cant)

content = TI.modify_text_in_doc(content)
assert doc == {
"ref": tref,
"heRef": he_ref,
Expand All @@ -40,8 +37,8 @@ def test_make_text_index_document():
"path": "/".join(categories + [index.title]),
"pagesheetrank": pagesheetrank,
"comp_date": comp_date,
"exact": content_wo_cant,
"naive_lemmatizer": content_wo_cant,
"exact": content,
"naive_lemmatizer": content,
'hebrew_version_title': heVtitle,

}
Expand Down
20 changes: 4 additions & 16 deletions static/js/AboutBox.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ class AboutBox extends Component {
authorsElems[lang] = authorArray.map((author, iauthor) => <span>{iauthor > 0 ? ", " : ""}<a key={author.slug} href={`/topics/${author.slug}`}>{author[lang]}</a></span> );
}
}
// use compPlaceString and compDateString if available. then use compPlace o/w use pubPlace o/w nothing
let placeTextEn, placeTextHe;
if (d.compPlaceString) {
placeTextEn = d.compPlaceString.en;
Expand All @@ -131,22 +130,11 @@ class AboutBox extends Component {
if (d.compDateString) {
dateTextEn = d.compDateString.en;
dateTextHe = d.compDateString.he
} else if (d.compDate) {
if (d.errorMargin !== 0) {
//I don't think there are any texts which are mixed BCE/CE
const lowerDate = Math.abs(d.compDate - d.errorMargin);
const upperDate = Math.abs(d.compDate - d.errorMargin);
dateTextEn = `(c.${lowerDate} - c.${upperDate} ${d.compDate < 0 ? "BCE" : "CE"})`;
dateTextHe = `(${lowerDate} - ${upperDate} ${d.compDate < 0 ? 'לפנה"ס בקירוב' : 'לספירה בקירוב'})`;
} else {
dateTextEn = `(${Math.abs(d.compDate)} ${d.compDate < 0 ? "BCE" : "CE"})`;
dateTextHe = `(${Math.abs(d.compDate)} ${d.compDate < 0 ? 'לפנה"ס בקירוב' : 'לספירה בקירוב'})`;
}
} else if (d.pubDate) {
dateTextEn = `(${Math.abs(d.pubDate)} ${d.pubDate < 0 ? "BCE" : "CE"})`;
dateTextHe = `(${Math.abs(d.pubDate)} ${d.pubDate < 0 ? 'לפנה"ס בקירוב' : 'לספירה בקירוב'})`;
} else if (d.pubDateString) {
dateTextEn = d.pubDateString.en;
dateTextHe = d.pubDateString.he;
}
const bookPageUrl = "/" + Sefaria.normRef(d.title);
const bookPageUrl = "/" + Sefaria.normRef(d.title); //comment for the sake of commit
detailSection = (
<div className="detailsSection sans-serif">
<h2 className="aboutHeader">
Expand Down

0 comments on commit 9f82072

Please sign in to comment.