Merge pull request #1635 from Sefaria/normalizeddates

Normalized dates
Sefaria · Sep 20, 2023 · 9f82072 · 9f82072
2 parents d866edb + fb3a85a
commit 9f82072
Show file tree

Hide file tree

Showing 8 changed files with 97 additions and 202 deletions.
diff --git a/sefaria/client/wrapper.py b/sefaria/client/wrapper.py
@@ -54,16 +54,9 @@ def format_link_object_for_client(link, with_text, ref, pos=None):
         com["sourceVersion"] = {"title": link.versions[linkPos]["title"], "language": link.versions[linkPos].get("language", None)}
         com["displayedText"] = link.displayedText[linkPos]  # we only want source displayedText
 
-    compDate = getattr(linkRef.index, "compDate", None)
+    compDate = getattr(linkRef.index, "compDate", None)  # None when the index has no compDate
     if compDate:
-        try:
-            com["compDate"] = 3000
-        except ValueError:
-            com["compDate"] = 3000  # default comp date to in the future
-        try:
-            com["errorMargin"] = int(getattr(linkRef.index, "errorMargin", 0))
-        except ValueError:
-            com["errorMargin"] = 0
+        com["compDate"] = compDate
 
     # Pad out the sections list, so that comparison between comment numbers are apples-to-apples
     lsections = linkRef.sections[:] + [0] * (linkRef.index_node.depth - len(linkRef.sections))

diff --git a/sefaria/model/garden.py b/sefaria/model/garden.py
@@ -441,24 +441,16 @@ def _derive_metadata(self):
         # Time
         # This is similar to logic on Index.composition_time_period() refactor
         if getattr(self, "start", None) is None or getattr(self, "end", None) is None:
-            if getattr(i, "compDate", None):
-                errorMargin = int(getattr(i, "errorMargin", 0))
-                self.startIsApprox = self.endIsApprox = errorMargin > 0
-                self.start = self.end = 3000
-                # try:
-                #     year = int(getattr(i, "compDate"))
-                #     self.start = year - errorMargin
-                #     self.end = year + errorMargin
-                # except ValueError as e:
-                #     years = getattr(i, "compDate").split("-")
-                #     if years[0] == "" and len(years) == 3:  #Fix for first value being negative
-                #         years[0] = -int(years[1])
-                #         years[1] = int(years[2])
-                #     self.start = int(years[0]) - errorMargin
-                #     self.end = int(years[1]) + errorMargin
-
-            elif author and author.mostAccurateTimePeriod():
-                tp = author.mostAccurateTimePeriod()
+            years = getattr(i, 'compDate', [])
+            if years and len(years) > 0:
+                self.startIsApprox = self.endIsApprox = getattr(i, "hasErrorMargin", False)
+                if len(years) > 1:
+                    self.start = years[0]
+                    self.end = years[1]
+                else:
+                    self.start = self.end = years[0]
+            elif author and author.most_accurate_time_period():
+                tp = author.most_accurate_time_period()
                 self.start = tp.start
                 self.end = tp.end
                 self.startIsApprox = tp.startIsApprox

diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py
@@ -167,6 +167,11 @@ def test_invalid_index_save_no_category():
     assert "You must create category Mishnah/Commentary/Bartenura/Gargamel before adding texts to it." in str(e_info.value)
     assert model.IndexSet({"title": title}).count() == 0
 
+def test_best_time_period():
+    i = model.library.get_index("Rashi on Genesis")
+    assert i.best_time_period().period_string('en') == ' (c.1075  - c.1105 CE)'
+    i.compDate = None
+    assert i.best_time_period().period_string('en') == ' (1040  - 1105 CE)'  # now that compDate is None, period_string should return Rashi's birth to death years
 
 def test_invalid_index_save_no_hebrew_collective_title():
     title = 'Bartenura (The Next Generation)'

diff --git a/sefaria/model/text.py b/sefaria/model/text.py
@@ -200,10 +200,10 @@ class Index(abst.AbstractMongoRecord, AbstractIndex):
         "enShortDesc",
         "heShortDesc",
         "pubDate",
+        "hasErrorMargin",     # (bool) True when compDate is approximate rather than exact; replaces 'errorMargin', which was the integer number of years compDate might be off by
         "compDate",
         "compPlace",
         "pubPlace",
-        "errorMargin",
         "era",
         "dependence",           # (str) Values: "Commentary" or "Targum" - to denote commentaries and other potential not standalone texts
         "base_text_titles",     # (list) the base book(s) this one is dependant on
@@ -307,30 +307,20 @@ def expand_metadata_on_contents(self, contents):
             contents["base_text_titles"] = [{"en": btitle, "he": hebrew_term(btitle)} for btitle in self.base_text_titles]
 
         contents["heCategories"] = list(map(hebrew_term, self.categories))
+        contents = self.time_period_and_place_contents(contents)
+        return contents
 
-
-        composition_time_period = self.composition_time_period()
-        if composition_time_period:
-            contents["compDateString"] = {
-                "en": composition_time_period.period_string("en"),
-                "he": composition_time_period.period_string("he"),
-            }
-
-
-        composition_place = self.composition_place()
-        if composition_place:
-            contents["compPlaceString"] = {
-                "en": composition_place.primary_name("en"),
-                "he": composition_place.primary_name("he"),
-            }
-
-        pub_place = self.publication_place()
-        if pub_place:
-            contents["pubPlaceString"] = {
-                "en": pub_place.primary_name("en"),
-                "he": pub_place.primary_name("he"),
-            }
-
+    def time_period_and_place_contents(self, contents):
+        """ Used to expand contents for date and time info """
+        for k, f in [("compDateString", self.composition_time_period), ("pubDateString", self.publication_time_period)]:
+            time_period = f()
+            if time_period:
+                contents[k] = {"en": time_period.period_string('en'), 'he': time_period.period_string('he')}
+
+        for k, f in [("compPlaceString", self.composition_place), ("pubPlaceString", self.publication_place)]:
+            place = f()
+            if place:
+                contents[k] = {"en": place.primary_name('en'), 'he': place.primary_name('he')}
         return contents
 
     def _saveable_attrs(self):
@@ -444,93 +434,47 @@ def publication_place(self):
 
     # This is similar to logic on GardenStop
     def composition_time_period(self):
-        return self._get_time_period("compDate", "errorMargin")
+        return self._get_time_period("compDate", margin_field="hasErrorMargin")
 
     def publication_time_period(self):
         return self._get_time_period("pubDate")
 
     def best_time_period(self):
         """
-        :return: TimePeriod: First tries to return `compDate`. Deals with ranges and negative values for compDate
-        If no compDate, looks at author info
-        """
-        start, end, startIsApprox, endIsApprox = None, None, None, None
-
-        if getattr(self, "compDate", None):
-            errorMargin = int(getattr(self, "errorMargin", 0))
-            self.startIsApprox = self.endIsApprox = errorMargin > 0
-            start = end = 3000
-            # try:
-            #     year = int(getattr(self, "compDate"))
-            #     start = year - errorMargin
-            #     end = year + errorMargin
-            # except ValueError as e:
-            #     years = getattr(self, "compDate").split("-")
-            #     if years[0] == "" and len(years) == 3:  #Fix for first value being negative
-            #         years[0] = -int(years[1])
-            #         years[1] = int(years[2])
-            #     try:
-            #         start = int(years[0]) - errorMargin
-            #         end = int(years[1]) + errorMargin
-            #     except UnicodeEncodeError as e:
-            #         pass
-
+        :return: TimePeriod: First tries to return `compDate`.
+        If no compDate or compDate is an empty list, _get_time_period returns None and it then looks at author info
+        """
+        compDatePeriod = self._get_time_period('compDate', margin_field="hasErrorMargin")
+        if compDatePeriod:
+            return compDatePeriod
         else:
             author = self.author_objects()[0] if len(self.author_objects()) > 0 else None
             tp = author and author.most_accurate_time_period()
-            if tp is not None:
-                tpvars = vars(tp)
-                start = tp.start if "start" in tpvars else None
-                end = tp.end if "end" in tpvars else None
-                startIsApprox = tp.startIsApprox if "startIsApprox" in tpvars else None
-                endIsApprox = tp.endIsApprox if "endIsApprox" in tpvars else None
-
-        if not start is None:
-            from sefaria.model.timeperiod import TimePeriod
-            if not startIsApprox is None:
-                return TimePeriod({
-                    "start": start,
-                    "end": end,
-                    "startIsApprox": startIsApprox,
-                    "endIsApprox": endIsApprox
-                })
-            else:
-                return TimePeriod({
-                    "start": start,
-                    "end": end
-                })
+            return tp
 
-    def _get_time_period(self, date_field, margin_field=None):
+    def _get_time_period(self, date_field, margin_field=""):
+        """
+        Assumes that value of `date_field` ('pubDate' or 'compDate') is a list of integers.
+        """
         from . import timeperiod
-        if not getattr(self, date_field, None):
+        years = getattr(self, date_field, [])
+        if years is None or len(years) == 0:
             return None
-
         try:
-            error_margin = int(getattr(self, margin_field, 0)) if margin_field else 0
+            error_margin = getattr(self, margin_field, False) if margin_field else False
         except ValueError:
-            error_margin = 0
-        startIsApprox = endIsApprox = error_margin > 0
-        start = end = 3000
-        # try:
-        #     year = int(getattr(self, date_field))
-        #     start = year - error_margin
-        #     end = year + error_margin
-        # except ValueError as e:
-        #     try:
-        #         years = getattr(self, date_field).split("-")
-        #         if years[0] == "" and len(years) == 3:  #Fix for first value being negative
-        #             years[0] = -int(years[1])
-        #             years[1] = int(years[2])
-        #         start = int(years[0]) - error_margin
-        #         end = int(years[1]) + error_margin
-        #     except ValueError as e:
-        #         return None
+            error_margin = False
+        startIsApprox = endIsApprox = error_margin
+        if len(years) > 1:
+            start, end = years
+        else:
+            start = end = years[0]
         return timeperiod.TimePeriod({
-            "start": start,
-            "startIsApprox": startIsApprox,
-            "end": end,
-            "endIsApprox": endIsApprox
-        })
+        "start": start,
+        "startIsApprox": startIsApprox,
+        "end": end,
+        "endIsApprox": endIsApprox
+    })
 
     # Index changes behavior of load_from_dict, so this circumvents that changed behavior to call load_from_dict on the abstract superclass
     def update_from_dict(self, d):
@@ -692,12 +636,6 @@ def _normalize(self):
         for attr in deprecated_attrs:
             if getattr(self, attr, None):
                 delattr(self, attr)
-        try:
-            error_margin_value = getattr(self, "errorMargin", 0)
-            int(error_margin_value)
-        except ValueError:
-            logger.warning("Index record '{}' has invalid 'errorMargin': {} field, removing".format(self.title, error_margin_value))
-            delattr(self, "errorMargin")
 
     def _update_alt_structs_on_title_change(self):
         old_title = self.pkeys_orig_values["title"]
@@ -758,11 +696,6 @@ def _validate(self):
         if getattr(self, "collective_title", None) and not hebrew_term(getattr(self, "collective_title", None)):
             raise InputError("You must add a hebrew translation Term for any new Collective Title: {}.".format(self.collective_title))
 
-        try:
-            int(getattr(self, "errorMargin", 0))
-        except (ValueError):
-            raise InputError("composition date error margin must be an integer")
-
         #complex style records- all records should now conform to this
         if self.nodes:
             # Make sure that all primary titles match

diff --git a/sefaria/search.py b/sefaria/search.py
@@ -613,6 +613,17 @@ def remove_footnotes(cls, content):
             content = AbstractTextRecord.strip_itags(content)
             return content
 
+    @classmethod
+    def modify_text_in_doc(cls, content):
+        content = AbstractTextRecord.strip_imgs(content)
+        content = cls.remove_footnotes(content)
+        content = strip_cantillation(content, strip_vowels=False).strip()
+        content = re.sub(r'<[^>]+>', ' ', content)     # replace HTML tags with a space so that adjacent words don't run together
+        content = re.sub(r'\([^)]+\)', ' ', content)   # replace each parenthesized span (the parens and their contents) with a space
+        while "  " in content:                                 # collapse runs of spaces into a single space
+            content = content.replace("  ", " ")
+        return content
+
     @classmethod
     def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories, hebrew_version_title):
         """
@@ -621,15 +632,8 @@ def make_text_index_document(cls, tref, heTref, version, lang, version_priority,
         # Don't bother indexing if there's no content
         if not content:
             return False
-        content = AbstractTextRecord.strip_imgs(content)
-        content = cls.remove_footnotes(content)
-        content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
-        content_wo_cant = re.sub(r'<[^>]+>', ' ', content_wo_cant)     # replace HTML tags with space so that words dont get smushed together
-        content_wo_cant = re.sub(r'\([^)]+\)', ' ', content_wo_cant)   # remove all parens
-        while "  " in content_wo_cant:                                 # make sure there are not many spaces in a row
-            content_wo_cant = content_wo_cant.replace("  ", " ")
-
-        if len(content_wo_cant) == 0:
+        content = cls.modify_text_in_doc(content)
+        if len(content) == 0:
             return False
 
         oref = Ref(tref)
@@ -657,9 +661,9 @@ def make_text_index_document(cls, tref, heTref, version, lang, version_priority,
             "path": "/".join(indexed_categories + [cls.curr_index.title]),
             "pagesheetrank": pagesheetrank,
             "comp_date": comp_start_date,
-            #"hebmorph_semi_exact": content_wo_cant,
-            "exact": content_wo_cant,
-            "naive_lemmatizer": content_wo_cant,
+            #"hebmorph_semi_exact": content,
+            "exact": content,
+            "naive_lemmatizer": content,
             'hebrew_version_title': hebrew_version_title,
         }
 

diff --git a/sefaria/tests/search.py b/sefaria/tests/search.py
@@ -24,10 +24,7 @@ def test_make_text_index_document():
 
     ref_data = RefData().load({"ref": tref})
     pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGESHEETRANK
-    content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
-    content_wo_cant = re.sub(r'<[^>]+>', '', content_wo_cant)
-    content_wo_cant = re.sub(r'\([^)]+\)', '', content_wo_cant)
-
+    content = TI.modify_text_in_doc(content)
     assert doc == {
         "ref": tref,
         "heRef": he_ref,
@@ -40,8 +37,8 @@ def test_make_text_index_document():
         "path": "/".join(categories + [index.title]),
         "pagesheetrank": pagesheetrank,
         "comp_date": comp_date,
-        "exact": content_wo_cant,
-        "naive_lemmatizer": content_wo_cant,
+        "exact": content,
+        "naive_lemmatizer": content,
         'hebrew_version_title': heVtitle,
 
     }

diff --git a/static/js/AboutBox.jsx b/static/js/AboutBox.jsx
@@ -114,7 +114,6 @@ class AboutBox extends Component {
           authorsElems[lang] = authorArray.map((author, iauthor) => <span>{iauthor > 0 ? ", " : ""}<a key={author.slug} href={`/topics/${author.slug}`}>{author[lang]}</a></span> );
         }
       }
-      // use compPlaceString and compDateString if available. then use compPlace o/w use pubPlace o/w nothing
       let placeTextEn, placeTextHe;
       if (d.compPlaceString) {
         placeTextEn = d.compPlaceString.en;
@@ -131,22 +130,11 @@ class AboutBox extends Component {
       if (d.compDateString) {
         dateTextEn = d.compDateString.en;
         dateTextHe = d.compDateString.he
-      } else if (d.compDate) {
-        if (d.errorMargin !== 0) {
-          //I don't think there are any texts which are mixed BCE/CE
-          const lowerDate = Math.abs(d.compDate - d.errorMargin);
-          const upperDate = Math.abs(d.compDate - d.errorMargin);
-          dateTextEn = `(c.${lowerDate} - c.${upperDate} ${d.compDate < 0 ? "BCE" : "CE"})`;
-          dateTextHe = `(${lowerDate} - ${upperDate} ${d.compDate < 0 ? 'לפנה"ס בקירוב' : 'לספירה בקירוב'})`;
-        } else {
-          dateTextEn = `(${Math.abs(d.compDate)} ${d.compDate < 0 ? "BCE" : "CE"})`;
-          dateTextHe = `(${Math.abs(d.compDate)} ${d.compDate < 0 ? 'לפנה"ס בקירוב' : 'לספירה בקירוב'})`;
-        }
-      } else if (d.pubDate) {
-        dateTextEn = `(${Math.abs(d.pubDate)} ${d.pubDate < 0 ? "BCE" : "CE"})`;
-        dateTextHe = `(${Math.abs(d.pubDate)} ${d.pubDate < 0 ? 'לפנה"ס בקירוב' : 'לספירה בקירוב'})`;
+      } else if (d.pubDateString) {
+        dateTextEn = d.pubDateString.en;
+        dateTextHe = d.pubDateString.he;
       }
-      const bookPageUrl = "/" + Sefaria.normRef(d.title);
+      const bookPageUrl = "/" + Sefaria.normRef(d.title);  //comment for the sake of commit
       detailSection = (
         <div className="detailsSection sans-serif">
           <h2 className="aboutHeader">