## Dates and Calendars

In [7]:
import pandas as pd
import altair as alt

from utils import pgp_csv_paths, chart_dir

documents = pd.read_csv(pgp_csv_paths["documents"])

In [8]:
# limit to documents with standard date OR inferred date information
dated_docs = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()].copy()
dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard']]

Unnamed: 0,doc_date_standard,doc_date_original,doc_date_calendar,inferred_date_standard
5,1259,1570,Seleucid,
8,,,,1089/1190
16,1116-03-05,19 Adar 1427,Seleucid,
17,0977-06-21/0977-07-19,Tammuz 1288,Seleucid,
23,1025-08-28/1026-09-14,1337,Seleucid,
...,...,...,...,...
35108,,,,1200/1230
35109,,,,1200/1250
35110,,,,1200/1250
35115,,,,1160/1171


In [9]:
# collect dates into a single field
dated_docs['date'] = dated_docs.apply(lambda x: x.doc_date_standard.strip() if pd.notna(x.doc_date_standard) else x.inferred_date_standard.strip(), axis=1)
dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date']]

Unnamed: 0,doc_date_standard,doc_date_original,doc_date_calendar,inferred_date_standard,date
5,1259,1570,Seleucid,,1259
8,,,,1089/1190,1089/1190
16,1116-03-05,19 Adar 1427,Seleucid,,1116-03-05
17,0977-06-21/0977-07-19,Tammuz 1288,Seleucid,,0977-06-21/0977-07-19
23,1025-08-28/1026-09-14,1337,Seleucid,,1025-08-28/1026-09-14
...,...,...,...,...,...
35108,,,,1200/1230,1200/1230
35109,,,,1200/1250,1200/1250
35110,,,,1200/1250,1200/1250
35115,,,,1160/1171,1160/1171


In [10]:
dated_docs[dated_docs.date.str.contains("1217-02-29")][['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date']]

Unnamed: 0,doc_date_standard,doc_date_original,doc_date_calendar,inferred_date_standard,date
3190,1217-02-20/1217-02-29,middle decade of Adar 1528,Seleucid,,1217-02-20/1217-02-29


In [11]:
dated_docs[dated_docs.date.str.contains("1139/")][['pgpid', 'doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date']]

Unnamed: 0,pgpid,doc_date_standard,doc_date_original,doc_date_calendar,inferred_date_standard,date
3780,4666,1139/1140,1451.0,Seleucid,,1139/1140
6011,7885,,,,1139/1140,1139/1140
7054,9104,,,,1139/1140,1139/1140


In [12]:
# parse with undate
from undate import Undate, UndateInterval
from lark.exceptions import VisitError

def parse_date(datestr):
  # potentially multiple values
  if ';' in datestr:
    print(f"multiple inferred dates: {datestr}")
    all_dates = [parse_date(d) for d in datestr.split(';')]
    for d in all_dates:
      print(f"{d}: {d.duration().days} days")
    # earliest = min(all_dates)
    # latest = max(all_dates)
    try:
      earliest = min([d.earliest for d in all_dates])
      latest = max([d.latest for d in all_dates])
      print(f"all dates: {all_dates} earliest {earliest} latest {latest}")
  #    range = UndateInterval(Undate.to_undate(earliest), Undate.to_undate(latest))
    # can we just cheat and use first/last
      range = UndateInterval(Undate(earliest.year, earliest.month, earliest.day), Undate(latest.year, latest.month, latest.day))
      print(f'range {range}')
      return range
    except:
      print("error sorting to determine outer interval")
  else:
    try:
      return Undate.parse(datestr.strip(), "EDTF")
    except (ValueError, VisitError) as err:
      # special cases
      if datestr.endswith("-02-29"):
        print(f"parse error on {datestr}, parsing as --02-28")
        return parse_date(datestr.replace("-02-29", "-02-28"))
      if "/" in datestr:
        parts = datestr.split("/")
        # data entry error: 1139/1139 ; corrected in db, but use first for now
        if parts[0] == parts[1]:
          return parse_date(parts[0])
      print(err)

dated_docs['undate'] = dated_docs.date.apply(parse_date) # lambda x: Undate.parse(x, 'EDTF'))
dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date', 'undate']].head(10)

multiple inferred dates: 1047-06 ; 1048-08-09
1047-06: 30 days
1048-08-09: 1 days
all dates: [<Undate 1047-06 (Gregorian)>, <Undate 1048-08-09 (Gregorian)>] earliest 1047-06-01 latest 1048-08-09
range 1047-06-01/1048-08-09
parse error on 1217-02-20/1217-02-29, parsing as --02-28
multiple inferred dates: 1000 ; 1000/1055
1000: 365 days
1000/1055: 20453 days
error sorting to determine outer interval
parse error on 1747-02-29, parsing as --02-28


Unnamed: 0,doc_date_standard,doc_date_original,doc_date_calendar,inferred_date_standard,date,undate
5,1259,1570,Seleucid,,1259,1259
8,,,,1089/1190,1089/1190,1089/1190
16,1116-03-05,19 Adar 1427,Seleucid,,1116-03-05,1116-03-05
17,0977-06-21/0977-07-19,Tammuz 1288,Seleucid,,0977-06-21/0977-07-19,0977-06-21/0977-07-19
23,1025-08-28/1026-09-14,1337,Seleucid,,1025-08-28/1026-09-14,1025-08-28/1026-09-14
35,,,,1080/1100,1080/1100,1080/1100
36,1131,,,,1131,1131
41,1188-12-07,"Wednesday, 15 Kislev 1500",Seleucid,,1188-12-07,1188-12-07
43,1236-11-30/1236-12-28,Tevet 1548,Seleucid,,1236-11-30/1236-12-28,1236-11-30/1236-12-28
46,,,,1028,1028,1028


In [13]:
import numpy as np

def undate_earliest(und):
  if isinstance(und, UndateInterval):
    return und.earliest.earliest
  elif isinstance(und, Undate):
    return und.earliest

def undate_latest(und):
  if isinstance(und, UndateInterval):
    return und.latest.earliest
  elif isinstance(und, Undate):
    return und.latest


# get earliest, latest, midpoint, and convert to numpy datetime for graphing
dated_docs['undate_earliest'] = dated_docs.undate.apply(undate_earliest).astype('datetime64[ms]')
dated_docs['undate_latest'] = dated_docs.undate.apply(undate_latest).astype('datetime64[ms]')
dated_docs['undate_midpoint'] = dated_docs.apply(lambda row: (row.undate_earliest + (row.undate_latest - row.undate_earliest) / 2.0) if pd.notna(row.undate) else None, axis=1).astype('datetime64[ms]')
dated_docs['date_source'] = dated_docs.apply(lambda row: "On document" if pd.notna(row.doc_date_standard) else "Inferred", axis=1)


dated_docs[['doc_date_standard', 'doc_date_original', 'doc_date_calendar', 'inferred_date_standard', 'date', 'undate', 'undate_earliest', 'undate_latest', 'undate_midpoint', 'date_source']].head(10)

Unnamed: 0,doc_date_standard,doc_date_original,doc_date_calendar,inferred_date_standard,date,undate,undate_earliest,undate_latest,undate_midpoint,date_source
5,1259,1570,Seleucid,,1259,1259,1259-01-01,1259-12-31,1259-07-02 00:00:00,On document
8,,,,1089/1190,1089/1190,1089/1190,1089-01-01,1190-01-01,1139-07-03 12:00:00,Inferred
16,1116-03-05,19 Adar 1427,Seleucid,,1116-03-05,1116-03-05,1116-03-05,1116-03-05,1116-03-05 00:00:00,On document
17,0977-06-21/0977-07-19,Tammuz 1288,Seleucid,,0977-06-21/0977-07-19,0977-06-21/0977-07-19,977-06-21,977-07-19,977-07-05 00:00:00,On document
23,1025-08-28/1026-09-14,1337,Seleucid,,1025-08-28/1026-09-14,1025-08-28/1026-09-14,1025-08-28,1026-09-14,1026-03-07 00:00:00,On document
35,,,,1080/1100,1080/1100,1080/1100,1080-01-01,1100-01-01,1089-12-31 12:00:00,Inferred
36,1131,,,,1131,1131,1131-01-01,1131-12-31,1131-07-02 00:00:00,On document
41,1188-12-07,"Wednesday, 15 Kislev 1500",Seleucid,,1188-12-07,1188-12-07,1188-12-07,1188-12-07,1188-12-07 00:00:00,On document
43,1236-11-30/1236-12-28,Tevet 1548,Seleucid,,1236-11-30/1236-12-28,1236-11-30/1236-12-28,1236-11-30,1236-12-28,1236-12-14 00:00:00,On document
46,,,,1028,1028,1028,1028-01-01,1028-12-31,1028-07-01 12:00:00,Inferred


In [14]:
bar_chart = alt.Chart(dated_docs[['pgpid', 'undate_earliest', 'undate_latest', 'date_source']].sort_values('date_source')).mark_bar(opacity=0.15).encode(
    x=alt.X('undate_earliest', title="Year"), #, axis=alt.Axis(format="r")),
    x2='undate_latest',
    y=alt.Y('count(pgpid)', title='Documents'),
    color=alt.Color("date_source", title="Dating").scale(domain=['On document', 'Inferred']),
).properties(width=900, height=175)

line_chart = alt.Chart(dated_docs[['pgpid', 'undate_midpoint']]).mark_line(opacity=0.6, color="green", interpolate="monotone").encode(
 x=alt.X('undate_midpoint:T', title="Year"), # axis=alt.Axis(format="r")),
 y=alt.Y('count(pgpid)', title='Documents')
).properties(width=900, height=175)


combined_dating_chart = (line_chart + bar_chart).configure_legend(
    strokeColor='gray',
    fillColor='white', #EEEEEE',
    padding=10,
    cornerRadius=5,
    orient='top-left'
)

combined_dating_chart.save(f'{chart_dir}/combined_dating.pdf')
combined_dating_chart

In [15]:
# graph documents with calendars

date_docs_cal = dated_docs[dated_docs.doc_date_standard.notna()]

dated_docs_cal = date_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})
dated_docs_cal['midpoint_year'] = dated_docs_cal.undate_midpoint.apply(lambda x: x.year)

docs_calendars_charts = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(
  x=alt.X('midpoint_year', title="CE Year (Julian/Gregorian)", bin=alt.Bin(maxbins=120), axis=alt.Axis(format="r")),
  y=alt.Y('count(pgpid)', title='Documents'),
  color=alt.Y("doc_date_calendar", title="Calendar")
).properties(width=900, height=200,
            #  title={
            #      "text": "Documents by date and original calendar",
            #    "subtitle": "(Graphed based on date midpoint for uncertain dates and date ranges)"
            #  }
             ).configure_legend(
    strokeColor='gray',
    fillColor='white', #EEEEEE',
    padding=10,
    cornerRadius=5,
    orient='top-left'
)

docs_calendars_charts.save(f"{chart_dir}/dated_docs_by_cal.pdf")
docs_calendars_charts