# Load Required Libraries

In [None]:
import re
import pandas as pd
from gensim.models import Doc2Vec
import gensim

# Load corpus metadata (required for breaking the corpus into documents of the desired size)

In [None]:
# Read the block/item/toplevel mapping and trim surrounding whitespace from
# each identifier column so later dictionary lookups compare clean strings.
maptable = pd.read_csv("data/block-item-toplevel-map.csv")
maptable[["block", "item", "toplevel"]] = maptable[["block", "item", "toplevel"]].apply(
    lambda column: column.str.strip()
)

In [None]:
# Lookup tables built row by row: block id -> item id, and item id -> toplevel id.
# When an id occurs on several rows the last row wins.
block_item_dict = dict(maptable[["block", "item"]].itertuples(index=False, name=None))
item_toplevel_dict = dict(maptable[["item", "toplevel"]].itertuples(index=False, name=None))

In [None]:
# Read the block/article/toplevel mapping from the Aquinas article-blocks file
# and trim surrounding whitespace from each identifier column.
aamaptable = pd.read_csv("data/aquinasSTandScriptumArticleBlocks.csv")
aamaptable[["block", "article", "toplevel"]] = aamaptable[["block", "article", "toplevel"]].apply(
    lambda column: column.str.strip()
)

In [None]:
# Lookup tables built row by row: block id -> article id, and article id -> toplevel id.
# When an id occurs on several rows the last row wins.
aablock_item_dict = dict(aamaptable[["block", "article"]].itertuples(index=False, name=None))
aaitem_toplevel_dict = dict(aamaptable[["article", "toplevel"]].itertuples(index=False, name=None))

In [None]:
# Load the corpus table and collect the distinct, whitespace-trimmed
# values of its "topLevel" column.
text = pd.read_csv("data/alltext.csv")
toplevellist = (
    text["topLevel"]
    .str.strip()
    .unique()
)

# Normalize Text

In [None]:
# Built once instead of on every row: matches any run of characters that are
# not ASCII letters (punctuation, digits, whitespace, accented characters).
_NON_ALPHA_RUN = re.compile('[^a-zA-Z]+')
# Single-letter spelling folds, applied in one pass after lowercasing.
_LETTER_FOLDS = str.maketrans({"v": "u", "j": "i", "y": "i"})


def text_normalize(row):
    """Return a normalized copy of ``row["text"]``.

    Steps, in order:
      1. every run of non ASCII-letter characters becomes a single space;
      2. the text is lowercased;
      3. the digraphs "ae" and "oe" are reduced to "e";
      4. "v" becomes "u", and "j" and "y" become "i".

    Args:
        row: Mapping-like object (for example a DataFrame row) with a "text" entry.

    Returns:
        The normalized string, or "" when the row has no "text" entry or the
        entry is not a string (for example a missing value).
    """
    try:
        collapsed = _NON_ALPHA_RUN.sub(' ', row["text"])
    except (KeyError, TypeError):
        # Best effort: rows without usable text contribute an empty document.
        return ""

    lowered = collapsed.lower()
    # The single-letter folds never produce "a", "o" or "e", so reducing both
    # digraphs before folding yields the same result as interleaving them.
    reduced = lowered.replace("ae", "e").replace("oe", "e")
    return reduced.translate(_LETTER_FOLDS)
# Normalize every row of the corpus into a new "text_clean" column.
text["text_clean"] = text.apply(text_normalize, axis=1)

# Divide corpus into "Item" sized documents

In [None]:
def createDocsDict(block_text_dict, block_items=None, article_items=None,
                   exclude_prefixes=("UD1xh4-", "ee-", "dor5dc-"),
                   article_prefixes=("TAca84-", "ta-")):
    """Merge block-level texts into item-level documents.

    Each block id is mapped to the id of the item (or article) that contains
    it, and the texts of all blocks sharing an item id are joined with single
    spaces in the iteration order of ``block_text_dict``.

    Args:
        block_text_dict: Mapping of block id -> block text.
        block_items: Mapping of block id -> item id used for ordinary blocks.
            Defaults to the module-level ``block_item_dict``.
        article_items: Mapping of block id -> article id used for blocks whose
            id starts with one of ``article_prefixes``. Defaults to the
            module-level ``aablock_item_dict``.
        exclude_prefixes: Block-id prefixes to skip entirely. The defaults
            exclude texts that are not Latin and are an experimental part of
            the corpus.
        article_prefixes: Block-id prefixes routed through ``article_items``.

    Returns:
        Dict of item id -> concatenated text. Blocks with no (or an empty)
        entry in the relevant lookup table are dropped.
    """
    if block_items is None:
        block_items = block_item_dict
    if article_items is None:
        article_items = aablock_item_dict
    # str.startswith accepts a tuple of alternatives, not an arbitrary iterable.
    exclude_prefixes = tuple(exclude_prefixes)
    article_prefixes = tuple(article_prefixes)

    documents_dict = {}
    for key, value in block_text_dict.items():
        if key.startswith(exclude_prefixes):
            continue
        lookup = article_items if key.startswith(article_prefixes) else block_items
        itemid = lookup.get(key)
        if not itemid:
            continue
        current_value = documents_dict.get(itemid)
        # An empty existing value is replaced rather than extended, so a
        # document never starts with a stray separator.
        if current_value:
            documents_dict[itemid] = current_value + ' ' + value
        else:
            documents_dict[itemid] = value
    return documents_dict

In [None]:
# Map each block id to its normalized text, then merge blocks into item-sized documents.
block_text_dict = dict(text[["id", "text_clean"]].itertuples(index=False, name=None))
documents_dict = createDocsDict(block_text_dict)

# Create Dictionaries to look up documents by index number or by label

In [None]:
# Two-way lookup between a document's position in documents_dict (0-based,
# insertion order) and its label (the documents_dict key).
documents2label = dict(enumerate(documents_dict))
label2documents = {label: index for index, label in documents2label.items()}

In [None]:
# Document texts in documents_dict insertion order, i.e. the same order in
# which documents_dict's keys are iterated.
documents = [*documents_dict.values()]

# Create a Gensim Tagged Corpus

In [None]:
def tag_corpus():
    """Yield one gensim ``TaggedDocument`` per entry of ``documents``.

    Each text is tokenized with ``gensim.utils.simple_preprocess`` and tagged
    with its position in ``documents``.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    for position, doc_text in enumerate(documents):
        # add gensim tags: the document's position is its only tag
        yield make_tagged(gensim.utils.simple_preprocess(doc_text), [position])

# Materialize the generator into a list so the corpus can be indexed and
# iterated more than once. Tagging can take about a minute.
gensim_tagged_corpus = [*tag_corpus()]

# Load Model (either from saved model or new training)

In [None]:
def train_mode(vector_size=200, min_count=10, epochs=100):
    """Train a new Doc2Vec model on ``gensim_tagged_corpus``.

    Args:
        vector_size: Value passed to ``Doc2Vec`` as ``vector_size``.
        min_count: Value passed to ``Doc2Vec`` as ``min_count``.
        epochs: Value passed to ``Doc2Vec`` as ``epochs``; the same value
            (read back from the model) is used for the training call.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    model = gensim.models.doc2vec.Doc2Vec(
        vector_size=vector_size,
        min_count=min_count,
        epochs=epochs,
    )
    model.build_vocab(gensim_tagged_corpus)
    model.train(
        gensim_tagged_corpus,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )
    return model

In [None]:
def get_model(train=False):
    """Return a Doc2Vec model, freshly trained or loaded from disk.

    Args:
        train: When truthy, train a new model with ``train_mode()``; otherwise
            load the model saved at "../SCTACorpus-doc2vec.model".
    """
    if not train:
        return Doc2Vec.load("../SCTACorpus-doc2vec.model")
    return train_mode()

In [None]:
# Obtain the model without retraining; pass train=True to train a new one instead.
model = get_model(train=False)

# Evaluate Model

In [None]:
def reportMatchResult(model, doc_id, target_doc_id):
    """Rank an expected match among the documents most similar to a source.

    The source document's vector is inferred from its tokens in
    ``gensim_tagged_corpus`` and compared against every vector in ``model.dv``.

    Args:
        model: Model exposing ``infer_vector`` and ``dv.most_similar``.
        doc_id: Index of the source document in ``gensim_tagged_corpus``.
        target_doc_id: Tag of the document expected to match.

    Returns:
        A tuple ``(filtered_rank, unfiltered_rank, similarity)``:
          * ``filtered_rank`` - 0-based position of the target when only
            documents whose label starts with the target label's prefix (the
            text before its first "-") are counted;
          * ``unfiltered_rank`` - 0-based position of the target among all
            documents;
          * ``similarity`` - the similarity score reported for the target.

    Raises:
        IndexError: If the target never appears in the similarity list.
    """
    inferred_vector = model.infer_vector(gensim_tagged_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    # Computed once instead of once per candidate.
    target_prefix = documents2label[target_doc_id].split("-")[0]

    # Single pass: count same-prefix candidates ranked ahead of the target.
    filtered_rank = 0
    for unfiltered_rank, (tag, similarity) in enumerate(sims):
        if tag == target_doc_id:
            return (filtered_rank, unfiltered_rank, similarity)
        if documents2label[tag].startswith(target_prefix):
            filtered_rank += 1

    raise IndexError(f"target document {target_doc_id!r} not found among similar documents")

In [None]:
def reportTopResults(model, doc_id, topn=11):
    """Return the ``topn`` documents most similar to a source document.

    Args:
        model: Model exposing ``infer_vector`` and ``dv.most_similar``.
        doc_id: Index of the source document in ``gensim_tagged_corpus``.
        topn: Number of results to request from ``most_similar``.

    Returns:
        Whatever ``model.dv.most_similar`` returns for the inferred vector.
    """
    # The tagged corpus is named gensim_tagged_corpus in this file; the
    # previous reference to an undefined ``train_corpus`` raised NameError.
    inferred_vector = model.infer_vector(gensim_tagged_corpus[doc_id].words)
    return model.dv.most_similar([inferred_vector], topn=topn)

In [193]:
# this cell can take about 2 minutes to run
matches4 = pd.read_csv("data/aquinasSuggestedMatchesV4.csv")
matches6 = pd.read_csv("data/aquinasSuggestedMatchesV6.csv")
matches7 = pd.read_csv("data/aquinasSuggestedMatchesV7.csv")
matches = pd.concat([matches4, matches6, matches7])

# Score each suggested source/target pair, keyed as "<source>===<target>".
results = {}
for index, row in matches.iterrows():
    # Lower bound on the row index; raise it to skip the first rows of each file.
    if not (index >= 0):
        continue
    source, target, note = row["ST"], row["Match"], str(row["note"])
    # Rows whose note mentions "target" are left out of the evaluation.
    if "target" in note:
        continue
    result = reportMatchResult(model, label2documents[source], label2documents[target])
    key = source + "===" + target
    results[key] = result

resultsdf = pd.DataFrame.from_dict(
    results,
    orient="index",
    columns=["matchFiltered", "matchUnfiltered", "matchPerc"],
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/aquinasSuggestedMatches.csv'

In [None]:
# Summarize the match results: drop high outliers, then append per-column
# mean and median rows to resultsdf and show the end of the table.
pd.set_option("display.max_rows", 200)

# NOTE: despite the names, these are not quartiles. Q1 is the 0th quantile
# (the column minimum) and Q3 is the 90th percentile, so "IQR" is the
# min-to-90th-percentile spread of each column.
Q1 = resultsdf.quantile(0.0)
Q3 = resultsdf.quantile(0.90)
IQR = Q3 - Q1
# Drop every row in which ANY column exceeds its upper fence (Q3 + 1.5 * IQR);
# there is no lower fence, so only high values are treated as outliers.
maskdf = resultsdf[~(resultsdf > (Q3 + 1.5 * IQR)).any(axis=1)]

# Per-column mean and median of the rows that survived the outlier filter
mean_values = maskdf.mean()
median_values = maskdf.median()

# Store the summaries as rows labelled "mean" and "median" on the unfiltered table.
# NOTE: this mutates resultsdf, so re-running this cell without rebuilding
# resultsdf would feed the summary rows back into the statistics above.
resultsdf.loc["mean"] = mean_values
resultsdf.loc["median"] = median_values

# Display the last 11 rows, which include the two summary rows just added.
resultsdf.tail(11)