In [1]:
import sys; sys.path.append('..')
from osp import *

KeyboardInterrupt: 

In [None]:
# Location of the feature-score spreadsheet, built relative to PATH_DATA.
# NOTE(review): `os` and `PATH_DATA` are presumably supplied by `from osp import *` — confirm.
PATH_FEAT_SCORES = os.path.join(PATH_DATA, 'feat_weights.xlsx')

# Values of the `comparison` column to keep when the scores are loaded.
# Commented-out entries are alternatives that are currently switched off.
feat_comparisons = [
    # 'Philosophy vs Literature',
    'C20 Philosophy vs C20 Literature',
    # 'Synthese vs PMLA',
    # 'The Philosophical Review vs The Modern Language Review',
]

def get_feat_scores(fn=PATH_FEAT_SCORES, comparisons=feat_comparisons):
    """
    Load feature scores from an Excel file, restricted to selected comparisons.

    Parameters
    ----------
    fn : path to the spreadsheet, passed to ``pd.read_excel``.
    comparisons : collection of values; only rows whose ``comparison`` column
        is one of them are kept.

    Returns
    -------
    DataFrame indexed by the ``feature`` column, with the ``Unnamed: 0`` and
    ``Unnamed: 0.1`` columns removed when they are present.
    """
    df = pd.read_excel(fn)
    df = df.query('comparison in @comparisons')
    # Drop without `inplace=True`: the frame above is a filtered result, and mutating
    # it in place is what triggers pandas' SettingWithCopyWarning.
    # `errors='ignore'` skips whichever of the two columns is absent.
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
    return df.set_index('feature')

In [None]:
# Load the feature scores using the default path and comparison filter.
df_feats=get_feat_scores()
# Show the distinct feature names present in the loaded table.
df_feats.reset_index().feature.unique()

array(['pos_NN', 'pos_DT', 'pos_NNP', 'pos_NNS', 'pos_RB', 'pos_JJ',
       'pos_VBP', 'pos_VBZ', 'pos_VB', 'pos_IN', 'deprel_punct', 'pos_MD',
       'pos_FW', 'pos_SYM', 'pos_PRP', 'pos_EX', 'deprel_mark', 'pos_CC',
       'pos_GW', 'pos_CD', 'sent_height', 'pos_VBG', 'pos_UH', 'pos_VBN',
       'deprel_cop', 'pos_WRB', 'deprel_discourse', 'pos_WDT', 'pos_JJS',
       'pos_JJR', 'deprel_ccomp', 'pos_HYPH', 'deprel_goeswith',
       'deprel_vocative', 'deprel_cc', 'pos_WP', 'pos_RP', 'pos_PDT',
       'deprel_csubj', 'pos_RBR', 'deprel_acl', 'pos_NFP',
       'deprel_compound', 'deprel_dep', 'deprel_nummod', 'pos_RBS',
       'pos_AFX', 'deprel_fixed', 'deprel_orphan', 'deprel_obj',
       'deprel_dislocated', 'deprel_obl', 'deprel_iobj', 'pos_POS',
       'deprel_reparandum', 'pos_NNPS', 'deprel_aux', 'deprel_xcomp',
       'deprel_advcl', 'pos_TO', 'deprel_parataxis', 'deprel_nmod',
       'deprel_appos', 'pos_VBD', 'deprel_expl', 'deprel_amod',
       'deprel_case', 'deprel_advmod'

In [None]:
def get_slice_ids(id, n_slices=10,stash=None):
    """
    List the slice keys of a document that exist in a stash.

    Candidate keys have the form ``'{id}__NN'`` with ``NN`` running from 01 to
    ``n_slices`` (zero-padded to two digits). Only candidates found in ``stash``
    are returned, in ascending order. When ``stash`` is None the module-level
    ``STASH_SLICES_NLP`` is searched instead.
    """
    lookup = STASH_SLICES_NLP if stash is None else stash

    found = []
    for number in range(1, n_slices + 1):
        key = f'{id}__{number:02d}'
        if key in lookup:
            found.append(key)
    return found

In [None]:
# NOTE(review): `id` is not assigned anywhere in the visible notebook. Unless
# `from osp import *` provides it, this is Python's builtin `id` function, so the
# generated keys would be built from its repr and presumably match nothing — confirm
# which document id was intended.
slice_ids = get_slice_ids(id)
# Takes the first slice; raises IndexError when the list above is empty.
slice_id = slice_ids[0]

In [None]:
# Average every numeric column per feature, then add the base-10 log of the
# averaged odds ratio as `odds_ratio_log`.
df_feat_scores = (
    df_feats
    .groupby('feature')
    .mean(numeric_only=True)
    .assign(odds_ratio_log=lambda scores: np.log10(scores['odds_ratio']))
)
# Lookup table: feature name -> log10 odds ratio.
color_by_feat_score = {
    feature: log_odds
    for feature, log_odds in df_feat_scores['odds_ratio_log'].items()
}

In [None]:
def get_passage(slice_id, color_by=color_by_feat_score):
    """
    Print the sentences of a stored slice as plain text.

    The serialized document stored under ``slice_id`` in ``STASH_SLICES_NLP`` is
    deserialized with stanza; each sentence's text is printed, followed by a
    line of 100 dashes. Nothing is returned.

    ``color_by`` is accepted but not used by this function.
    """
    serialized = STASH_SLICES_NLP[slice_id]
    parsed = stanza.Document.from_serialized(serialized)

    rule = '-' * 100
    for sentence in parsed.sentences:
        print(sentence.text, rule, sep='\n')


In [None]:
from IPython.display import HTML
import html

def _score_to_colors(score):
    """Map a signed score to a ``(background, text)`` pair of CSS color strings."""
    # NaN compares unequal to itself; treat it as "no score" rather than letting
    # min()/max() silently turn it into full intensity.
    if score != score:
        score = 0
    # Clamp to [-1, 1] so the magnitude is always a valid CSS alpha value and a
    # score of exactly 0 always stays uncoloured.
    intensity = max(-1.0, min(1.0, score))

    if intensity > 0:
        # Positive score -> blue
        bg_color = f'rgba(0, 0, 255, {intensity:.2f})'
    elif intensity < 0:
        # Negative score -> orange
        bg_color = f'rgba(255, 165, 0, {-intensity:.2f})'
    else:
        return 'transparent', 'black'

    # Switch to white text once the background is more than half opaque.
    text_color = 'white' if abs(intensity) > 0.5 else 'black'
    return bg_color, text_color


def display_passage(slice_id, color_by=color_by_feat_score):
    """
    Render the passage stored under ``slice_id`` as annotated, colored HTML.

    Each word is shown with its ``POS/deprel`` label underneath. The word's
    score is the sum of ``color_by['pos_<POS>']`` and ``color_by['deprel_<deprel>']``
    (a missing key counts as 0), clamped to [-1, 1]. Positive scores get a blue
    background, negative scores an orange one, and the magnitude sets the opacity.

    Parameters
    ----------
    slice_id : key into ``STASH_SLICES_NLP``.
    color_by : mapping from feature name to numeric score.

    Returns
    -------
    An ``IPython.display.HTML`` object, or None (after printing a message) when
    ``slice_id`` is not in ``STASH_SLICES_NLP``.
    """
    if slice_id not in STASH_SLICES_NLP:
        print(f"Slice ID {slice_id} not found.")
        return None

    docstr = STASH_SLICES_NLP[slice_id]
    doc = stanza.Document.from_serialized(docstr)

    output_html = ['<div style="line-height: 2.8; font-family: sans-serif; padding: 10px;">']

    for sent in doc.sentences:
        sent_html = []
        for word in sent.words:
            # Match the feature naming convention used in the codebase (pos_TAG, deprel_REL)
            pos = word.xpos or word.upos
            deprel = word.deprel

            # Combine scores for the word's features
            score = color_by.get(f'pos_{pos}', 0) + color_by.get(f'deprel_{deprel}', 0)
            bg_color, text_color = _score_to_colors(score)

            # Escape everything that comes from the document before embedding it:
            # both the word itself and its tag label may contain markup characters.
            safe_text = html.escape(word.text)
            safe_label = html.escape(f'{pos}/{deprel}')
            word_span = (
                f'<span style="background-color: {bg_color}; color: {text_color}; font-size: 1.2em; '
                f'display: inline-block; text-align: center; vertical-align: top; line-height: 1.1; '
                f'padding: 0 2px;">'
                f'{safe_text}'
                f'<sub style="display: block; font-size: 0.5em; opacity: 0.7; line-height: 1;">'
                f'{safe_label}</sub>'
                f'</span>'
            )
            sent_html.append(word_span)

        output_html.append(f'<p style="margin-bottom: 25px;">{" ".join(sent_html)}</p>')

    output_html.append('</div>')
    return HTML("".join(output_html))

In [None]:
# Document ids whose slices are rendered in the cells below.
id1='phil/10.2307/20118780'
id2='lit/25614398'

In [None]:
# Render every stored slice of the first document as annotated HTML.
for psg in get_slice_ids(id1):
    display(display_passage(psg))

In [None]:
# Render every stored slice of the second document as annotated HTML.
for psg in get_slice_ids(id2):
    display(display_passage(psg))

In [None]:

# Print the plain-text sentences of one slice, addressed by its full slice key.
get_passage('phil/10.2307/20118780__01')


ANTONY GALTON OPERATORS VS.
----------------------------------------------------------------------------------------------------
ARGUMENTS: THE INS AND OUTS OF REIFICATION ABSTRACT.
----------------------------------------------------------------------------------------------------
INTRODUCTION Temporal Logic has been a fertile battle-ground for philosophical and technical debate, from abstruse metaphysical questions through technicalities in mathematical logic to practical issues concerning computer implementation.
----------------------------------------------------------------------------------------------------
One such debate concerns the general theme of reification.
----------------------------------------------------------------------------------------------------
This term, which literally means 'thing-making', refers to the use of terms in first-order logic to express concepts normally expressed using predicates, operators, or even complete propositions.
---------------------