In [None]:
import pandas as pd
import numpy as np
import os
import time
import rtsvg
rt = rtsvg.RACETrack()
from ollama import chat
from ollama import ChatResponse
def promptModel(prompt, model):
    """Send ``prompt`` as a single user message to ``model`` and return the reply text.

    Args:
        prompt: Text placed in the 'content' field of the one user-role message.
        model:  Model identifier handed to ``chat``.

    Returns:
        The value stored under ``['message']['content']`` of the chat response.
    """
    user_message = {'role': 'user', 'content': prompt}
    reply: ChatResponse = chat(model=model, messages=[user_message])
    return reply['message']['content']

# Directory holding the article table; the per-model response caches are also built from this path.
_dir_ = '../../../data/2014_vast/MC1/'
# Article table as stored in articles.parquet; `df` and `df_articles` are two names for the same DataFrame.
df_articles = pd.read_parquet(_dir_ + 'articles.parquet')
df = df_articles
def extractPythonBlock(s):
    """Return the text of the first ```python fenced block in ``s``.

    The returned text is everything between the first '```python' marker and the
    next '```' after it (surrounding newlines are kept).  Returns None when either
    the opening marker or a subsequent closing marker is missing.
    """
    opening, closing = '```python', '```'
    if opening not in s:
        return None
    # Everything after the first opening fence.
    remainder = s.split(opening, 1)[1]
    if closing not in remainder:
        return None
    return remainder.split(closing, 1)[0]

In [2]:
# Short name (used in cache file names and response column names) -> model identifier passed to the chat call.
name_to_model = {
    'deep_seek': 'deepseek-r1:14b',
    'qwen25': 'qwen2.5:14b',
    'llama32': 'llama3.2',
    'phi4': 'phi4',
    'granite31dense': 'granite3.1-dense',
}

def saveFile(filename, name, response_lu, timing_lu):
    """Write one model's responses and timings to a parquet file.

    Previously this function ignored all of its arguments: it read the notebook
    globals ``_response_lu_`` / ``_timing_lu_``, always wrote to a fixed
    'INTERMEDIA_deep_seek_responses.parquet' path, and always named the response
    column 'deep_seek_response'.  It now honours its parameters so the file the
    caller asks for is actually created, with the column the caller later renames.

    Args:
        filename:    Destination parquet path.
        name:        Short model name; the response column is named '<name>_response'.
        response_lu: Mapping of article file -> model response text.
        timing_lu:   Mapping of article file -> seconds taken; must contain every
                     key of ``response_lu``.

    Raises:
        KeyError: if a key of ``response_lu`` is missing from ``timing_lu``.
    """
    files = list(response_lu)
    df_intermediate = pd.DataFrame({
        'file':             files,
        f'{name}_response': [response_lu[_f_] for _f_ in files],
        'time_taken':       [timing_lu[_f_] for _f_ in files],
    })
    df_intermediate.to_parquet(filename)

# Build df_responses: one row per (article file, model) holding the model's raw response and the
# time the request took.  Each model's results are cached in a parquet file under _dir_; the model
# is only queried when that cache file does not exist yet.
_dfs_ = []
for _name_, _model_ in name_to_model.items():
    _file_ = _dir_ + f'20250201_{_name_}_responses.parquet'
    if os.path.exists(_file_) == False:
        promptModel('What is 2+2?', _model_) # force the model to load to prevent it messing with the timing results
        # Per-model lookups keyed by article file name: response text and elapsed seconds.
        _response_lu_, _timing_lu_ = {}, {}
        total_files = df['file'].nunique()
        files_processed = 0
        for k, k_df in df.groupby('file'):
            if k not in _response_lu_:        # lookup starts empty and groupby yields each file once, so nothing is skipped here
                k_df             = k_df.sort_values('sentence_no').reset_index()
                # NOTE(review): sentences are concatenated with no separator here, while the
                # highlight cell later in this notebook joins them with ' ' -- confirm which is intended.
                _article_        = ''.join(k_df['sentence'])
                # Only the model call itself is timed.
                t0               = time.time()
                _response_       = promptModel(f'Extract all of the entities into a Python dictionary from the following article.  The dictionary pairing should indicate the entities type:\n\n{_article_}', _model_)
                _timing_lu_[k]   = time.time() - t0
                _response_lu_[k] = _response_
                print('.', end='')
                # Checkpoint on the first file and every 10th file after it.
                if files_processed % 10 == 0:
                    print(f'{files_processed}/{total_files}', end='')
                    saveFile(f'intermediate_{_name_}.parquet', _name_, _response_lu_, _timing_lu_)
            files_processed += 1
        # Final save: the read below expects saveFile to have created _file_.
        saveFile(_file_, _name_, _response_lu_, _timing_lu_)
    # Normalise the per-model response column ('<name>_response') to a common 'model_response' name.
    # NOTE(review): confirm the cached file really carries a '<name>_response' column.
    _df_          = pd.read_parquet(_file_).rename({f'{_name_}_response':'model_response'}, axis=1)
    _df_['model'] = _model_
    _dfs_.append(_df_)

# Stack all models into one frame; the index is not reset, so index labels repeat across models.
df_responses = pd.concat(_dfs_)

In [None]:
import ast

# Derived columns: response length, the extracted ```python block (None when absent),
# and a boolean flag recording whether such a block was found.
df_responses['model_response_len'] = df_responses['model_response'].str.len()
df_responses['python'] = df_responses['model_response'].apply(extractPythonBlock)
df_responses['python_exists'] = df_responses['python'].apply(lambda block: block is not None)

def extractDictionaryBlocks(_str_):
    """Return every brace-delimited chunk of ``_str_`` that looks like a multi-line dict.

    A chunk opens with '{' at the very start of the stripped text or at the start
    of a line ('\\n{'), and closes at the next '}' that starts a line ('\\n}').
    Chunks that open at a line start keep their leading newline.  Scanning resumes
    at the closing brace, and stops as soon as no further opener/closer pair exists.
    """
    text = _str_.strip()
    found = []
    cursor = 0
    while True:
        # An opener directly at the cursor only ever happens at position 0; after a
        # match the cursor sits on the previous closing '}'.
        if text.startswith('{', cursor):
            opener = cursor
        else:
            opener = text.find('\n{', cursor)
        if opener < 0:
            break
        closer = text.find('\n}', opener)
        if closer < 0:
            break
        found.append(text[opener:closer + 2])
        cursor = closer + 1
    return found

# Column-oriented accumulator filled by addEntityTyping: one entry per (entity, type) pair.
entity_to_types_lu = {'entity':[], 'type':[], 'file':[], 'model':[]}
def addEntityTyping(_dict_, _file_, _model_):
    """Append the entity/type pairs found in one parsed response to ``entity_to_types_lu``.

    Two dictionary shapes are understood:
      * key -> list : the key is the type and each list element is an entity
      * key -> str  : the key is the entity and the string value is its type
    Dict values and any other value type are not recorded; they are only printed
    (prefixed 'DICT' / 'UNKN') for inspection.

    Args:
        _dict_:  Parsed dictionary from a model response.
        _file_:  Article file the response belongs to.
        _model_: Model that produced the response.
    """
    def _record(entity, entity_type):
        # Entity and type are always stored as strings.
        entity_to_types_lu['entity'].append(str(entity))
        entity_to_types_lu['type'].append(str(entity_type))
        entity_to_types_lu['file'].append(_file_)
        entity_to_types_lu['model'].append(_model_)

    for k, v in _dict_.items():
        value_kind = type(v)
        if value_kind is list:
            for member in v:
                _record(member, k)
        elif value_kind is dict:
            print(f'DICT {k=} {v=}')
        elif value_kind is str:
            _record(k, v)
        else:
            print(f'UNKN {k=} {v=}')

# Classify every response by whether (and how) a dictionary could be parsed out of it, feeding
# each parsed dictionary to addEntityTyping along the way.
#
# Exactly ONE status is recorded per row.  The previous version appended inside the branches, so
# a python block that evaluated to something other than a list or dict (e.g. a set or tuple
# literal) got no status at all, and an exception raised after 'Evaled-List' had been appended
# produced two.  Either case made len(_status_) differ from len(df_responses) and broke the
# column assignment at the end of this block.
_status_ = []
for i in range(len(df_responses)):
    _row_   = df_responses.iloc[i]
    _file_  = _row_['file']
    _model_ = _row_['model']
    if _row_['python'] is None: # NOT A PYTHON BLOCK ... BUT MAYBE SOMETHING THAT CAN BE EVALUATED
        _dicts_maybe_ = extractDictionaryBlocks(_row_['model_response'])
        if len(_dicts_maybe_) > 0:
            _all_parsed_ = True
            for _possible_ in _dicts_maybe_:
                try:
                    _dict_ = ast.literal_eval(_possible_)
                    addEntityTyping(_dict_, _file_, _model_)
                except Exception: # malformed literal or non-dict value; KeyboardInterrupt is no longer swallowed
                    _all_parsed_ = False
            _row_status_ = 'Evaled - Non-Python Block' if _all_parsed_ else 'Exception - Non-Python Block'
        else:
            _row_status_ = 'None'
    else: # A PYTHON BLOCK ... BUT CAN IT BE SAFELY EVALED?
        # Strip the wrappers handled so far (an 'entities =' assignment, a trailing
        # 'print(entities)', a leading 'article_' prefix) so that only a literal remains.
        _str_ = _row_['python'].strip()
        if 'entities ='      in _str_: _str_ = _str_.replace('entities =', '')
        if 'print(entities)' in _str_: _str_ = _str_.replace('print(entities)', '')
        _article_str_ = 'article_'
        if _str_.startswith(_article_str_): _str_ = _str_[len(_article_str_):]
        _str_ = _str_.strip()
        try:
            _dictionary_ = ast.literal_eval(_str_)
            if type(_dictionary_) is list:
                for x in _dictionary_:
                    if type(x) is dict: addEntityTyping(x, _file_, _model_)
                    else:
                        print('---')
                        print(x)
                _row_status_ = 'Evaled-List'
            elif type(_dictionary_) is dict:
                addEntityTyping(_dictionary_, _file_, _model_)
                _row_status_ = 'Evaled'
            else:
                # literal_eval can also yield strings, numbers, tuples, sets, ... -- nothing to harvest.
                _row_status_ = 'Evaled-Other'
        except Exception:
            _row_status_ = 'Exception'
            # Kept for interactive debugging of the most recent failure.
            last_failed_str  = _str_
            last_failed_orig = _row_['python']
    _status_.append(_row_status_)
df_responses['python_status'] = _status_
# Per-model overview, left to right: time taken, response length, python-block presence,
# parse status per model, and overall parse-status distribution.
_h_ = 128
rt.tile([rt.histogram(df_responses, w=256, h=_h_, **_params_)
         for _params_ in ({'bin_by':'model',         'count_by':'time_taken',         'color_by':'model'},
                          {'bin_by':'model',         'count_by':'model_response_len', 'color_by':'model'},
                          {'bin_by':'model',                                          'color_by':'python_exists'},
                          {'bin_by':'model',                                          'color_by':'python_status'},
                          {'bin_by':'python_status',                                  'color_by':'python_status'})], spacer=10)

In [None]:
# Hand-built table: aggregate type -> the lowercase type labels that should be folded into it.
aggregate_type_to_entity_type = {
    'organization': ['organizations', 'company', 'companies', 'company/organization', 'org', 'orgs',
                     'group', 'groups', 'group/organization', 'organization/group'],
    'government': ['government', 'governments', 'country', 'countries', 'nation', 'nations', 'government entity'],
    'person': ['person', 'people', 'persons'],
    'event': ['event', 'events'],
    'location': ['location', 'locations', 'place', 'places', 'facility', 'facilities'],
    'date': ['date', 'dates'],
    'time': ['time', 'times', 'date/time'],
    'action': ['action', 'actions'],
    'object': ['object', 'objects'],
    'entity': ['entity', 'entities'],
    'topic': ['topics', 'topic', 'issue', 'issues'],
    'chemical/drugs': ['chemical', 'chemicals', 'drug', 'drugs', 'substance', 'substances'],
    'other': ['other', 'others', 'miscellaneous'],
}
# Reverse lookup: individual type label -> aggregate type.
_lu_ = {_label_: _aggregate_
        for _aggregate_, _labels_ in aggregate_type_to_entity_type.items()
        for _label_ in _labels_}
# One row per extracted (entity, type, file, model); 'type_agg' is the lower-cased type folded
# through the reverse lookup, falling back to the lower-cased type itself when it is not listed.
df_entity_types = pd.DataFrame(entity_to_types_lu)
df_entity_types['type_agg'] = df_entity_types['type'].apply(lambda raw: _lu_.get(raw.lower(), raw.lower()))
# Raw vs. aggregated type distributions, each shown per model and overall.
_w_, _h_ = 256, 256
rt.tile([rt.histogram(df_entity_types, bin_by=_bin_, color_by=_color_, w=_w_, h=_h_)
         for _bin_, _color_ in (('model',    'type'),
                                ('type',     'type'),
                                ('model',    'type_agg'),
                                ('type_agg', 'type_agg'))], spacer=10)

In [5]:
# No variation of this prompt worked with any of the models: they returned plain-language descriptions, and I don't believe any of them were complete.
#_model_    = 'deepseek-r1:14b'
#_prompt_   = f'Create a Python function that coverts all of the following entity types into aggregate types. {df_entity_types["type"].unique().tolist()}'
#_response_ = promptModel(_prompt_, _model_)
#print(_response_)

In [6]:
#import networkx as nx
#relates = [('entity','type')]
#g       = rt.createNetworkXGraph(df_entity_types, relates)
#pos     = nx.spring_layout(g) # about 4 minutes

In [7]:
#pos = rt.layoutSimpleTemplates(g, pos)
#igl     = rt.interactiveGraphPanel(df_entity_types, ln_params={'relationships':relates, 'pos':pos, 'node_size':'small'}, w=2200, h=900)
#igl

In [None]:
# Side-by-side view of which entities each model extracted from a single article.
_file_    = '5.txt'
_sentences_ = df_articles.query('file == @_file_').sort_values('sentence_no')['sentence']
_article_ = ' '.join(_sentences_)
_df_ents_ = df_entity_types.query('file == @_file_').reset_index()

# model -> {entity -> color}; every entity of a model is given that model's color.
my_markup = {}
for _model_, _entity_ in zip(_df_ents_['model'], _df_ents_['entity']):
    my_markup.setdefault(_model_, {})[_entity_] = rt.co_mgr.getColor(_model_)

_txt_blk_ = rt.textBlock(_article_, word_wrap=True, w=384)
_svg_lu_  = _txt_blk_.highlightsComparison(my_markup)
# The '__all__' entry goes first, then the remaining entries in the order the lookup yields them.
_tiles_   = [_svg_lu_['__all__']] + [_svg_lu_[x] for x in _svg_lu_ if x != '__all__']
_tiles_.append(rt.histogram(_df_ents_, bin_by='model', color_by='model', w=160, h=256))
rt.tile(_tiles_, spacer=10)

In [None]:
# Show the rows of _df_ents_ (entities for the article selected in the previous cell) whose model is deepseek-r1:14b.
_df_ents_.query('model == "deepseek-r1:14b"')