In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Seed torch's RNG so any sampling-based generation is repeatable.
torch.random.manual_seed(0)

# Fail fast: verify a CUDA device exists *before* loading the checkpoint,
# so a CPU-only machine does not spend time loading weights only to abort.
assert torch.cuda.is_available(), "This model needs a GPU to run ..."
device = torch.cuda.current_device()

model_id = "microsoft/Phi-3-small-8k-instruct"

# Load the causal LM and move it onto the current CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", trust_remote_code=True)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Text-generation pipeline shared by every prompt issued in this notebook.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
def promptModel(prompt, max_tokens=500, temperature=0.0, do_sample=False):
    """Send a single user message to the text-generation pipeline and return the reply.

    Parameters
    ----------
    prompt : str
        Text placed in the ``content`` field of a single ``user`` chat message.
    max_tokens : int
        Forwarded to the pipeline as ``max_new_tokens``.
    temperature : float
        Sampling temperature. Only forwarded when ``do_sample`` is true; it has
        no effect on greedy decoding, and passing it anyway makes transformers
        warn about an unused generation flag on every call.
    do_sample : bool
        Forwarded to the pipeline as ``do_sample``.

    Returns
    -------
    The ``generated_text`` field of the first pipeline result. Because
    ``return_full_text`` is False, the prompt is not echoed back.
    """
    messages = [{"role": "user", "content": prompt}]
    generation_args = {
        "max_new_tokens": max_tokens,
        "return_full_text": False,
        "do_sample": do_sample,
        # Use the EOS token id for padding.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Temperature is a sampling-only knob; omit it for greedy decoding.
        generation_args["temperature"] = temperature
    output = pipe(messages, **generation_args)
    return output[0]['generated_text']

In [None]:
#
# Prepare an article for processing by loading it, separating it into sentences, and doing basic cleaning
#
import os
_base_dir_ = '../../../data/2014_vast/MC1/News Articles'
_article_path_ = os.path.join(_base_dir_, 'Everyday News', '343.txt')

# Read the raw bytes inside a context manager so the file handle is always closed.
with open(_article_path_, 'rb') as _article_file_:
    _txt_raw_ = _article_file_.read()

# Decode as UTF-8, drop carriage returns, and keep one entry per physical line.
_txt_better_ = str(_txt_raw_, encoding='utf-8').replace('\r', '').split('\n')

_txt_ = []
for _line_ in _txt_better_:
    # Strip first so whitespace-only lines are skipped instead of becoming empty sentences.
    _line_ = _line_.strip()
    if _line_ == '': continue
    if '.' in _line_:
        # Naive sentence split on '.'; each non-empty piece gets its period restored.
        for _sent_ in _line_.split('.'):
            _sent_ = _sent_.strip()
            if _sent_ == '': continue
            _txt_.append(_sent_ + '.')
    else:
        # Lines without a period are kept whole.
        _txt_.append(_line_)
print('\n'.join(_txt_))

In [None]:
#
# The articles have grammar problems, so ask the model to clean up each sentence first.
# Metadata lines (source / title / published) are skipped rather than rewritten.
#
_grammar_ = []
for _sent_ in _txt_:
    # Case-insensitive match against the metadata prefixes.
    if _sent_.lower().startswith(('source:', 'title:', 'published:')):
        continue
    _fixed_up_ = promptModel(
        'Make the following sentence grammatically correct.  Just return the corrected sentence with no explanation: "'
        + _sent_
        + '"'
    )
    _grammar_.append(_fixed_up_)
print('\n'.join(_grammar_))

In [None]:
#
# Single-sentence experiment with entity and relationship extraction.
# Original observation: unreliable with phi-3-small-128k -- entity extraction seems
# reasonable, but relationships are not.
# NOTE(review): the model_id set in this notebook is the 8k-instruct checkpoint, not 128k -- confirm which model the note refers to.
#
i = 3

_entity_prompt_ = (
    'Extract just the people, groups, organizations, countries, cities, places, and locations from the following sentence.\n'
    'Return as CSV with a header of "entity, type".  Just return the CSV.  Do not include an explanation or any caveats.'
)

# The two entities below were taken by hand from the previous extraction. The model
# does not restrict its answer to relationships between just these two entities.
_relationship_prompt_ = (
    'What is the relationship between "government of Kronos" and the "protestors" in the following sentence?\n'
    'Return as CSV with a header of "subject, relationship, object".  Just return the CSV.  Do not include an explanation or any caveats.'
)

# Entity extraction for the chosen sentence.
_example_ = promptModel('\n\n'.join((_entity_prompt_, _grammar_[i])))
print(_grammar_[i])
print(_example_)

# Relationship extraction for the same sentence.
_example_ = promptModel('\n\n'.join((_relationship_prompt_, _grammar_[i])))
print(_example_)

In [5]:
# Entity extraction over every grammar-corrected sentence, keyed by sentence index.
# Timing note from the original run: 12.6s for one article...
_entity_prompt_ = (
    'Extract just the people, groups, organizations, countries, cities, places, and locations from the following sentence.\n'
    'Return as CSV with a header of "entity, type".  Just return the CSV.  Do not include an explanation or any caveats.'
)
_entity_extracts_ = {}
for _sent_i_, _sentence_text_ in enumerate(_grammar_):
    _entity_extracts_[_sent_i_] = promptModel('\n\n'.join((_entity_prompt_, _sentence_text_)))