In [None]:
import transformers
import torch

# Hugging Face model identifier handed to the pipeline constructor below.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Build a text-generation pipeline for the model.  The weights are requested in
# bfloat16 and device placement is delegated to the library via device_map="auto".
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# Smoke test: one system message and one user message in role/content form.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Generate at most 256 new tokens for the chat above.
outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# Print the last element of the first result's "generated_text"
# (presumably the assistant's reply message -- confirm against the pipeline docs).
print(outputs[0]["generated_text"][-1])

def promptModel(_user_, _system_='You are a helpful digital assistant.', max_tokens=256):
    """Send one system/user exchange to the module-level ``pipeline`` and return the reply text.

    Parameters:
        _user_: content of the user message.
        _system_: content of the system message.
        max_tokens: forwarded to the pipeline as ``max_new_tokens``.

    Returns:
        The first result's generated text with the leading prompt characters sliced off.
    """
    tokenizer = pipeline.tokenizer
    chat = [
        {"role": "system", "content": _system_},
        {"role": "user", "content": _user_},
    ]
    # Render the chat to a plain string that ends with the generation prompt.
    rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # Generation stops at either the tokenizer's EOS id or the "<|eot_id|>" token id.
    stop_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
    generation_kwargs = dict(
        max_new_tokens=max_tokens,
        eos_token_id=stop_ids,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    generated = pipeline(rendered, **generation_kwargs)
    return generated[0]["generated_text"][len(rendered):]

In [None]:
#
# Prepare an article for processing by loading it, separating it into sentences, and doing basic cleaning
#
import os
_base_dir_ = '../../../data/2014_vast/MC1/News Articles'
os.listdir(_base_dir_ + '/Everyday News')  # return value is not used
# Read the raw bytes inside a context manager so the file handle is closed
# as soon as the read finishes instead of being left open.
with open(_base_dir_ + '/Everyday News/343.txt', 'rb') as _article_file_:
    _txt_raw_ = _article_file_.read()
# Decode as UTF-8, drop carriage returns, and break into lines.
_txt_better_  = str(_txt_raw_, encoding='utf-8').replace('\r', '').split('\n')
_txt_         = []
for _line_ in _txt_better_:
    _line_ = _line_.strip()
    # Skip blank and whitespace-only lines so no empty entries reach _txt_.
    if not _line_:
        continue
    if '.' in _line_:
        # Naive sentence split on '.'; every non-empty fragment gets its period restored.
        for _sent_ in _line_.split('.'):
            _sent_ = _sent_.strip()
            if _sent_:
                _txt_.append(_sent_ + '.')
    else:
        # Lines without a period are kept whole.
        _txt_.append(_line_)
print('\n'.join(_txt_))

In [None]:
#
# The article text has grammar problems, so each sentence is first passed through the model for correction
#
_grammar_ = []
for _sent_ in _txt_:
    # Metadata lines (source / title / published) are not sent to the model.
    if _sent_.lower().startswith(('source:', 'title:', 'published:')):
        continue
    _fixed_up_ = promptModel(
        _sent_,
        'Make the following sentence grammatically correct.  Just return the corrected sentence with no explanation.',
    )
    _grammar_.append(_fixed_up_)
print('\n'.join(_grammar_))

In [None]:
#
# Three-step process: extract the entities, then for each entity ask for its context and for its events
#
_entity_prompt_ = (
    'Extract the people, groups, organizations, countries, cities, and locations from the following sentence.\n'
    'Return as CSV with a header of "entity, type".  Just return the CSV.  Do not include an explanation or any caveats.'
)
i = 7
_example_ = promptModel(_grammar_[i], _entity_prompt_)
# Show the input sentence followed by the model's CSV reply.
print(_grammar_[i], _example_, sep='\n')

In [None]:
# Context extraction for one hard-coded entity, using the sentence selected by `i`.
_attribute_prompt_ = (
    'Extract context about Julian Vann from the following sentence.  '
    'Return as CSV with a header of "entity, context, value".  '
    'Just return the CSV.  '
    'Do not include an explanation or any caveats.'
)
_attributes_ = promptModel(_user_=_grammar_[i], _system_=_attribute_prompt_)
print(_attributes_)

In [None]:
# Event extraction for one hard-coded entity, using the sentence selected by `i`.
_event_prompt_ = (
    'Extract events about Julian Vann from the following sentence.  '
    'Return as CSV with a header of "entity, event, value".  '
    'Just return the CSV.  '
    'Do not include an explanation or any caveats.'
)
_events_ = promptModel(_user_=_grammar_[i], _system_=_event_prompt_)
print(_events_)

In [None]:
import pandas as pd

def appendRows(_lu_, _csv_):
    """Append the data rows of a three-column CSV string to the triple lists in ``_lu_``.

    Parameters:
        _lu_: dict holding lists under the keys 'subj', 'verb' and 'obj'; mutated in place.
        _csv_: CSV text whose first line is treated as a header and skipped.

    Each remaining row is split on its first two commas: the first field goes
    to 'subj', the second to 'verb', and everything after the second comma
    (further commas included) to 'obj'.  One leading comma is dropped from a
    row before splitting.  A row with only two fields gets an empty 'obj'.

    Blank rows and rows without any comma are skipped.  Indexing them would
    raise IndexError after 'subj' had already been appended, leaving the three
    lists with different lengths.
    """
    for _row_ in _csv_.split('\n')[1:]:  # first row should be a header
        if _row_.startswith(','):
            _row_ = _row_[1:]
        if not _row_.strip():
            continue
        _fields_ = _row_.split(',', 2)
        if len(_fields_) < 2:
            continue
        _lu_['subj'].append(_fields_[0])
        _lu_['verb'].append(_fields_[1])
        _lu_['obj'].append(_fields_[2] if len(_fields_) > 2 else '')

def iterateOverSentence(_sentence_):
    """Build a subject/verb/object table for one sentence from three rounds of model prompts.

    The model is first asked for the entities in ``_sentence_``; every entity
    row of that reply yields an ``isInstanceOf`` triple.  For each entity the
    model is then asked for context and for events in the same sentence, and
    those CSV replies are added through ``appendRows``.

    Parameters:
        _sentence_: the sentence to analyse.

    Returns:
        pandas.DataFrame with the columns 'subj', 'verb' and 'obj'.
    """
    _lu_ = {'subj':[], 'verb':[], 'obj':[]}
    _entity_prompt_ = '''Extract the people, groups, organizations, countries, cities, and locations from the following sentence.  Return as CSV with a header of "entity, type".  Just return the CSV.  Do not include an explanation or any caveats.'''
    # Use the function argument rather than a notebook global, so the result
    # depends only on the sentence that was passed in.
    _entities_ = promptModel(_sentence_, _entity_prompt_)
    for _row_ in _entities_.split('\n'):
        if not _row_.strip():
            continue
        # The prompt requests the header "entity, type".  Compare with quotes,
        # whitespace and case removed so the header row is recognised however
        # the model formats it, instead of being processed as an entity.
        if ''.join(_row_.replace('"', '').split()).lower() == 'entity,type':
            continue
        # First field is the entity; everything after the first comma is its type.
        _fields_ = _row_.split(',', 1)
        _entity_ = _fields_[0]
        _lu_['subj'].append(_entity_)
        _lu_['verb'].append('isInstanceOf')
        _lu_['obj'].append(_fields_[1] if len(_fields_) > 1 else '')
        _attribute_prompt_ = f'''Extract context about {_entity_} from the following sentence.  Return as CSV with a header of "entity, context, value".  Just return the CSV.  Do not include an explanation or any caveats.'''
        _attributes_ = promptModel(_sentence_, _attribute_prompt_)
        appendRows(_lu_, _attributes_)
        _event_prompt_ = f'''Extract events about {_entity_} from the following sentence.  Return as CSV with a header of "entity, event, value".  Just return the CSV.  Do not include an explanation or any caveats.'''
        _events_ = promptModel(_sentence_, _event_prompt_)
        appendRows(_lu_, _events_)
    return pd.DataFrame(_lu_)

# Run the full extraction on one example sentence.  The DataFrame returned by
# the final expression is the value the notebook cell displays.
i = 10
print(_grammar_[i])
iterateOverSentence(_grammar_[i])