In [12]:
import spacy
import pandas as pd
import transformers
from pathlib import Path
from collections import defaultdict
from rdflib import Graph, Literal, URIRef

In [10]:
src_path = Path('.').absolute().parent
data_path = src_path / 'data'

In [13]:
def convert_to_string(x):
    if isinstance(x, URIRef):
        if len(x.split('#')) == 2:
            return x.split('#')[1]
        else:
            raise ValueError(f'Split error {x}')
    elif isinstance(x, Literal):
        return str(x)
    else:
        raise ValueError(f'Returned None')

In [20]:
df_account = pd.read_csv(data_path / 'AccountName.csv', encoding='utf-8')

g = Graph()
g.load(data_path / 'AccountRDF.ttl', format='ttl')

ACC_DICT = defaultdict(dict)
for index, row in df_account.iterrows():
    acc = row['acc']
    eng = row['acc_name_eng']
    kor = row['acc_name_kor']
    group = row['group']
    ACC_DICT[acc]['kor_name'] = kor
    ACC_DICT[acc]['eng_name'] = eng
    ACC_DICT[acc]['group'] = group
ACC_DICT['CalendarOneYear']['name'] = '365 일'
ACC_DICT['CalendarOneYear']['group'] = 'TIME-Value-99'

query_statement = """
SELECT ?s ?p ?literal WHERE { 
?s a acc:Account . 
VALUES ?p { acc:Account_Property acc:Account_Level } 
?s ?p ?literal .
}
"""
qres2 = g.query(query_statement)
for src, link, trg in qres2:
    src = convert_to_string(src)
    link = convert_to_string(link)
    trg = convert_to_string(trg)
    ACC_DICT[src][link] = trg

In [None]:

'What is the Cost of Sales Ratio in last year?',
'What happens to the Operating Income when the Cost of Sales increases by 10% this year?',
'What will be our revenue in the 4th quarter?'

In [37]:
from transformers import BertForMaskedLM, BertTokenizerFast, BertConfig

model_path = 'bert-large-cased'
config = BertConfig()
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForMaskedLM.from_pretrained(model_path)

Downloading: 100%|██████████| 208k/208k [00:00<00:00, 341kB/s]  
Downloading: 100%|██████████| 426k/426k [00:01<00:00, 276kB/s]  
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 29.9kB/s]
Downloading: 100%|██████████| 762/762 [00:00<00:00, 744kB/s]
Downloading: 100%|██████████| 1.25G/1.25G [02:11<00:00, 10.2MB/s]
Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
sentence = '[MASK] is the Cost of Sales Ratio in the last year?'
sentence = 'What is the Cost of Sales Ratio in the [MASK] year?'
inputs = tokenizer(sentence, padding=True, truncation=True, return_token_type_ids=True, return_tensors='pt')
masked = inputs['input_ids'].eq(tokenizer.mask_token_id)

In [113]:
outputs = model(**inputs).logits[masked]

In [114]:
outputs

tensor([[-5.9819, -5.8912, -6.0374,  ..., -5.5938, -4.7100, -5.4477]],
       grad_fn=<IndexBackward0>)

In [119]:
tokenizer.decode(
    outputs[0].argsort(descending=True)[:10]
)

'current same present previous last first fiscal following coming past'

---

In [2]:
# Load a spacy model and chekc if it has ner

nlp = spacy.load('en_core_web_trf')  # en_core_web_trf / en_core_web_sm

nlp.pipe_names

['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [27]:
doc = nlp.tokenizer("abc")

In [30]:
for x in doc:
    print(x)

abc


In [31]:
x.shape_

'xxx'

In [23]:
tf_model = nlp.get_pipe('transformer')

In [25]:
tf_model.predict("abc")

AttributeError: 'str' object has no attribute 'text'