## Tutorial for CSVtoRDF

Based on this Stack Overflow question: https://stackoverflow.com/questions/43524943/creating-rdf-file-using-csv-file-as-input

In [1]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace #basic RDF handling
from rdflib.namespace import FOAF, XSD #most common namespaces
import urllib.parse #for parsing strings to URI's
import pandas as pd

In [5]:
# Load the semicolon-delimited tutorial CSV straight from GitHub.
url = (
    'https://raw.githubusercontent.com/'
    'KRontheWeb/csv2rdf-tutorial/master/example.csv'
)
df = pd.read_csv(url, sep=';', quotechar='"')

In [6]:
# Display the loaded frame to check how the columns were parsed.
df

Unnamed: 0,Name,Address,Place,Country,Age,Hobby,Favourite Colour
0,John,Dam 52,Amsterdam,The Netherlands,32,Fishing,Blue
1,Jenny,Leidseplein 2,Amsterdam,The Netherlands,12,Dancing,Mauve
2,Jill,52W Street 5,Amsterdam,United States of America,28,Carpentry,Cyan
3,Jake,12E Street 98,Amsterdam,United States of America,42,Ballet,Purple


In [7]:
# Namespaces used to mint subject and predicate IRIs for the tutorial data.
schema = Namespace('http://schema.org/')
loc = Namespace('http://mylocations.org/addresses/')
ppl = Namespace('http://example.com/people/')

# Empty graph that the triples are added to.
g = Graph()

In [8]:
# Emit one foaf:Person per CSV row, plus a separate node for the row's address.
for _, row in df.iterrows():
    # Percent-encode the name so values containing spaces or non-ASCII
    # characters still form a valid IRI; the address IRI is encoded the same way.
    person = URIRef(ppl + urllib.parse.quote(row['Name']))
    address = URIRef(loc + urllib.parse.quote(row['Address']))

    g.add((person, RDF.type, FOAF.Person))
    g.add((person, schema['name'], Literal(row['Name'], datatype=XSD.string)))
    g.add((person, FOAF.age, Literal(row['Age'], datatype=XSD.integer)))
    g.add((person, schema['address'], Literal(row['Address'], datatype=XSD.string)))
    g.add((address, schema['name'], Literal(row['Address'], datatype=XSD.string)))

In [10]:
# Show the graph as Turtle, then write the same serialization to disk.
turtle_text = g.serialize(format='turtle')
print(turtle_text)
g.serialize(destination='example.ttl', format='turtle')

@prefix ns1: <http://xmlns.com/foaf/0.1/> .
@prefix ns2: <http://schema.org/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://example.com/people/Jake> a ns1:Person ;
    ns2:address "12E Street 98"^^xsd:string ;
    ns2:name "Jake"^^xsd:string ;
    ns1:age 42 .

<http://example.com/people/Jenny> a ns1:Person ;
    ns2:address "Leidseplein 2"^^xsd:string ;
    ns2:name "Jenny"^^xsd:string ;
    ns1:age 12 .

<http://example.com/people/Jill> a ns1:Person ;
    ns2:address "52W Street 5"^^xsd:string ;
    ns2:name "Jill"^^xsd:string ;
    ns1:age 28 .

<http://example.com/people/John> a ns1:Person ;
    ns2:address "Dam 52"^^xsd:string ;
    ns2:name "John"^^xsd:string ;
    ns1:age 32 .

<http://mylocations.org/addresses/12E%20Street%2098> ns2:name "12E Street 98"^^xsd:string .

<http://mylocations.org/addresses/52W%20Street%205> ns2:name "52W Street 5"^^xsd:string .

<http://mylocations.org/addresses/Dam%2052> ns2:name "Dam 52"^^xsd:string .

<http://mylocations.org/addre

<Graph identifier=N42150f30038f4cdaa20c991a4360946a (<class 'rdflib.graph.Graph'>)>

In [136]:
from rdflib import Graph, Literal, RDF, URIRef, Namespace #basic RDF handling
from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
                           VOID, XMLNS, XSD
import urllib.parse #for parsing strings to URI's
import pandas as pd
from pathlib import Path

# Project root is the parent of the current working directory; the CSV/RDF
# files live in its `data` sub-directory.
main_path = Path.cwd().parent
data_path = main_path.joinpath('data')

In [137]:
# Build the full accounting graph from AccountRDF.csv.
# Each row has a `subject`, `predicate` and `object` column. Terms written as
# `prefix:local` are expanded through `namespace_dict`; any other object value
# is stored as an xsd:string literal.
df = pd.read_csv(data_path / 'AccountRDF.csv', encoding='utf-8')
ns_acc = Namespace('http://fsqa.com/acc#')
namespace_dict = {
    'acc': ns_acc, 'time': TIME, 'rdf': RDF, 'rdfs': RDFS, None: ''
}


def _split_prefixed(term):
    """Split ``prefix:local`` on the first colon only.

    Returns ``(prefix, local)`` when ``prefix`` is a key of ``namespace_dict``;
    otherwise returns ``(None, term)`` so the caller can treat the value as a
    plain literal. Splitting once keeps values with several colons (times,
    ratios, URLs) from raising on tuple unpacking.
    """
    prefix, sep, local = term.partition(':')
    if sep and prefix in namespace_dict:
        return prefix, local
    return None, term


def _expand(term, column):
    """Expand a prefixed subject/predicate into a URIRef.

    Raises:
        ValueError: if ``term`` does not start with a known ``prefix:``.
    """
    prefix, local = _split_prefixed(term)
    if prefix is None:
        raise ValueError(f'{column} {term!r} has no known namespace prefix')
    return URIRef(namespace_dict[prefix] + local)


g = Graph()
g.bind('rdf', RDF)
g.bind('time', TIME)
g.bind('acc', ns_acc, override=True)
for _, row in df.iterrows():
    subject = _expand(row['subject'], 'subject')
    predicate = _expand(row['predicate'], 'predicate')

    # Objects are either another resource (known prefix) or a literal value.
    o_ns, o = _split_prefixed(row['object'])
    if o_ns is None:
        obj = Literal(o, datatype=XSD.string)
    else:
        obj = URIRef(namespace_dict[o_ns] + o)

    g.add((subject, predicate, obj))

In [138]:
# Persist the account graph as both Turtle and RDF/XML in the data directory.
turtle_path = data_path / 'AccountRDF.ttl'
xml_path = data_path / 'AccountRDF.xml'
g.serialize(destination=turtle_path, format='turtle')
g.serialize(destination=xml_path, format='xml', encoding='utf-8')

<Graph identifier=Na72b615b0229458f8802af51c7d8ce19 (<class 'rdflib.graph.Graph'>)>

In [139]:
# Start from a fresh graph and read the Turtle file back in.
# `Graph.parse` is used because `Graph.load` is deprecated in rdflib and is
# not available in newer releases.
g = Graph()
g.parse(data_path / 'AccountRDF.ttl', format='turtle')

<Graph identifier=Nbb591ae8e7904fd59bfab8980f811210 (<class 'rdflib.graph.Graph'>)>

In [63]:
# List every predicate/object pair attached to acc:CostOfSales.
# (A dangling `FILTER` with no constraint was removed: it is not valid SPARQL.)
query_string = """
SELECT ?p ?o
WHERE {
    acc:CostOfSales ?p ?o .
}
"""
qres = g.query(query_string)
for x in qres:
    print(f'{x}')

(rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://fsqa.com/acc#Account'))
(rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'), rdflib.term.Literal('매출원가', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://fsqa.com/acc#Account_Belonging'), rdflib.term.URIRef('http://fsqa.com/acc#IncomeStatement'))
(rdflib.term.URIRef('http://fsqa.com/acc#Account_Type'), rdflib.term.Literal('Value', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))
(rdflib.term.URIRef('http://fsqa.com/acc#numerator'), rdflib.term.URIRef('http://fsqa.com/acc#CostOfSalesRatio'))
(rdflib.term.URIRef('http://fsqa.com/acc#partOf'), rdflib.term.URIRef('http://fsqa.com/acc#GrossProfit'))


In [49]:
# Peek at a single distinct literal object stored in the graph.
query_string = """
select distinct ?literal { 
  ?s ?p ?literal 
  filter isLiteral(?literal)
}
"""
qres = g.query(query_string)
first_row = next(iter(qres), None)
if first_row is not None:
    print(f'{first_row}')

(rdflib.term.Literal('재고자산 회전기간', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')),)


In [140]:
# Unique subjects whose object is the income statement or the statement root,
# printed space-separated so they can be pasted into a SPARQL VALUES clause.
is_income_row = df['object'].isin(['acc:IncomeStatement', 'acc:FinancialStatement'])
accs = df.loc[is_income_row, 'subject'].unique().tolist()
print(' '.join(accs))


acc:BalanceSheet acc:Revenue acc:CostOfSales acc:GrossProfit acc:SellingGeneralAdministrativeExpenses acc:OperatingIncome acc:FinanceIncome acc:FinancialExpenses acc:ProfitBeforeTax acc:IncomeTaxExpense acc:Profit acc:IncomeStatement


acc:BalanceSheet acc:Revenue acc:CostOfSales acc:GrossProfit acc:SellingGeneralAdministrativeExpenses acc:OperatingIncome acc:FinanceIncome acc:FinancialExpenses acc:ProfitBeforeTax acc:IncomeTaxExpense acc:Profit acc:IncomeStatement acc:CostOfSalesRatio acc:SellingGeneralAdministrativeRatio acc:SalesAndSellingGeneralAdministrativeRatio acc:IncomeTaxRatio acc:ProfitRatio acc:Ratios

In [141]:
# Unique subjects whose object is the balance sheet or the statement root,
# printed space-separated so they can be pasted into a SPARQL VALUES clause.
is_balance_row = df['object'].isin(['acc:BalanceSheet', 'acc:FinancialStatement'])
accs = df.loc[is_balance_row, 'subject'].unique().tolist()
print(' '.join(accs))


acc:CurrentAssets acc:CashAndCashEquivalents acc:TradeAndOtherCurrentReceivables acc:PrepaidExpenses acc:Inventories acc:NoncurrentAssets acc:PropertyPlantAndEquipment acc:IntangibleAssets acc:AssetsAbstract acc:CurrentLiabilities acc:TradeAndOtherCurrentPayables acc:ShorttermBorrowings acc:AdvancesCustomers acc:NoncurrentLiabilities acc:BondsIssued acc:LongTermBorrowings acc:LiabilitiesAbstract acc:TotalEquity acc:LiabilitiesAndEquities acc:BalanceSheet acc:IncomeStatement


acc:CurrentAssets acc:CashAndCashEquivalents acc:TradeAndOtherCurrentReceivables acc:PrepaidExpenses acc:Inventories acc:NoncurrentAssets acc:PropertyPlantAndEquipment acc:IntangibleAssets acc:AssetsAbstract acc:CurrentLiabilities acc:TradeAndOtherCurrentPayables acc:ShorttermBorrowings acc:AdvancesCustomers acc:NoncurrentLiabilities acc:BondsIssued acc:LongTermBorrowings acc:LiabilitiesAbstract acc:TotalEquity acc:LiabilitiesAndEquities acc:BalanceSheet acc:IncomeStatement acc:TradeReceivableTurnoverPeriod acc:InventoriesTurnoverPeriod acc:TradePayablesTurnoverPeriod acc:AdvancesCustomersTurnoverPeriod acc:Ratios acc:CalendarOneYear

In [145]:
# Income statement: partOf / denominator / numerator links among the
# income-statement accounts and their ratios.
# The same account list constrains both ?s and ?o, so it is written once.
income_statement_terms = (
    'acc:BalanceSheet acc:Revenue acc:CostOfSales acc:GrossProfit '
    'acc:SellingGeneralAdministrativeExpenses acc:OperatingIncome '
    'acc:FinanceIncome acc:FinancialExpenses acc:ProfitBeforeTax '
    'acc:IncomeTaxExpense acc:Profit acc:IncomeStatement '
    'acc:CostOfSalesRatio acc:SellingGeneralAdministrativeRatio '
    'acc:SalesAndSellingGeneralAdministrativeRatio acc:IncomeTaxRatio '
    'acc:ProfitRatio acc:Ratios'
)
query_statement = f"""
SELECT ?s ?p ?o WHERE {{ 
  VALUES ?s {{ {income_statement_terms} }}
  VALUES ?o {{ {income_statement_terms} }}
  VALUES ?p {{ acc:partOf acc:denominator acc:numerator }} 
  ?s ?p ?o .
}}
"""
qres = g.query(query_statement)
list(qres)[:2]

[(rdflib.term.URIRef('http://fsqa.com/acc#Revenue'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#GrossProfit')),
 (rdflib.term.URIRef('http://fsqa.com/acc#Revenue'),
  rdflib.term.URIRef('http://fsqa.com/acc#denominator'),
  rdflib.term.URIRef('http://fsqa.com/acc#CostOfSalesRatio'))]

In [148]:
# Balance sheet: partOf / denominator / numerator links among the
# balance-sheet accounts and their turnover ratios.
# The same account list constrains both ?s and ?o, so it is written once.
balance_sheet_terms = (
    'acc:CurrentAssets acc:CashAndCashEquivalents '
    'acc:TradeAndOtherCurrentReceivables acc:PrepaidExpenses '
    'acc:Inventories acc:NoncurrentAssets acc:PropertyPlantAndEquipment '
    'acc:IntangibleAssets acc:AssetsAbstract acc:CurrentLiabilities '
    'acc:TradeAndOtherCurrentPayables acc:ShorttermBorrowings '
    'acc:AdvancesCustomers acc:NoncurrentLiabilities acc:BondsIssued '
    'acc:LongTermBorrowings acc:LiabilitiesAbstract acc:TotalEquity '
    'acc:LiabilitiesAndEquities acc:BalanceSheet acc:IncomeStatement '
    'acc:TradeReceivableTurnoverPeriod acc:InventoriesTurnoverPeriod '
    'acc:TradePayablesTurnoverPeriod acc:AdvancesCustomersTurnoverPeriod '
    'acc:Ratios acc:CalendarOneYear'
)
query_statement = f"""
SELECT ?s ?p ?o WHERE {{ 
  VALUES ?s {{ {balance_sheet_terms} }}
  VALUES ?o {{ {balance_sheet_terms} }}
  VALUES ?p {{ acc:partOf acc:denominator acc:numerator }} 
  ?s ?p ?o .
}}
"""
qres = g.query(query_statement)
list(qres)[:2]

[(rdflib.term.URIRef('http://fsqa.com/acc#CurrentAssets'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#AssetsAbstract')),
 (rdflib.term.URIRef('http://fsqa.com/acc#CashAndCashEquivalents'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#CurrentAssets'))]

In [20]:
# Relationships between asset accounts and their parent asset groups
# (any predicate).
asset_subjects = (
    'acc:CashAndCashEquivalents acc:TradeAndOtherCurrentReceivables '
    'acc:PrepaidExpenses acc:PropertyPlantAndEquipment '
    'acc:NoncurrentAssets acc:CurrentAssets'
)
asset_objects = 'acc:CurrentAssets acc:NoncurrentAssets acc:AssetsAbstract'

query_statement = f"""
SELECT ?s ?p ?o WHERE {{ 
  VALUES ?s {{ {asset_subjects} }}
  VALUES ?o {{ {asset_objects} }}
  ?s ?p ?o .
}}
"""
qres = g.query(query_statement)
list(qres)[:5]

[(rdflib.term.URIRef('http://fsqa.com/acc#CashAndCashEquivalents'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#CurrentAssets')),
 (rdflib.term.URIRef('http://fsqa.com/acc#TradeAndOtherCurrentReceivables'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#CurrentAssets')),
 (rdflib.term.URIRef('http://fsqa.com/acc#PrepaidExpenses'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#CurrentAssets')),
 (rdflib.term.URIRef('http://fsqa.com/acc#PropertyPlantAndEquipment'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#NoncurrentAssets')),
 (rdflib.term.URIRef('http://fsqa.com/acc#NoncurrentAssets'),
  rdflib.term.URIRef('http://fsqa.com/acc#partOf'),
  rdflib.term.URIRef('http://fsqa.com/acc#AssetsAbstract'))]

In [31]:
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
from collections import defaultdict

In [159]:
# Lookup table keyed by English account name:
#   ACC_DICT[eng] == {'name': <Korean label>, 'group': <group code>}
df_account = pd.read_csv(data_path / 'AccountName.csv', encoding='utf-8')
ACC_DICT = defaultdict(dict)
account_columns = df_account[['acc_name_eng', 'acc_name_kor', 'group']]
for eng, kor, group in account_columns.itertuples(index=False, name=None):
    ACC_DICT[eng].update(name=kor, group=group)

# Hand-written entry for the calendar-year term (set regardless of the CSV).
ACC_DICT['CalendarOneYear'].update(name='365 일', group='TIME-Value-99')

In [160]:
# Korean account labels collected into one list per group code.
df_account.groupby('group')['acc_name_kor'].apply(list).to_dict()

{'BS-Ratio-98': ['매출채권 회전기간',
  '선급비용 회전기간',
  '재고자산 회전기간',
  '매입채무 회전기간',
  '선수금 회전기간'],
 'BS-Value-0': ['재무상태표'],
 'BS-Value-1': ['자산총계', '부채와자본총계'],
 'BS-Value-2': ['부채총계', '자본총계'],
 'BS-Value-3': ['유동자산', '비유동자산', '유동부채', '비유동부채'],
 'BS-Value-4': ['현금및현금성자산',
  '매출채권',
  '선급비용',
  '재고자산',
  '유형자산',
  '무형자산',
  '매입채무 및 기타유동채무',
  '단기차입금',
  '선수금',
  '사채',
  '장기차입금'],
 'IS-Ratio-98': ['매출원가율', '판관비율', '매출 및 판관 비율', '법인세율', '순이익율'],
 'IS-Value-0': ['손익계산서'],
 'IS-Value-1': ['당기순이익(손실)'],
 'IS-Value-2': ['법인세비용차감전순이익(손실)', '법인세비용'],
 'IS-Value-3': ['영업이익', '금융수익', '금융비용'],
 'IS-Value-4': ['매출총이익', '판매비와관리비'],
 'IS-Value-5': ['수익(매출액)', '매출원가']}

In [161]:
# Directed pyvis canvas that the account hierarchy is drawn on.
net = Network(
    height="800px",
    width="100%",
    directed=True,
    font_color="black",
    heading='FSQA',
    notebook=False,
)

# Node size keyed by the trailing integer of an account's group code.
size_dict = {
    0: 20,
    1: 18,
    2: 16,
    3: 14,
    4: 12,
    5: 10,
    98: 12,
    99: 10,
}
def convert_to_string(x):
    """Return a plain-string form of an RDF term.

    Args:
        x: a ``URIRef`` or ``Literal``.

    Returns:
        For a ``URIRef``, the fragment after its ``#``; for a ``Literal``,
        ``str(x)``.

    Raises:
        ValueError: if a ``URIRef`` does not contain exactly one ``#``, or if
            ``x`` is neither a ``URIRef`` nor a ``Literal``.
    """
    if isinstance(x, URIRef):
        parts = x.split('#')
        if len(parts) != 2:
            raise ValueError(f'Split error {x}')
        return parts[1]
    if isinstance(x, Literal):
        return str(x)
    # Name the offending type instead of the former uninformative message.
    raise ValueError(f'Unsupported term type {type(x).__name__}: {x!r}')

def _add_account_node(account):
    """Add the pyvis node for an ACC_DICT key and return the node's label.

    The group code is split on '-' into statement, type and a trailing
    integer; that integer selects both the node group and its size.
    """
    info = ACC_DICT[account]
    label = info['name']
    statement, kind, level = info['group'].split('-')
    level = int(level)
    net.add_node(
        label,
        label,
        group=level,
        size=size_dict[level],
        title=f'Statement: {statement} Type: {kind}',
    )
    return label


# Draw every (subject, predicate, object) row currently held in `qres`:
# accounts become nodes and the predicate becomes the edge's hover text.
for src, link, trg in qres:
    src = convert_to_string(src)
    link = convert_to_string(link)
    trg = convert_to_string(trg)

    src_label = _add_account_node(src)
    trg_label = _add_account_node(trg)
    # The keyword was misspelled `titel`, so the predicate never reached the
    # edge tooltip; `title` is the option pyvis/vis.js reads.
    net.add_edge(src_label, trg_label, weight=2, title=link)

In [162]:
# Hand the assembled network to pyvis for rendering under the name 'bs.html'.
net.show('bs.html')

In [1]:
from datetime import datetime as dt

In [3]:
# Today's local date as a YYYY-MM-DD string.
dt.now().date().isoformat()

'2021-12-13'

In [3]:
import spacy
from spacy import displacy
# Load two English spaCy pipelines by package name.
sp = spacy.load('en_core_web_sm')
# The `_trf` pipeline is the one applied to the question in the cells below.
sp_trf = spacy.load('en_core_web_trf')

In [151]:
# Sample what-if question used to inspect the pipeline's entity output.
q = 'What happens to operating income when cost of revenue increases by 10 percent this year?'

# Run the question through the `sp_trf` pipeline; `doc` is read by later cells.
doc = sp_trf(q)



In [146]:
# Visualise the entities spaCy found in the parsed question.
displacy.render(doc, style='ent')

In [168]:
# Trace each token's lemma together with its IOB flag and entity type.
for x in doc:
    print(f'-- {x.lemma_, x.ent_iob_, x.ent_type_} --')

# Collect (entity label, space-joined token lemmas) pairs.
# Entities are read from `doc.ents` instead of re-assembling B/I/O runs by
# hand: the manual loop only stored an entity once a later token followed it,
# so an entity that ended the document was silently dropped.
tags = {
    (ent.label_, ' '.join(token.lemma_ for token in ent))
    for ent in doc.ents
}


-- ('what', 'O', '') --
-- ('happen', 'O', '') --
-- ('to', 'O', '') --
-- ('operating', 'O', '') --
-- ('income', 'O', '') --
-- ('when', 'O', '') --
-- ('cost', 'O', '') --
-- ('of', 'O', '') --
-- ('revenue', 'O', '') --
-- ('increase', 'O', '') --
-- ('by', 'O', '') --
-- ('10', 'B', 'PERCENT') --
-- ('percent', 'I', 'PERCENT') --
-- ('this', 'B', 'DATE') --
PERCENT
-- ('year', 'I', 'DATE') --
-- ('?', 'O', '') --
DATE


In [169]:
# Display the collected (label, text) entity pairs.
tags

{('DATE', 'this year'), ('PERCENT', '10 percent')}

In [140]:
# One line per token: lemma, IOB flag, entity type.
for token in doc:
    print(token.lemma_, token.ent_iob_, token.ent_type_)

what O 
happen O 
to O 
operating O 
income O 
when O 
cost O 
of O 
revenue O 
increase O 
by O 
10 B PERCENT
percent I PERCENT
this B DATE
year I DATE
? O 


In [137]:
# One line per collected entity: label followed by its text.
for entity_label, entity_text in tags:
    print(entity_label, entity_text)

DATE this year


In [62]:
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

In [57]:
# Current local date and time (naive, no tzinfo).
today = dt.today()

In [66]:
# Display the current value of `today`.
today

datetime.datetime(2021, 12, 13, 17, 19, 25, 835071)

In [68]:
# The same moment one calendar year earlier.
today - relativedelta(years=1)

datetime.datetime(2020, 12, 13, 17, 19, 25, 835071)

In [61]:
from dateutil.relativedelta import relativedelta

In [107]:
# Pin `today` to a fixed date (midnight, 23 January 2021) for the checks below.
today = dt(2021, 1, 23)

In [108]:
# Is `today` on or after 1 April of its own year?
#   e.g. 2021-12-08 >= 2021-04-01 -> True  -> year = 2020
#        2021-01-08 >= 2021-04-01 -> False -> year = 2019
# NOTE(review): presumably the year of the latest available annual report — confirm.
today >= dt(today.year, 4, 1)

False

In [109]:
# Demonstration: a relative-date phrase cannot be converted to an integer year.
# The error is caught and shown so the notebook still runs top to bottom.
try:
    int('last year')
except ValueError as exc:
    parse_error = f'ValueError: {exc}'
    print(parse_error)

ValueError: invalid literal for int() with base 10: 'last year'

In [None]:
# NOTE(review): the operator between the two operands was missing, which made
# this cell a SyntaxError. `<=` is assumed here (the complement of the
# `>= 04-01` check) — confirm the intended comparison.
today <= dt.strptime(f'{today.year}-03-31', '%Y-%m-%d')

In [101]:
# Midnight on 31 March of the year after `today`.
dt(today.year + 1, 3, 31)

datetime.datetime(2022, 3, 31, 0, 0)