In [1]:
import csv
import xml
import xml.etree.ElementTree as ET
from string import ascii_lowercase as alph


In [2]:
# Parse the Crossref Open Funder Registry RDF file and keep a handle on its root element.
# ElementTree usage tips: https://docs.python.org/3/library/xml.etree.elementtree.html
registry_path = "./open_funder_registry-master/registry.rdf"
tree = ET.parse(registry_path)
root = tree.getroot()


In [3]:
# Show the first four children of the root.
# Finding: the first two are header elements; funder concepts start at index 2.
for element in root[:4]:
    print(element)


<Element '{http://data.crossref.org/fundingdata/terms}registry' at 0x00000209097E01D0>
<Element '{http://www.w3.org/2004/02/skos/core#}ConceptScheme' at 0x00000209097E03B0>
<Element '{http://www.w3.org/2004/02/skos/core#}Concept' at 0x000002090AAE6450>
<Element '{http://www.w3.org/2004/02/skos/core#}Concept' at 0x000002090AAEC720>


In [4]:
# Number of direct children of the root (header elements plus funder concepts).
n_children = len(root)
print(n_children)

31553


In [5]:
# Peek at the final child of the root, i.e. the last organization in the registry.
# Negative indexing avoids hard-coding the element count, so the cell keeps
# working if the registry file holds a different number of entries.
root[-1]

<Element '{http://www.w3.org/2004/02/skos/core#}Concept' at 0x0000020920CDDE00>

In [41]:
# Export one CSV row per funder: DOI, four classification columns, then every
# known name (preferred label first, alternative labels after it).

# Fully-qualified tags (namespace + local name) of the elements we read.
concept_tag = '{http://www.w3.org/2004/02/skos/core#}Concept'
pref_label_tag = '{http://www.w3.org/2008/05/skos-xl#}prefLabel'
alt_label_tag = '{http://www.w3.org/2008/05/skos-xl#}altLabel'
grant_ns = '{http://data.crossref.org/fundingdata/xml/schema/grant/grant-1.2/}'
address_tag = '{https://none.schema.org/}address'

# Fixed column index for each metadata tag (column 0 holds the DOI).
# Writing by index keeps values under the right header even when a funder
# lacks one of the fields or the XML lists them in a different order.
field_columns = {
    grant_ns + 'fundingBodyType': 1,
    grant_ns + 'region': 2,
    address_tag: 3,
    grant_ns + 'fundingBodySubType': 4,
}

with open('funders_extracted_crossref.csv', 'w', encoding='utf-8', newline='') as file:

    writer = csv.writer(file)
    header_row = ['DOI', 'Funding body type', 'Region', 'Country', 'Funding Body Sub Type',
                  'Names (first one pref., others alternative)']

    writer.writerow(header_row)

    # Iterate over every Concept child of the root instead of a hard-coded
    # index range: the old range(2, 31552) silently dropped the last funder
    # and would break for a registry file of a different size.
    for concept in root.findall(concept_tag):

        # One slot per fixed column; missing fields stay as empty strings.
        fields = [''] * (len(field_columns) + 1)

        # DOI: first value among the concept element's own attributes.
        fields[0] = next(iter(concept.attrib.values()), '')

        pref_names = []
        alt_names = []

        for entry in concept:
            if entry.tag == pref_label_tag:
                # The label text sits two levels below the label element.
                pref_names.append(entry[0][0].text)
            elif entry.tag == alt_label_tag:
                alt_names.append(entry[0][0].text)
            elif entry.tag in field_columns:
                # The country is nested two levels inside the address element;
                # the other fields carry their value directly as text.
                value = entry[0][0].text if entry.tag == address_tag else entry.text
                column = field_columns[entry.tag]
                # Keep the first occurrence if a tag is repeated.
                if not fields[column]:
                    fields[column] = value or ''

        # Names go in the trailing columns because their number is open-ended.
        writer.writerow(fields + pref_names + alt_names)

# Note: when opening the .csv in Excel, use "text to columns" with comma "," as separator.
# Excel may display non-Latin characters (e.g. Chinese) as gibberish; the file itself
# is fine and reads correctly in Python as long as it is opened with utf-8 encoding.

In [7]:
## Build a first-letter index of all funder names (preferred and alternative).

# Fully-qualified tags of the concept elements and of their label children.
concept_tag = '{http://www.w3.org/2004/02/skos/core#}Concept'
label_tags = (
    '{http://www.w3.org/2008/05/skos-xl#}prefLabel',
    '{http://www.w3.org/2008/05/skos-xl#}altLabel',
)

# One bucket per lowercase ASCII letter, plus 'other' for names whose first
# character is not a-z (e.g. Chinese names, digits, punctuation).
orga_dict = {c: [] for c in alph}
orga_dict['other'] = []

# Iterate over every Concept child of the root instead of a hard-coded index
# range: the old range(2, 31552) silently dropped the last funder.
for concept in root.findall(concept_tag):
    for entry in concept:
        if entry.tag not in label_tags:
            continue

        # The label text sits two levels below the label element; treat a
        # missing text node as empty so .lower() and [0] cannot fail.
        orga_name = (entry[0][0].text or '').lower()
        if not orga_name:
            continue

        first_char = orga_name[0]
        bucket = first_char if first_char in alph else 'other'
        orga_dict[bucket].append(orga_name)

# Write one '|'-delimited row per bucket: the key and the list of names.
# The with-block guarantees the file is flushed and closed.
with open("crossref_organizations_sorted_dict.csv", "w", encoding='utf-8', newline='') as out_file:
    w = csv.writer(out_file, delimiter='|')
    for letter, orgs in orga_dict.items():
        w.writerow([letter, list(orgs)])

orga_dict

{'a': ['administración de alimentos y medicamentos de los estados unidos',
  "american parkinson's disease foundation",
  'apda',
  'american diabetes association',
  'asociación americana de la diabetes',
  'ada',
  'amgen foundation',
  'amgen foundation, inc.',
  'amgen foundation inc',
  'american association for cancer research',
  'american association for cancer research, inc.',
  'american association for cancer research inc',
  'aacr',
  'aetna foundation',
  'aetna foundation, inc.',
  'aetna foundation inc',
  'abbott fund',
  'abbott',
  'annenberg foundation',
  'american cancer society',
  'american cancer society, inc.',
  'acs, inc.',
  'american cancer society inc.',
  'acs',
  'autism speaks',
  'autism speaks inc.',
  'autism speaks inc',
  'autism speaks, inc.',
  'as',
  'anesthesia patient safety foundation',
  'apsf',
  'administración de recursos y servicios de salud',
  'administration des ressources et des services de santé',
  'administração de recursos e ser