In [8]:
data = {
        'Record ID': [],
        'Date of Publication': [],
        'Publication Status': [],
        'Country of Publication': [],
        'Language': [],
        'Continued by': [],
        'Continued from': [],
        'ISSN': [],
        'ISSNL': [],
        'ddc_subject_classification': [],
        'Publisher':[],
        'Title': [],
        'Access URL': [],
        'Access Condition': [],
        'Copyright': [],
        'subject': [],
        'subject_level1': [],
        'subject_level2': [],
        'subject_level3': []
    }

In [9]:
publication_status = {
    'd': 'Closed',
    'c': 'Ongoing',
    'e': 'Ceased',
    'u': 'Unknown',
    '|': 'Unknown'
}

In [10]:
def parseRecord(record):
    parsed_record={}
    subjects = set()
    subjects_l1=set()
    subjects_l2=set()
    subjects_l3=set()
    access_url = set()
    # Fields
    for field in record.get('fields', []):
        for tag, value in field.items():
            # Gestisci campi controllati
            if tag == '001':
                parsed_record['Record ID'] = value
            elif tag == '008':
                general_info = value
                parsed_record['Publication Status']= publication_status[general_info[6].strip()]
                parsed_record['Date of Publication']= general_info[7:11].strip() + " - " + general_info[11:15].strip()
                parsed_record['Country of Publication']= general_info[15:18].strip()
                parsed_record['Language']= general_info[35:38].strip()
            elif tag == '022':
                for subfield in value.get('subfields', []):
                    if 'a' in subfield:
                        parsed_record['ISSN'] = subfield['a']
                    if 'l' in subfield:
                        parsed_record['ISSNL'] = subfield['l']
            elif tag == '044':
                country_code = value.get('subfields', [{}])[0].get('c', 'Unknown')
                parsed_record['Country of Publication'] = country_code
            elif tag == '082':
                parsed_record['ddc_subject_classification']= value.get('subfields', [{}])[0].get('a', 'Unknown')
            elif tag == '245':
                title = ' '.join([sub.get('a', '') for sub in value.get('subfields', [])])
                parsed_record['Title'] = title.strip()
            elif tag == '260':
                parsed_record['Publisher'] = value.get('subfields', [{}])[0].get('a', ' ') + " " + value.get('subfields', [{}])[1].get('b', ' ')
            elif tag == '506':
                #Open Access Conditions
                f = next((sub.get('u') for sub in value.get('subfields', []) if 'f' in sub), None)
                if f: parsed_record['Access Condition'] = f
            elif tag == '540':
                #Informazioni sull’uso e la riproduzione
                #TO CHECK
                copyright = next((sub.get('f') for sub in value.get('subfields', []) if 'f' in sub), None)
                if copyright:
                    parsed_record['copyright']= copyright
                print("COPYRIGHT:", copyright)
                pass
            elif tag == '780':
                #continue from.
                #print("CONTINUED FROM: ", value.get('subfields', [{}]))
                parsed_record['Continued from'] = value.get('subfields', [{}])[1].get('x', 'Unknown') if len(value.get('subfields', [{}])) > 1 else ''
                #subfield 'x' -> issn della risorsa precedente
                pass
            elif tag == '785':
                #continued by
                parsed_record['Continued by']= value.get('subfields', [{}])[1].get('x', 'Unknown') if len(value.get('subfields', [{}])) > 1 else ''
                #subfield 'x' -> issn della risorsa successiva
                pass
            elif tag == '856':
                url = next((sub.get('u') for sub in value.get('subfields', []) if 'u' in sub), None)
                if url:
                    access_url.add(url)
            elif tag == '981': 
                subjects.add( value.get('subfields', [{}])[0].get('a', 'Unknown'))
            elif tag == '982':
                subjects_l1.add(value.get('subfields', [{}])[0].get('a', 'Unknown'))
            elif tag == '983':
                subjects_l2.add(value.get('subfields', [{}])[0].get('a', 'Unknown'))
            elif tag == '984':
                subjects_l3.add(value.get('subfields', [{}])[0].get('a', 'Unknown'))
    parsed_record['subject'] = ", ".join(list(subjects))
    parsed_record["subject_level1"]= ", ".join(list(subjects_l1))
    parsed_record["subject_level2"]= ", ".join(list(subjects_l2))
    parsed_record["subject_level3"]= ", ".join(list(subjects_l3))
    parsed_record['Access URL'] = ", ".join(list(access_url))

    data['Record ID'].append(parsed_record.get('Record ID', ''))
    data['Publication Status'].append(parsed_record.get('Publication Status', ''))
    data['Date of Publication'].append(parsed_record.get('Date of Publication', ''))
    data['Country of Publication'].append(parsed_record.get('Country of Publication', ''))
    data['Language'].append(parsed_record.get('Language', ''))
    data['Continued from'].append(parsed_record.get('Continued from', ''))
    data['Continued by'].append(parsed_record.get('Continued by', ''))
    data['ISSN'].append(parsed_record.get('ISSN', ''))
    data['ISSNL'].append(parsed_record.get('ISSNL', ''))
    data['ddc_subject_classification'].append(parsed_record.get('ddc_subject_classification', ''))

    data['Publisher'].append(parsed_record.get('Publisher',''))
    data['Access Condition'].append(parsed_record.get('Access Condition', ''))
    data['Title'].append(parsed_record.get('Title', ''))
    data['Access URL'].append(parsed_record.get('Access URL', ''))
    data['Copyright'].append(parsed_record.get('copyright', ''))
    data['subject'].append(parsed_record.get('subject',' '))
    data['subject_level1'].append(parsed_record.get('subject_level1',' '))
    data['subject_level2'].append(parsed_record.get('subject_level2',' '))
    data['subject_level3'].append(parsed_record.get('subject_level3',' '))
                

In [1]:
d = {
				"856": {
					"subfields": [
						{
							"u": "http://www.tandfonline.com/loi/tizo20#.VDfU900cRok"
						},
						{
							"x": "oa-j"
						}
					],
					"ind1": "4",
					"ind2": "0"
				}
			}

In [7]:
for tag, value in d.items():
    print(tag, value)
    url = next((sub.get('u') for sub in value.get('subfields', []) if 'u' in sub), None)
    print(url)

856 {'subfields': [{'u': 'http://www.tandfonline.com/loi/tizo20#.VDfU900cRok'}, {'x': 'oa-j'}], 'ind1': '4', 'ind2': '0'}
http://www.tandfonline.com/loi/tizo20#.VDfU900cRok


In [11]:
import json

fin = open('./input/downloadedITNotOpenHandled.json')
lines = fin.read().split("\n")
fin.close()
for line in lines:
    entry = json.loads(line)
    for record in entry:
        parseRecord(record)

In [12]:
len(lines)

39

In [13]:
fin = open('./input/downloadedItNotOpenoalex.json')

for line in fin:
    if line[0] == '[' or line[0] == ',':
        parseRecord(json.loads(line[1:].strip()))
    else:
        continue
                    

In [14]:
import pandas as pd

idf = pd.DataFrame(data, columns = ['Record ID',
        'Date of Publication',
        'Publication Status',
        'Country of Publication',
        'Language',
        'ISSN',
        'ISSNL',
        'Continued by',
        'Continued from',
        'ddc_subject_classification',
        'Publisher',
        'Title',
        'Access Condition',
        'Copyright',
        'Access URL',
        'subject',
        'subject_level1',
        'subject_level2',
        'subject_level3'])

#idf.to_csv('./input/issnExtracted1.tsv',sep="\t")

In [15]:
idf[idf['Continued by'] != '' ]

Unnamed: 0,Record ID,Date of Publication,Publication Status,Country of Publication,Language,ISSN,ISSNL,Continued by,Continued from,ddc_subject_classification,Publisher,Title,Access Condition,Copyright,Access URL,subject,subject_level1,subject_level2,subject_level3
0,2385-2852,1991 - 2009,Closed,ITA,eng,2385-2852,1120-9992,2037-7460,,616,Padova Unipress,Basic and applied myology.,,,http://www.bio.unipd.it/bam/,APPLIED SCIENCES. MEDICINE. TECHNOLOGY,Medical sciences,Pathology. Clinical medicine,
1,1120-9992,1991 - 2009,Closed,ITA,eng,1120-9992,1120-9992,2037-7452,,,Padova Unipress,Basic and applied myology.,,,,"APPLIED SCIENCES. MEDICINE. TECHNOLOGY, MATHEM...","Biological sciences in general, Medical sciences","Human biology, Pathology. Clinical medicine",Physiology. Human and comparative physiology
5,1121-418X,1987 - 1990,Closed,ITA,ita,1121-418X,1121-418X,1121-4171,,,Verona Società italiana di endodonzia,Giornale di endodonzia.,,,,APPLIED SCIENCES. MEDICINE. TECHNOLOGY,Medical sciences,Pathology. Clinical medicine,
17,1594-6525,1985 - 1994,Closed,ITA,ita,1594-6525,1594-6525,1594-6517,,,Bergamo [s.n.],Quaderni del Dipartimento di Linguistica e Let...,,,,LANGUAGE. LINGUISTICS. LITERATURE,General questions relating to both linguistics...,Literatures of individual languages and langua...,
18,0392-4424,1975 - 2000,Closed,ITA,ita,0392-4424,0392-4424,1721-971X,0390-4067,,Parma SIFET,Bollettino della società italiana di topografi...,,,,"THE ARTS. RECREATION. ENTERTAINMENT. SPORT, MA...",Astronomy. Astrophysics. Space research. Geode...,Geodesy. Surveying. Photogrammetry. Remote sen...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1024,0026-4733,1946 - 2020,Closed,ITA,ita,0026-4733,0026-4733,2724-5691,0390-9417,617,Torino Edizioni Minerva Medica,Minerva chirurgica.,,,,APPLIED SCIENCES. MEDICINE. TECHNOLOGY,Medical sciences,Surgery. Orthopaedics. Ophthalmology,
1038,2038-1700,1882 - 2009,Closed,ITA,ita,2038-1700,0037-8763,2038-1727,,550,Roma Società Geologica Italiana,Bollettino della Società Geologica Italiana.,,,http://www.socgeol.info/Ricerca/pubblicazioni.asp,MATHEMATICS. NATURAL SCIENCES,Earth Sciences. Geological sciences,,
1039,0037-8763,1882 - 2009,Closed,ITA,ita,0037-8763,0037-8763,2038-1719,0366-2241,550,Roma Società Geologica Italiana,Bollettino della Società Geologica Italiana.,,,,MATHEMATICS. NATURAL SCIENCES,Earth Sciences. Geological sciences,,
1044,0366-2241,1947 - 1998,Closed,ITA,ita,0366-2241,0366-2241,0037-8763,0366-2314,,Roma Servizio Geologico d'Italia,Bollettino del Servizio Geologico d'Italia.,,,,,,,


In [7]:
len(idf)

83

In [16]:
idf.to_csv('./input/issnExtracted-Simone.tsv',sep="\t")