In [83]:
import pandas as pd
from urllib import request, parse
from bs4 import BeautifulSoup
import re
import copy
from IPython.display import display

In [84]:
# Load the record table from the pickle named "1.extracted data from xml"
# (presumably written by an earlier extraction step - TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
table = pd.read_pickle("files/1.extracted data from xml.pkl")

In [85]:
len(table)

257225

# Identify SCI, SSCI or EI journals

In [91]:
# Build the list of distinct (ISSN, EISSN) pairs, printing the row count after each step.
# Step 1: one row per (ISSN, EISSN, journal title) with the number of non-null abstracts.
journal_counts = (
    table
    .groupby(["ISSN", "EISSN", 'Published On'])
    .agg(count=("Abstract", "count"))
    .reset_index()
)
print(len(journal_counts))

# Step 2: keep only rows that carry at least one identifier.
has_identifier = (journal_counts["ISSN"] != "") | (journal_counts["EISSN"] != "")
journals_with_id = journal_counts[has_identifier]
print(len(journals_with_id))

# Step 3: collapse title variants so each (ISSN, EISSN) pair appears once (first row wins).
issn_eissn = journals_with_id.drop_duplicates(subset=["ISSN", "EISSN"])
print(len(issn_eissn))

2987
2169
2051


In [92]:
issn_eissn

Unnamed: 0,ISSN,EISSN,Published On,count
818,,1091-6490,Proceedings of the National Academy of Sciences,68
819,,1094-4087,Optics Express,4
820,,1099-1018,Fire and Materials,2
821,,1099-1077,Human Psychopharmacology: Clinical and Experim...,2
822,,1179-1500,Open Access Emergency Medicine,1
...,...,...,...,...
2981,2631-4967,,Airlines (International Air Transport Associat...,1
2982,8750-9229,,Roads & Bridges,404
2983,8755-1985,,Journal of Protective Coatings & Linings,32
2985,8756-1417,,Journal of Ship Production,0


In [88]:
# A network request builder and response collection function for the
# TU Wien index search page (used below to look up journals by ISSN or EISSN).
def request_paper_detail(text, timeout=30):
    """POST a search for ``text`` to the TU Wien index search and return the parsed page.

    Parameters
    ----------
    text : str
        Search string sent as the ``stext`` form field; the callers in this
        notebook pass an ISSN or EISSN.
    timeout : float, optional
        Seconds to wait on the connection before ``urlopen`` gives up.
        Defaults to 30 so that one stalled request cannot hang a long loop.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html.parser`` backend.

    Raises
    ------
    Network failures and timeouts raised by ``urllib.request.urlopen``
    propagate to the caller unchanged.
    """
    form_fields = {'secondpass': 1, 'lang': 2, 'searchsrc': "ALL",
                   "stext": text}
    url = "https://publik.tuwien.ac.at/info/sci_search.php"
    payload = parse.urlencode(form_fields).encode()
    req = request.Request(url, data=payload)  # supplying data makes the method "POST"
    # Parse inside the context manager so the body is fully read before the
    # connection is closed; the original never closed the response.
    with request.urlopen(req, timeout=timeout) as resp:
        return BeautifulSoup(resp, 'html.parser')

In [89]:
# Look up every journal with request_paper_detail (one HTTP request per row)
# and set the SCI / SSCI flags that the result page reports.
# The regex below cannot match "EI", so the EI column is only initialised here.
INDEX_FLAG_COLUMNS = ("SCI", "SSCI")

issn_eissn["SCI"] = False
issn_eissn["SSCI"] = False
issn_eissn["EI"] = False
for index, row in issn_eissn.iterrows():
    print(str(index))
    # Prefer the ISSN; fall back to the EISSN when the ISSN is missing or empty.
    search_by = row["ISSN"]
    if pd.isnull(search_by) or search_by == "":
        search_by = row["EISSN"]
    # Skip rows without a usable identifier BEFORE any string operation:
    # concatenating a NaN into the log message would raise TypeError.
    if pd.isnull(search_by) or search_by == "":
        continue

    # Insert the hyphen into bare identifiers (presumably the search expects
    # the NNNN-NNNN form - TODO confirm).
    if '-' not in search_by and len(search_by) > 4:
        search_by = search_by[:4] + "-" + search_by[4:]
    print("search by = " + search_by)
    soup = request_paper_detail(search_by)
    all_td = soup.find_all('td', {'align': 'left', 'valign': 'top'})

    if len(all_td) == 0:
        issn_eissn.at[index, "Searched Found Items"] = 0
    for each in all_td:
        journal_section = each.find_all('span')

        # NOTE(review): this is overwritten for every matching <td>, so the
        # last cell's span count wins - confirm that is the intended count.
        issn_eissn.at[index, "Searched Found Items"] = len(journal_section)

        for each_journal_section in journal_section:
            splitted_text = [
                text.strip()
                for text in str(each_journal_section.text).splitlines()
            ]
            print(splitted_text)
            # The index names are read from the third line (e.g. 'Index: SCI');
            # skip spans that are too short instead of raising IndexError.
            if len(splitted_text) < 3:
                continue
            # Flag every known index found on that line, and ignore any other
            # regex hit so that no unexpected column gets created.
            for category in re.findall(r'\bS\S+I\b', splitted_text[2]):
                if category in INDEX_FLAG_COLUMNS:
                    issn_eissn.at[index, category] = True
            


818
search by = 1091-6490
search by = 1091-6490
819
search by = 1094-4087
search by = 1094-4087
['OPTICS EXPRESS', 'ISSN: 1094-4087', 'Index: SCI']
820
search by = 1099-1018
search by = 1099-1018
821
search by = 1099-1077
search by = 1099-1077
822
search by = 1179-1500
search by = 1179-1500
823
search by = 1407-6179
search by = 1407-6179
824
search by = 1440-1819
search by = 1440-1819
825
search by = 1442-9071
search by = 1442-9071
826
search by = 1463-1326
search by = 1463-1326
827
search by = 1464-5491
search by = 1464-5491
828
search by = 1468-2079
search by = 1468-2079
829
search by = 1468-2710
search by = 1468-2710
830
search by = 1468-3296
search by = 1468-3296
831
search by = 1471-2962
search by = 1471-2962
832
search by = 1475-4959
search by = 1475-4959
833
search by = 1476-069X
search by = 1476-069X
['ENVIRONMENTAL HEALTH', 'ISSN: 1476-069X', 'Index: SCI']
834
search by = 1476-4431
search by = 1476-4431
835
search by = 1478-7547
search by = 1478-7547
['COST EFFECTIVENESS AND R


KeyboardInterrupt



In [None]:
issn_eissn

In [47]:
# Read the EI journal list. ISSN is forced to text so pandas never infers a
# numeric dtype for it: identifiers must stay strings for the .strip() and
# membership checks performed on this column later in the notebook.
ei_journals = pd.read_csv("data/ei_journals.csv", engine='python', dtype={"ISSN": str})

In [53]:
# Collect the EI ISSNs as a plain list: missing values dropped, surrounding whitespace trimmed.
ei_issn_present = ei_journals["ISSN"].dropna()
ei_issn_list = [entry.strip() for entry in ei_issn_present.to_list()]

In [54]:
ei_issn_list[:5]

['23297662', '7434618', '1491423', '10133119', '1266209']

In [ ]:
def _normalize_issn(value):
    """Return ``value`` as an 8-character ISSN key, or "" when it is missing.

    The key has no hyphen, is upper-cased (for the 'X' check digit) and is
    left-padded with zeros, so '0743-4618' and '7434618' both map to '07434618'.
    """
    if pd.isnull(value):
        return ""
    compact = str(value).replace("-", "").strip().upper()
    return compact.zfill(8) if compact else ""


# The EI list holds ISSNs without hyphen and without leading zeros
# (e.g. '7434618'), while this table stores 'NNNN-NNNN' strings. Comparing the
# raw text can therefore never match an ISSN that starts with 0, so both sides
# are normalised to the same 8-character key first.
ei_issn_keys = {_normalize_issn(n) for n in ei_issn_list}
ei_issn_keys.discard("")  # a missing identifier must never count as a match

# A journal is EI-indexed when either of its identifiers is in the EI list;
# rows that only carry an EISSN were previously never checked.
is_ei = (
    issn_eissn["ISSN"].map(_normalize_issn).isin(ei_issn_keys)
    | issn_eissn["EISSN"].map(_normalize_issn).isin(ei_issn_keys)
)
issn_eissn.loc[is_ei, "EI"] = True

In [ ]:
# Manual override: flag the single journal with ISSN 0733-947X, every other row stays False.
issn_eissn["Manual"] = issn_eissn["ISSN"].eq("0733-947X")

In [ ]:
# A journal is selected when any index flag or the manual override is set.
flag_columns = ["SCI", "SSCI", "EI", "Manual"]
selected = issn_eissn[flag_columns[0]]
for flag in flag_columns[1:]:
    selected = selected | issn_eissn[flag]
issn_eissn["Selected"] = selected

In [20]:
# Persist the flagged journal table twice: as CSV (without the index column)
# and as a pickle that keeps the DataFrame as-is for reloading.
issn_eissn.to_csv("files/2.0 identified_sci_ssci_ei_journals.csv", index=False)
issn_eissn.to_pickle("files/2.0 identified_sci_ssci_ei_journals.pkl")  

# Identify valid conferences

In [95]:
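# Optional shortcut: uncomment to reload the saved result instead of re-running the lookup cells above.
# issn_eissn = pd.read_pickle("files/2.0 identified_sci_ssci_ei_journals.pkl")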

In [96]:
issn_eissn

Unnamed: 0,ISSN,EISSN,Published On,count,SCI,SSCI,EI,Searched Found Items,Manual,Selected
0,0361-1981,,Transportation Research Record: Journal of the...,9815.0,True,False,False,1.0,False,True
1,0041-1558,,Transport Topics,6998.0,False,False,False,0.0,False,False
2,0005-2175,,Aviation Week & Space Technology,3294.0,False,False,False,0.0,False,False
3,0001-4575,,Accident Analysis & Prevention,3082.0,False,True,False,1.0,False,True
4,0887-9877,,Aviation International News,2925.0,False,False,False,0.0,False,False
...,...,...,...,...,...,...,...,...,...,...
2020,0095-0696,,Journal of Environmental Economics and Management,1.0,False,True,False,1.0,False,True
2021,1540-7330,,Journal of Prevention and Intervention in the ...,1.0,False,False,False,0.0,False,False
2022,1468-0491,,Governance,1.0,False,False,False,0.0,False,False
2023,1059-7123,1741-2633,Adaptive Behavior,1.0,True,True,False,2.0,False,True


In [97]:
# Keep the records that name a conference, then reduce them to one row per
# conference name with only the descriptive columns.
has_conference = table["Conference"] != ""
table_conference = table[has_conference]
conference_unique = (
    table_conference
    .drop_duplicates(subset=["Conference"])
    [["Conference", "Conference Location", "Type"]]
)