In [160]:
from api_wrapper import search_scopus
import os
# os.environ['PYB_CONFIG_FILE'] = "./pybliometrics.cfg"
os.environ['PYB_CONFIG_FILE'] = "~/.config/pybliometrics.cfg"


from pybliometrics.scopus import ScopusSearch, CitationOverview
import pandas as pd
import requests
import yaml
import itertools as it
from tqdm import tqdm
import glob
import json
import sys
from time import sleep
import numpy as np

rng = np.random.default_rng()
pd.set_option('max_colwidth', None)


In [120]:
def combine_terms(term_list, how='and'):

    if how == 'and':
        joiner = " AND "
    elif how == 'or':
        joiner = " OR "

    terms = []
    for term in term_list:
        if term:
            if " " in term:
                terms.append(f"\"{term}\"")
            else:
                terms.append(f"{term}")
        else:
            continue
    terms = joiner.join(terms)

    return terms

In [None]:
TERM_TYPES = {'lang':'LANGUAGE',
              'subject':'SUBJAREA'}
def limit_terms(term_list, term_type):

    limit_query = []
    try:
        code = TERM_TYPES[term_type]
    except ValueError:
        raise ValueError
    
    for term in term_list:
        q = f"LIMIT-TO({code}, \"term\")"
        limit_query.append(q)
    
    
    return 

In [144]:
with open('keywords.yml', 'r') as file:
    data = yaml.load(file, yaml.SafeLoader)

In [145]:
data.keys()

dict_keys(['Include', 'Exclude', 'incl_subjects', 'excl_subjects', 'all_subjects'])

In [146]:
include = data['Include']
include.keys()

dict_keys(['Energy modelling', 'Energy justice', 'Aspects of Justice', 'Location', 'Planning processes', 'Model development'])

In [147]:
keys_q6 = ["Location", 
           "Energy justice", 
           "Planning processes", 
           "Energy modelling"]

In [148]:
exclude_subjects = combine_terms(data['excl_subjects'], how='or')

In [149]:
terms_list = []
for key in keys_q6:
    topic_terms = combine_terms(include[key], how='or')
    query_str = f"TITLE-ABS-KEY({topic_terms})"
    terms_list.append(query_str)
    # display(query_str.strip('\''))
long_query = " AND ".join(terms_list)
long_query

'TITLE-ABS-KEY(Municipal* OR Local* OR Community OR City) AND TITLE-ABS-KEY(justice OR equity) AND TITLE-ABS-KEY(Planning OR vision* OR goal* OR Decision*making OR Participat* OR Deliberat* OR Democra* OR Transpar*) AND TITLE-ABS-KEY("Energy systems" OR "Energy model*" OR "Energy system optimization" OR Multi*objective)'

In [150]:
excluded_terms = combine_terms(data['Exclude'], how='or')
excluded_terms

'building OR machine*learning OR "artificial intelligence" OR AI OR "Game theor*" OR "Integrated assessment model" OR Agriculture'

In [151]:
subjects = combine_terms(data['incl_subjects'], how='or')
subjects

'COMP OR EART OR ENER OR ENGI OR ENVI OR MATH OR DECI OR ECON OR SOCI'

In [152]:
q6 = (f"{long_query} " + 
        f"AND NOT TITLE-ABS-KEY({excluded_terms}) " +
        f"AND SUBJAREA({subjects}) " +
        f"AND NOT SUBJAREA({exclude_subjects}) " +
        f"AND LANGUAGE(english)")
q6

'TITLE-ABS-KEY(Municipal* OR Local* OR Community OR City) AND TITLE-ABS-KEY(justice OR equity) AND TITLE-ABS-KEY(Planning OR vision* OR goal* OR Decision*making OR Participat* OR Deliberat* OR Democra* OR Transpar*) AND TITLE-ABS-KEY("Energy systems" OR "Energy model*" OR "Energy system optimization" OR Multi*objective) AND NOT TITLE-ABS-KEY(building OR machine*learning OR "artificial intelligence" OR AI OR "Game theor*" OR "Integrated assessment model" OR Agriculture) AND SUBJAREA(COMP OR EART OR ENER OR ENGI OR ENVI OR MATH OR DECI OR ECON OR SOCI) AND NOT SUBJAREA(HEAL OR DENT OR MEDI OR MULT OR NURS OR VETE OR AGRI OR BIOC OR IMMU OR NEUR OR PHAR OR BUSI OR CHEM OR CENG OR PSYC) AND LANGUAGE(english)'

In [153]:
combinations = list(it.product(*[v for k, v in include.items() if k in keys_q6]))

In [154]:
len(combinations)

256

In [104]:
q6 = f"TITLE-ABS-KEY({combine_terms(combinations[0])})"

In [105]:
x = ScopusSearch(query=q6, view='COMPLETE')

In [106]:
excluded_terms = combine_terms(data['Exclude'], how='or')

In [107]:
frames=[]
for combo in tqdm(combinations[:3]):
        combined_terms = combine_terms(combo)
        q6 = f"TITLE-ABS-KEY({combined_terms}) AND NOT TITLE-ABS-KEY({excluded_terms})"

        x = ScopusSearch(query=q6, view='COMPLETE')

        if x.results:
            print(f"Query: {combined_terms} , Results count: {len(x.results)}", end='\n', flush=True)
            results = pd.DataFrame([doc._asdict() for doc in x.results])
            frames.append(results)

            # store the results and add the ref_docs key to store each reference
            for doc in tqdm(x.results, file=sys.stdout):
                sleep(0.1)
                rand_val = rng.integers(low=0,high=1)
                if rand_val == 1:
                    print("SKIP (File already exists)", end='\r', flush=True)
        else:
            print(f"Query: {combined_terms} , Results count: {x.results}", end='\n', flush=True)

    

  0%|          | 0/3 [00:00<?, ?it/s]

Query: 'Energy systems' AND 'Energy justice' AND Municipal AND 'Energy planning' , Results count: 2
100%|██████████| 2/2 [00:00<00:00,  9.52it/s]

 33%|███▎      | 1/3 [00:00<00:00,  4.04it/s]


Query: 'Energy systems' AND 'Energy justice' AND Municipal AND Planning , Results count: 2
100%|██████████| 2/2 [00:00<00:00,  9.35it/s]

 67%|██████▋   | 2/3 [00:00<00:00,  4.30it/s]


Query: 'Energy systems' AND 'Energy justice' AND Municipal AND 'Municipal planning' , Results count: 2
100%|██████████| 2/2 [00:00<00:00,  9.51it/s]

100%|██████████| 3/3 [00:00<00:00,  4.30it/s]







In [43]:
# q = "TITLE-ABS-KEY ( energy AND justice ) AND DOCTYPE ( re ) AND PUBYEAR > 2015"
frames = []
for combo in tqdm(combinations):
    q6 = f"TITLE-ABS-KEY({combine_terms(combo)})"
    x = ScopusSearch(query=q6, view='COMPLETE')
    if x.results:
        print(f"{len(x.results)} matches found.", flush=True, end='\')
        results = pd.DataFrame([doc._asdict() for doc in x.results])
        # results['query_terms'] = combo*len(results)
        frames.append(results)

  0%|          | 0/1600 [00:00<?, ?it/s]

2 matches found.
2 matches found.
2 matches found.
1 matches found.
6 matches found.


  0%|          | 7/1600 [00:00<02:30, 10.58it/s]

3 matches found.
1 matches found.


  1%|          | 9/1600 [00:01<06:18,  4.20it/s]

2 matches found.


  1%|          | 11/1600 [00:03<12:30,  2.12it/s]

39 matches found.


  1%|          | 12/1600 [00:31<2:54:31,  6.59s/it]

39 matches found.


  1%|          | 13/1600 [00:42<3:19:31,  7.54s/it]

2 matches found.


  1%|          | 14/1600 [00:42<2:31:09,  5.72s/it]

3 matches found.


  1%|          | 15/1600 [01:04<4:21:05,  9.88s/it]

26 matches found.


  1%|          | 17/1600 [01:15<3:18:52,  7.54s/it]

39 matches found.


  1%|          | 17/1600 [01:23<2:10:04,  4.93s/it]


KeyboardInterrupt: 

In [50]:
files = glob.glob("./output/question_06/*.json")
journal = set()
for f in files:
    with open(f, 'r') as doc:
        data = json.load(doc)
    journal.add(data['publicationName'])

In [52]:
len(journal)

2116

In [38]:
df = pd.concat(frames, axis=0)