In [None]:
from IPython.display import JSON
from ratelimit import limits, sleep_and_retry
import json
import time
import ast
from tqdm import tqdm

# Register tqdm's pandas integration so the `progress_apply` calls used in the
# cells below are available on DataFrames.
tqdm.pandas()

import pandas as pd
import xmltodict
from tqdm.notebook import tqdm, trange

from serpapi import GoogleSearch

import sys, os

# State shared by blockPrint()/enablePrint(): the stream that was active before
# silencing and the devnull handle that replaced it. Both are None while stdout
# is not silenced by blockPrint().
_previous_stdout = None
_devnull_stdout = None


# Disable
def blockPrint():
    """Silence ``print`` output by pointing ``sys.stdout`` at ``os.devnull``.

    The stream that was active at call time is remembered so ``enablePrint()``
    can put exactly that stream back. Calling this while output is already
    silenced is a no-op, so repeated calls neither lose the remembered stream
    nor open additional devnull handles.
    """
    global _previous_stdout, _devnull_stdout
    if _devnull_stdout is not None:
        return
    _previous_stdout = sys.stdout
    _devnull_stdout = open(os.devnull, "w")
    sys.stdout = _devnull_stdout


# Restore
def enablePrint():
    """Undo ``blockPrint()``: restore the previous stdout and close devnull.

    If ``blockPrint()`` was not called first there is nothing remembered to
    restore, and the function falls back to ``sys.__stdout__`` (the
    interpreter's original stream), which is what it always did before.
    """
    global _previous_stdout, _devnull_stdout
    if _devnull_stdout is None:
        sys.stdout = sys.__stdout__
        return
    # Only swap the stream back if our sink is still the one installed; if
    # other code replaced sys.stdout in the meantime, leave its stream alone.
    if sys.stdout is _devnull_stdout:
        sys.stdout = _previous_stdout
    _devnull_stdout.close()
    _previous_stdout = None
    _devnull_stdout = None

### Read in URLs

In [None]:
# loading 25494 unique OJS journal URLs
# df = pd.read_csv("../data/gscholar_urls.csv")
# Active input file; going by its name it presumably lists the domains that
# still need to be checked on Google Scholar -- confirm.
df = pd.read_csv("../data/scholar_tocheck_domains.csv")
# Number of rows loaded (shown as the cell output).
len(df)

In [None]:
# Uncomment to try the pipeline on two random rows only:
# df = df.sample(2)

# The query cell further down reads the "url" column, so expose "domain" under that name.
df = df.rename(columns={"domain": "url"})
df

### Search GScholar

In [None]:
# function to query gscholar max 15000 times per hour (our serpapi limit is 20000)

# Redirect stdout to os.devnull from this point on. The redirection is global:
# print output stays silenced in every later cell until enablePrint() is called.
blockPrint()


@sleep_and_retry
@limits(calls=15000, period=3600)
def query_journal_url(url):
    """Run a ``site:<url>`` Google Scholar search through SerpApi.

    The decorators cap this function at 15000 calls per 3600 seconds;
    ``sleep_and_retry`` waits and retries once the cap is reached.

    Args:
        url: Domain or URL the search is restricted to.

    Returns:
        The response dict from ``GoogleSearch(...).get_dict()``. If building or
        running the search raises, the ``repr()`` of the exception is returned
        as a string instead, so a single failing row does not abort a bulk run.

    Raises:
        RuntimeError: If the ``SERPAPI_API_KEY`` environment variable is not set.
    """
    # The key is read from the environment so the secret never lives in the
    # notebook. NOTE(review): the key that used to be hard-coded here should be
    # treated as leaked and rotated.
    api_key = os.environ.get("SERPAPI_API_KEY")
    if not api_key:
        # Fail fast, outside the try block: a missing key is a setup error and
        # must not be recorded as a per-row error string for every domain.
        raise RuntimeError(
            "Set the SERPAPI_API_KEY environment variable before querying SerpApi."
        )

    try:
        params = {
            "engine": "google_scholar",
            "q": "site:" + url,
            "api_key": api_key,
        }
        return GoogleSearch(params).get_dict()
    except Exception as e:
        return repr(e)


def get_total_results(result_json):
    """Extract the hit count (or the empty-result state) from a search response.

    Args:
        result_json: Value stored by ``query_journal_url``: normally the
            response dict, but an error string when the query failed.

    Returns:
        ``search_information["total_results"]`` when present, otherwise
        ``search_information["organic_results_state"]`` (e.g. ``"Fully empty"``).
        ``None`` when the value is not a dict or carries no usable
        ``search_information`` section, so failed queries do not crash the
        whole ``apply``.
    """
    if not isinstance(result_json, dict):
        # Failed queries are stored as repr(exception) strings, not dicts.
        return None

    info = result_json.get("search_information")
    if not isinstance(info, dict):
        return None

    if "total_results" in info:
        return info["total_results"]
    return info.get("organic_results_state")

In [None]:
## $$$ COSTS MONEY $$$ ##
# Calls query_journal_url once per row and stores what it returns in
# "result_json": the response dict, or the repr() of the exception on failure.
df["result_json"] = df.progress_apply(lambda row: query_journal_url(row["url"]), axis=1)

In [None]:
# output to csv
# df.to_csv("gscholar_urls_mapped.csv", encoding="utf-8", index=False)

In [None]:
# Derive "n_results" from each stored response: the reported total, or the
# "organic_results_state" value when no total is present.
df["n_results"] = df.progress_apply(
    lambda row: get_total_results(row["result_json"]), axis=1
)

In [None]:
# Preview the first rows with the new result_json / n_results columns.
df.head()

In [None]:
# Frequency of each distinct n_results value (how many domains share each
# count or state string); the overall total is computed in the next cell.
df["n_results"].value_counts()

In [None]:
# Total number of hits across all domains, leaving out rows whose n_results
# holds the "Fully empty" state string instead of a count.
df.loc[df["n_results"] != "Fully empty", "n_results"].sum()

In [None]:
# Render the first row's stored response as an interactive JSON tree.
JSON(df["result_json"].iloc[0])

In [None]:
# output to csv
# df.to_csv("gscholar_domains_mapped.csv", encoding="utf-8", index=False)

In [None]:
# Display the frame as it stands after the query and n_results steps.
df

### Read back data for stats

In [None]:
# Reload the saved query results and keep only the domains whose n_results is
# not the "Fully empty" state. After the CSV round trip "result_json" holds
# strings, which is why the cells below parse it with ast.literal_eval.
df = (
    pd.read_csv("gscholar_domains_mapped.csv")
    .loc[lambda frame: frame["n_results"] != "Fully empty"]
    .reset_index(drop=True)
)
df

In [None]:
# Spot check: parse the stored response string of the row at position 232 and
# render it as a JSON tree (the index looks like an arbitrary sample -- confirm).
JSON(ast.literal_eval(df["result_json"].iloc[232]))

In [None]:
def get_single_citation(x):
    """Return the "cited by" total of one organic result, or 0 if it has none.

    Args:
        x: One entry of a response's ``organic_results`` list.

    Returns:
        ``x["inline_links"]["cited_by"]["total"]``, or ``0`` when any level of
        that path is missing or ``x`` cannot be indexed that way.
    """
    try:
        return x["inline_links"]["cited_by"]["total"]
    except (KeyError, TypeError, IndexError):
        # Only lookup failures mean "no citation data"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return 0


def _organic_results(result_json):
    """Return the ``organic_results`` list of a response, or ``[]`` if absent."""
    payload = result_json
    if not isinstance(payload, dict):
        # Values read back from CSV are the string form of the response dict.
        # literal_eval parses literals only; it does not execute code.
        payload = ast.literal_eval(payload)
    if not isinstance(payload, dict):
        raise TypeError(
            "expected a response dict, got " + type(payload).__name__
        )
    return payload.get("organic_results") or []


def get_total_citations(result_json):
    """Sum the "cited by" totals over all organic results of one response.

    Args:
        result_json: The response dict, or its string form as read back from CSV.

    Returns:
        The summed citation count; ``0`` when the response has no
        ``organic_results``.

    Raises:
        ValueError, SyntaxError: If a non-dict value cannot be parsed by
            ``ast.literal_eval``.
        TypeError: If the parsed value is not a dict.
    """
    return sum(get_single_citation(entry) for entry in _organic_results(result_json))


def get_max_citation_by_article(result_json):
    """Return the largest "cited by" total among a response's organic results.

    Args:
        result_json: The response dict, or its string form as read back from CSV.

    Returns:
        The highest citation count of any single result, never below ``0``;
        ``0`` when the response has no ``organic_results``.

    Raises:
        ValueError, SyntaxError: If a non-dict value cannot be parsed by
            ``ast.literal_eval``.
        TypeError: If the parsed value is not a dict.
    """
    # The leading 0 keeps the floor of 0 and covers the empty list.
    return max([0] + [get_single_citation(entry) for entry in _organic_results(result_json)])

In [None]:
# Spot check: summed citation count for the row at position 1.
get_total_citations(df["result_json"].iloc[1])

In [None]:
# Spot check: citation count of the first organic result in the same row.
get_single_citation(ast.literal_eval(df["result_json"].iloc[1])["organic_results"][0])

In [None]:
# Per domain: sum of the "cited by" totals over the organic results contained
# in the stored response (presumably a single page of results -- confirm).
df["n_citations"] = df.progress_apply(
    lambda row: get_total_citations(row["result_json"]), axis=1
)
df

In [None]:
# Per domain: the highest "cited by" total of any single organic result in the
# stored response.
df["max_cite_per_article"] = df.progress_apply(
    lambda row: get_max_citation_by_article(row["result_json"]), axis=1
)
df

In [None]:
# Summary statistics of the per-domain citation sums.
df["n_citations"].describe()

In [None]:
# Summary statistics of the per-domain most-cited-result counts.
df["max_cite_per_article"].describe()

In [None]:
# Rows whose most-cited result has exactly 15220 citations (presumably the
# maximum reported by describe() above -- confirm).
df.loc[df["max_cite_per_article"] == 15220]

In [None]:
# df[["url", "n_citations", "max_cite_per_article"]].to_csv("gscholar_citations.csv", encoding="utf-8", index=False)

In [None]:
# Switch the column name back from "url" to "domain" before exporting.
df = df.rename(columns={"url": "domain"})

In [None]:
# Write the per-domain citation sums to the shared data folder.
df.loc[:, ["domain", "n_citations"]].to_csv(
    "../data/scholar_present_domains_set2.csv",
    encoding="utf-8",
    index=False,
)