# Check Basics

> Latest stats are from 2021, either because scimago stopped collecting data, or newer data is deemed unreliable regarding citations

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("journal.csv")

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
sourceid,21100939709,21100842169,22882,21100830485,13244
rank,392,659,661,799,854
title,Proceedings of the 8th USENIX Symposium on Ope...,AISec 2017 - Proceedings of the 10th ACM Works...,Proceedings - Annual IEEE Symposium on Foundat...,SIGCOMM 2017 - Proceedings of the 2017 Confer...,Evidence-Based Medicine
type,conference and proceedings,conference and proceedings,conference and proceedings,conference and proceedings,journal
issn,-,-,02725428,-,"14736810, 13565524"
h_index,4,9,97,27,28
total_docs_2020,0.0,0.0,0.0,0.0,0.0
ref_per_doc,0.0,0.0,0.0,0.0,0.0
sjr_best_quartile,-,-,-,-,Q1
total_docs_3years,6,16,92,37,145


In [4]:
def get_cats(df):
    _cs = df["categories"].str.split("; ")
    idk = "sourceid"
    _df = pd.DataFrame(
        {"catbase": _cs.sum(), idk: np.repeat(df[idk].values, _cs.str.len())}
    )
    return pd.concat([_df, _df["catbase"].str.extract("(.*) \((.*)\)")], axis=1).assign(
        field=lambda df: np.where(df.loc[:, 0].isna(), df["catbase"], df.loc[:, 0])
    ).rename(columns={1: "Q"}).loc[:, ["field", "Q", idk]]

In [5]:
cat_base = get_cats(df.head(10000)).merge(df)

In [6]:
field_pivot = (
    cat_base.fillna("no Q")
    .pivot_table(index="field", columns="Q", values="sourceid", aggfunc="count")
    .fillna(0)
    .assign(s=lambda df: df.sum(axis=1))
    .sort_values("s", ascending=False)
)
field_pivot.head(15)

Q,Q1,Q2,Q3,Q4,arts and humanities,clinical,medical,miscellaneous,no Q,nursing,social science,s
field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Computer Networks and Communications,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2552.0,0.0,0.0,2553.0
Computer Science Applications,3.0,1.0,2.0,5.0,0.0,0.0,0.0,0.0,1634.0,0.0,0.0,1645.0
Electrical and Electronic Engineering,1.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,1428.0,0.0,0.0,1442.0
Artificial Intelligence,0.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,1298.0,0.0,0.0,1304.0
Hardware and Architecture,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1216.0,0.0,0.0,1217.0
Software,3.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,1146.0,0.0,0.0,1153.0
"Safety, Risk, Reliability and Quality",0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,922.0,0.0,0.0,925.0
Information Systems,2.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,886.0,0.0,0.0,893.0
Instrumentation,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,796.0,0.0,0.0,797.0
Control and Optimization,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,793.0,0.0,0.0,794.0


In [7]:
def draw_table(df):
    return (
        df.pivot_table(index="field", columns="Q")
        .loc[field_pivot.head(15).index]
        .loc[:, lambda df: df.isna().mean() < 0.5]
        .style.background_gradient(axis=0)
    )

In [8]:
(
    cat_base.groupby(["field", "Q"])[["h_index", "journal_rating", "total_docs_3years"]]
    .mean()
    .pipe(draw_table)
)

Unnamed: 0_level_0,h_index,journal_rating,total_docs_3years
Q,Q4,Q4,Q4
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Computer Networks and Communications,,,
Computer Science Applications,24.2,0.1522,194.4
Electrical and Electronic Engineering,10.307692,0.111462,50.076923
Artificial Intelligence,21.0,0.106,185.5
Hardware and Architecture,,,
Software,31.0,0.116,323.333333
"Safety, Risk, Reliability and Quality",6.0,0.134,75.0
Information Systems,11.75,0.1375,44.75
Instrumentation,6.0,0.107,95.0
Control and Optimization,,,


In [9]:
def gini(s):
    vc = s.value_counts(normalize=True)
    diffs = np.abs(vc.values.reshape(-1, 1) - vc.values.reshape(1, -1))
    return diffs.sum() / (2 * vc.shape[0] ** 2 * vc.mean())

def top5(s):
    return s.value_counts(normalize=True).head(5).sum()

## Concentration metrics by field

In [10]:
cat_base.groupby(["field", "Q"])[["country", "publisher"]].agg([gini, top5]).pipe(draw_table)

Unnamed: 0_level_0,country,country,publisher,publisher
Unnamed: 0_level_1,gini,top5,gini,top5
Q,Q4,Q4,Q4,Q4
field,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Computer Networks and Communications,,,,
Computer Science Applications,0.15,1.0,0.0,1.0
Electrical and Electronic Engineering,0.221154,0.769231,0.13986,0.538462
Artificial Intelligence,0.0,1.0,0.0,1.0
Hardware and Architecture,,,,
Software,0.166667,1.0,0.0,1.0
"Safety, Risk, Reliability and Quality",0.0,1.0,0.0,1.0
Information Systems,0.0,1.0,0.0,1.0
Instrumentation,0.0,1.0,0.0,1.0
Control and Optimization,,,,
