# Check Basics

> Latest stats are from 2020, either because scimago stopped collecting data, or newer data is deemed unreliable regarding citations

In [6]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("scimago-journals.csv")

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
sourceid,28773,19434,20315,29431,21100812243
rank,1,2,3,4,5
title,Ca-A Cancer Journal for Clinicians,MMWR Recommendations and Reports,Nature Reviews Molecular Cell Biology,Quarterly Journal of Economics,Nature Reviews Materials
type,journal,journal,journal,journal,journal
issn,"15424863, 00079235","10575987, 15458601","14710072, 14710080","00335533, 15314650",20588437
h_index,168,143,431,259,108
total_docs_2020,47,10,115,40,92
ref_per_doc,73.45,129.2,73.38,68.33,115.57
sjr_best_quartile,Q1,Q1,Q1,Q1,Q1
total_docs_3years,119,9,338,110,264


In [31]:
def get_cats(df):
    _cs = df["categories"].str.split("; ")
    idk = "sourceid"
    _df = pd.DataFrame(
        {"catbase": _cs.sum(), idk: np.repeat(df[idk].values, _cs.str.len())}
    )
    return pd.concat([_df, _df["catbase"].str.extract("(.*) \((.*)\)")], axis=1).assign(
        field=lambda df: np.where(df.loc[:, 0].isna(), df["catbase"], df.loc[:, 0])
    ).rename(columns={1: "Q"}).loc[:, ["field", "Q", idk]]

In [46]:
cat_base = get_cats(df.head(10000)).merge(df)

In [47]:
field_pivot = (
    cat_base.fillna("no Q")
    .pivot_table(index="field", columns="Q", values="sourceid", aggfunc="count")
    .fillna(0)
    .assign(s=lambda df: df.sum(axis=1))
    .sort_values("s", ascending=False)
)
field_pivot.head(15)

Q,Q1,Q2,Q3,Q4,miscellaneous,no Q,s
field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Medicine (miscellaneous),599.0,600.0,98.0,0.0,0.0,0.0,1297.0
Education,318.0,177.0,0.0,0.0,0.0,6.0,501.0
Sociology and Political Science,320.0,35.0,0.0,0.0,0.0,2.0,357.0
"Ecology, Evolution, Behavior and Systematics",163.0,152.0,0.0,0.0,0.0,0.0,315.0
Computer Science Applications,162.0,115.0,0.0,0.0,0.0,36.0,313.0
Molecular Biology,99.0,99.0,98.0,14.0,0.0,0.0,310.0
Economics and Econometrics,165.0,134.0,0.0,0.0,0.0,2.0,301.0
Biochemistry,108.0,108.0,70.0,0.0,0.0,0.0,286.0
Psychiatry and Mental Health,133.0,134.0,13.0,0.0,0.0,0.0,280.0
"Public Health, Environmental and Occupational Health",137.0,137.0,0.0,0.0,0.0,0.0,274.0


In [95]:
def draw_table(df):
    return (
        df.pivot_table(index="field", columns="Q")
        .loc[field_pivot.head(15).index]
        .loc[:, lambda df: df.isna().mean() < 0.5]
        .style.background_gradient(axis=0)
    )

In [94]:
(
    cat_base.groupby(["field", "Q"])[["h_index", "journal_rating", "total_docs_3years"]]
    .mean()
    .pipe(draw_table)
)

Unnamed: 0_level_0,h_index,h_index,journal_rating,journal_rating,total_docs_3years,total_docs_3years
Q,Q1,Q2,Q1,Q2,Q1,Q2
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Medicine (miscellaneous),113.75793,64.521667,2.106803,0.740397,773.702838,518.175
Education,54.660377,31.446328,1.381597,0.601864,174.034591,126.163842
Sociology and Political Science,62.084375,26.371429,1.410753,0.531057,164.221875,94.4
"Ecology, Evolution, Behavior and Systematics",96.122699,49.282895,1.769196,0.653612,433.822086,248.486842
Computer Science Applications,91.950617,42.008696,1.584296,0.630165,638.679012,205.217391
Molecular Biology,159.848485,101.565657,4.443717,1.34302,586.868687,751.464646
Economics and Econometrics,92.212121,39.171642,3.524309,0.733552,253.248485,157.052239
Biochemistry,135.898148,84.87963,2.844343,0.951731,750.87037,660.981481
Psychiatry and Mental Health,107.609023,53.977612,2.119827,0.812478,448.887218,205.895522
"Public Health, Environmental and Occupational Health",82.007299,47.248175,1.658219,0.68992,509.080292,333.328467


In [88]:
def gini(s):
    vc = s.value_counts(normalize=True)
    diffs = np.abs(vc.values.reshape(-1, 1) - vc.values.reshape(1, -1))
    return diffs.sum() / (2 * vc.shape[0] ** 2 * vc.mean())

def top5(s):
    return s.value_counts(normalize=True).head(5).sum()

## Concentration metrics by field

In [96]:
cat_base.groupby(["field", "Q"])[["country", "publisher"]].agg([gini, top5]).pipe(draw_table)

Unnamed: 0_level_0,country,country,country,country,publisher,publisher,publisher,publisher
Unnamed: 0_level_1,gini,gini,top5,top5,gini,gini,top5,top5
Q,Q1,Q2,Q1,Q2,Q1,Q2,Q1,Q2
field,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Medicine (miscellaneous),0.803075,0.772549,0.894825,0.783333,0.573972,0.579221,0.216443,0.207358
Education,0.792004,0.774325,0.962264,0.898305,0.66965,0.573316,0.540881,0.525424
Sociology and Political Science,0.821429,0.612245,0.971875,0.942857,0.66709,0.3,0.478125,0.485714
"Ecology, Evolution, Behavior and Systematics",0.701578,0.647478,0.920245,0.743421,0.539318,0.407207,0.447205,0.273333
Computer Science Applications,0.726337,0.635404,0.950617,0.86087,0.47779,0.412267,0.403727,0.347826
Molecular Biology,0.679798,0.618493,0.949495,0.878788,0.441667,0.389881,0.424242,0.346939
Economics and Econometrics,0.581818,0.67853,0.987879,0.91791,0.553788,0.468169,0.509091,0.380597
Biochemistry,0.656566,0.654321,0.916667,0.925926,0.357474,0.408589,0.259259,0.333333
Psychiatry and Mental Health,0.720193,0.710021,0.917293,0.88806,0.42324,0.432836,0.285714,0.30597
"Public Health, Environmental and Occupational Health",0.71982,0.696559,0.919708,0.79562,0.429225,0.436334,0.335821,0.328467
