# Field Analysis

> Latest stats are from 2021, either because scimago stopped collecting data, or newer data is deemed unreliable regarding citations

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("journal.csv")

In [3]:
def get_cats(df):
    _cs = df["categories"].str.split("; ")
    idk = "sourceid"
    _df = pd.DataFrame(
        {"catbase": _cs.sum(), idk: np.repeat(df[idk].values, _cs.str.len())}
    )
    return pd.concat([_df, _df["catbase"].str.extract("(.*) \((.*)\)")], axis=1).assign(
        field=lambda df: np.where(df.loc[:, 0].isna(), df["catbase"], df.loc[:, 0])
    ).rename(columns={1: "Q"}).loc[:, ["field", "Q", idk]]

In [4]:
cat_base = get_cats(df.head(10000)).merge(df)

In [5]:
field_pivot = (
    cat_base.fillna("no Q")
    .pivot_table(index="field", columns="Q", values="sourceid", aggfunc="count")
    .fillna(0)
    .assign(s=lambda df: df.sum(axis=1))
    .sort_values("s", ascending=False)
)
field_pivot.head(15).style.set_caption("Count values by fields")

Q,Q1,Q2,Q3,Q4,miscellaneous,no Q,s
field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Medicine (miscellaneous),612.0,611.0,5.0,0.0,0.0,0.0,1228.0
Education,337.0,166.0,0.0,0.0,0.0,0.0,503.0
Computer Science Applications,174.0,174.0,4.0,0.0,0.0,3.0,355.0
Sociology and Political Science,327.0,22.0,0.0,0.0,0.0,0.0,349.0
Economics and Econometrics,174.0,150.0,0.0,0.0,0.0,0.0,324.0
"Ecology, Evolution, Behavior and Systematics",172.0,127.0,0.0,0.0,0.0,0.0,299.0
Molecular Biology,99.0,100.0,99.0,0.0,0.0,0.0,298.0
Psychiatry and Mental Health,137.0,137.0,17.0,0.0,0.0,0.0,291.0
Electrical and Electronic Engineering,168.0,106.0,0.0,0.0,0.0,13.0,287.0
"Public Health, Environmental and Occupational Health",142.0,136.0,0.0,0.0,0.0,0.0,278.0


In [6]:
def draw_table(df):
    return (
        df.pivot_table(index="field", columns="Q")
        .loc[field_pivot.head(15).index]
        .loc[:, lambda df: df.isna().mean() < 0.5]
        .style.background_gradient(axis=0)
    )

In [7]:
(
    cat_base.groupby(["field", "Q"])[["h_index", "journal_rating", "total_docs_3years"]]
    .mean()
    .pipe(draw_table)
    .set_caption("Mean values of fields")
)

Unnamed: 0_level_0,h_index,h_index,journal_rating,journal_rating,total_docs_3years,total_docs_3years
Q,Q1,Q2,Q1,Q2,Q1,Q2
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Medicine (miscellaneous),118.382353,71.324059,1.977067,0.664445,825.107843,555.322422
Education,57.827893,33.114458,1.25373,0.592361,184.115727,144.325301
Computer Science Applications,92.54023,43.367816,1.891839,0.707368,642.017241,268.563218
Sociology and Political Science,65.577982,33.136364,1.309413,0.521091,173.562691,113.545455
Economics and Econometrics,95.022989,41.38,3.128592,0.70642,267.258621,166.88
"Ecology, Evolution, Behavior and Systematics",97.505814,54.354331,1.54811,0.612693,444.72093,264.866142
Molecular Biology,165.111111,103.75,3.755131,1.14669,585.40404,786.84
Psychiatry and Mental Health,111.423358,57.29927,1.914197,0.768876,458.029197,238.021898
Electrical and Electronic Engineering,111.005952,61.264151,2.033702,0.624264,892.178571,1069.367925
"Public Health, Environmental and Occupational Health",83.56338,52.441176,1.612197,0.655044,543.93662,398.492647


In [8]:
def gini(s):
    vc = s.value_counts(normalize=True)
    diffs = np.abs(vc.values.reshape(-1, 1) - vc.values.reshape(1, -1))
    return diffs.sum() / (2 * vc.shape[0] ** 2 * vc.mean())

def top5(s):
    return s.value_counts(normalize=True).head(5).sum()

## Concentration metrics by fields

In [9]:
cat_base.groupby(["field", "Q"])[["country", "publisher"]].agg([gini, top5]).pipe(draw_table).set_caption("Concentration metrics by fields")

Unnamed: 0_level_0,country,country,country,country,publisher,publisher,publisher,publisher
Unnamed: 0_level_1,gini,gini,top5,top5,gini,gini,top5,top5
Q,Q1,Q2,Q1,Q2,Q1,Q2,Q1,Q2
field,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Medicine (miscellaneous),0.811634,0.757824,0.897059,0.782324,0.581513,0.570499,0.216393,0.188834
Education,0.796921,0.767914,0.94362,0.891566,0.665984,0.5531,0.540299,0.515152
Computer Science Applications,0.729885,0.673428,0.931034,0.850575,0.503868,0.442821,0.398844,0.283237
Sociology and Political Science,0.802635,0.393939,0.966361,1.0,0.667734,0.242424,0.48773,0.545455
Economics and Econometrics,0.443678,0.701333,1.0,0.92,0.540525,0.478596,0.482759,0.36
"Ecology, Evolution, Behavior and Systematics",0.720203,0.645669,0.895349,0.80315,0.555603,0.373512,0.44186,0.261905
Molecular Biology,0.687879,0.576364,0.949495,0.89,0.437546,0.40427,0.40404,0.343434
Psychiatry and Mental Health,0.72732,0.714112,0.912409,0.854015,0.422333,0.423889,0.291971,0.306569
Electrical and Electronic Engineering,0.68006,0.622642,0.97619,0.849057,0.661936,0.475472,0.660606,0.45283
"Public Health, Environmental and Occupational Health",0.72334,0.702012,0.915493,0.823529,0.440714,0.414439,0.335714,0.294118
