# Field Analysis

> Latest stats are from 2021, either because scimago stopped collecting data, or newer data is deemed unreliable regarding citations

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("journal.csv")

In [3]:
def get_cats(df):
    _cs = df["categories"].str.split("; ")
    idk = "sourceid"
    _df = pd.DataFrame(
        {"catbase": _cs.sum(), idk: np.repeat(df[idk].values, _cs.str.len())}
    )
    return pd.concat([_df, _df["catbase"].str.extract("(.*) \((.*)\)")], axis=1).assign(
        field=lambda df: np.where(df.loc[:, 0].isna(), df["catbase"], df.loc[:, 0])
    ).rename(columns={1: "Q"}).loc[:, ["field", "Q", idk]]

In [4]:
cat_base = get_cats(df.head(10000)).merge(df)

In [5]:
field_pivot = (
    cat_base.fillna("no Q")
    .pivot_table(index="field", columns="Q", values="sourceid", aggfunc="count")
    .fillna(0)
    .assign(s=lambda df: df.sum(axis=1))
    .sort_values("s", ascending=False)
)
field_pivot.head(15).style.set_caption("Count values by fields")

Q,Q1,Q2,Q3,Q4,arts and humanities,clinical,medical,miscellaneous,no Q,social science,s
field,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Computer Networks and Communications,52.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2500.0,0.0,2553.0
Computer Science Applications,116.0,1.0,2.0,5.0,0.0,0.0,0.0,0.0,1521.0,0.0,1645.0
Electrical and Electronic Engineering,101.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,1328.0,0.0,1442.0
Artificial Intelligence,58.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,1240.0,0.0,1304.0
Hardware and Architecture,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1193.0,0.0,1217.0
Software,67.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,1082.0,0.0,1153.0
"Safety, Risk, Reliability and Quality",12.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,910.0,0.0,925.0
Information Systems,52.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,836.0,0.0,893.0
Instrumentation,7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,789.0,0.0,797.0
Control and Optimization,12.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,781.0,0.0,794.0


In [6]:
def draw_table(df):
    return (
        df.pivot_table(index="field", columns="Q")
        .loc[field_pivot.head(15).index]
        .loc[:, lambda df: df.isna().mean() < 0.5]
        .style.background_gradient(axis=0)
    )

In [7]:
(
    cat_base.groupby(["field", "Q"])[["h_index", "journal_rating", "total_docs_3years"]]
    .mean()
    .pipe(draw_table)
    .set_caption("Mean values of fields")
)

Unnamed: 0_level_0,h_index,h_index,journal_rating,journal_rating,total_docs_3years,total_docs_3years
Q,Q1,Q4,Q1,Q4,Q1,Q4
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Computer Networks and Communications,85.653846,,2.371385,,502.519231,
Computer Science Applications,103.931034,24.2,2.284948,0.1522,637.698276,194.4
Electrical and Electronic Engineering,133.0,10.307692,2.708901,0.111462,1004.574257,50.076923
Artificial Intelligence,95.758621,21.0,2.700621,0.106,486.155172,185.5
Hardware and Architecture,79.916667,,2.158458,,509.0,
Software,110.910448,31.0,2.634373,0.116,527.014925,323.333333
"Safety, Risk, Reliability and Quality",91.75,6.0,1.95575,0.134,571.083333,75.0
Information Systems,95.711538,11.75,2.520904,0.1375,391.326923,44.75
Instrumentation,80.142857,6.0,2.621571,0.107,1348.285714,95.0
Control and Optimization,52.083333,,2.322083,,470.833333,


In [8]:
def gini(s):
    vc = s.value_counts(normalize=True)
    diffs = np.abs(vc.values.reshape(-1, 1) - vc.values.reshape(1, -1))
    return diffs.sum() / (2 * vc.shape[0] ** 2 * vc.mean())

def top5(s):
    return s.value_counts(normalize=True).head(5).sum()

## Concentration metrics by fields

In [9]:
cat_base.groupby(["field", "Q"])[["country", "publisher"]].agg([gini, top5]).pipe(draw_table).set_caption("Concentration metrics by fields")

Unnamed: 0_level_0,country,country,country,country,publisher,publisher,publisher,publisher
Unnamed: 0_level_1,gini,gini,top5,top5,gini,gini,top5,top5
Q,Q1,Q4,Q1,Q4,Q1,Q4,Q1,Q4
field,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Computer Networks and Communications,0.593407,,0.961538,,0.467451,,0.607843,
Computer Science Applications,0.653257,0.15,0.956897,1.0,0.479206,0.0,0.443478,1.0
Electrical and Electronic Engineering,0.681754,0.221154,0.980198,0.769231,0.632997,0.13986,0.69697,0.538462
Artificial Intelligence,0.55665,0.0,0.965517,1.0,0.419753,0.0,0.54386,1.0
Hardware and Architecture,0.566667,,1.0,,0.402778,,0.708333,
Software,0.579957,0.166667,0.970149,1.0,0.418873,0.0,0.462687,1.0
"Safety, Risk, Reliability and Quality",0.291667,0.0,1.0,1.0,0.305556,0.0,0.916667,1.0
Information Systems,0.532051,0.0,0.980769,1.0,0.357143,0.0,0.442308,1.0
Instrumentation,0.380952,0.0,1.0,1.0,0.0,0.0,0.714286,1.0
Control and Optimization,0.333333,,1.0,,0.348485,,0.909091,
