In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import ast
import statsmodels.api as sm
from statsmodels.tsa.api import SARIMAX

import matplotlib.pyplot as plt
import seaborn as sns


# Basic Trends

In [2]:
df = pd.read_csv("../FINAL_ARXIV_2025.csv")

# Parse date
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Keep only 2025
df_2025 = df[df["date"].dt.year == 2025].copy()

# Month index
df_2025["month"] = df_2025["date"].dt.to_period("M")


In [3]:
# Approximate paper length
df_2025["abstract_length"] = df_2025["abstract"].fillna("").str.split().str.len()

# Figures & tables
df_2025["num_figures"] = pd.to_numeric(df_2025["figures"], errors="coerce").fillna(0)
df_2025["num_tables"] = pd.to_numeric(df_2025["tables"], errors="coerce").fillna(0)

# Collaboration proxy: number of authors
df_2025["num_authors"] = (
    df_2025["authors"]
    .fillna("")
    .apply(lambda x: len([a for a in x.split(",") if a.strip()]))
)

In [4]:
monthly = (
    df_2025
    .groupby("month")
    .agg(
        avg_abstract_length=("abstract_length", "mean"),
        avg_figures=("num_figures", "mean"),
        avg_tables=("num_tables", "mean"),
        avg_authors=("num_authors", "mean"),
        paper_count=("title", "count")
    )
    .reset_index()
)
#removing november because huge outlier and august beause figure count is too high
monthly["month_idx"] = np.arange(len(monthly))
outlier_month = pd.Period("2025-11", freq="M")

monthly = monthly[monthly["month"] != outlier_month].copy()
outlier_month = pd.Period("2025-08", freq="M")
monthly = monthly[monthly["month"] != outlier_month].copy()
outlier_month = pd.Period("2025-10", freq="M")
monthly = monthly[monthly["month"] != outlier_month].copy()
monthly

Unnamed: 0,month,avg_abstract_length,avg_figures,avg_tables,avg_authors,paper_count,month_idx
0,2025-01,212.290923,10.505952,7.165923,9.723214,1344,0
1,2025-02,209.236985,10.353535,80.996115,9.48174,1287,1
2,2025-03,212.940694,10.28265,5.872555,12.622082,1585,2
3,2025-04,210.728698,9.829584,10.321745,9.089298,1467,3
4,2025-05,212.734488,10.2114,2.526696,9.437951,1386,4
5,2025-06,215.192828,9.431664,3.907984,10.566306,1478,5
6,2025-07,209.227423,8.965859,4.424009,11.535242,1816,6
8,2025-09,210.057527,10.193548,2.415054,9.655914,1860,8
11,2025-12,205.363636,9.138135,11.210744,10.543684,1694,11


In [5]:
def sarimax_forecast(series, steps=3):
    model = SARIMAX(
        series,
        order=(1, 1, 1),
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    result = model.fit(disp=False)
    forecast = result.forecast(steps=steps)
    return forecast

forecast_df = pd.DataFrame({
    "month": pd.period_range(monthly["month"].iloc[-1] + 1, periods=3, freq="M"),
    "abstract_length": sarimax_forecast(monthly["avg_abstract_length"]),
    "figures": sarimax_forecast(monthly["avg_figures"]),
    "tables": sarimax_forecast(monthly["avg_tables"]),
    "authors": sarimax_forecast(monthly["avg_authors"])
})

forecast_df

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Unnamed: 0,month,abstract_length,figures,tables,authors
9,2026-01,207.558249,9.741983,10.601726,10.269967
10,2026-02,206.263469,9.438713,10.630971,10.364456
11,2026-03,207.027365,9.591024,10.629567,10.331838


In [6]:
yearly_counts = pd.DataFrame({
    "year": list(range(2000, 2026)),
    "articles": [
        6341, 6806, 7027, 7899, 8242, 8746, 9271, 10140, 10162,
        11280, 11606, 11948, 12120, 12470, 12585, 12900, 13214,
        13262, 14079, 14421, 14835, 14463, 14496, 15274,
        16333, 18661
    ]
})

yearly_counts


Unnamed: 0,year,articles
0,2000,6341
1,2001,6806
2,2002,7027
3,2003,7899
4,2004,8242
5,2005,8746
6,2006,9271
7,2007,10140
8,2008,10162
9,2009,11280


In [7]:
# Growth Model USING LOG LINEAR REGRESSION

yearly_counts["t"] = yearly_counts["year"] - yearly_counts["year"].min()
yearly_counts["log_articles"] = np.log(yearly_counts["articles"])

X = sm.add_constant(yearly_counts["t"])
y = yearly_counts["log_articles"]

growth_model = sm.OLS(y, X).fit()

t_2026 = yearly_counts["t"].max() + 1
log_pred_2026 = growth_model.predict([1, t_2026])[0]
pred_2026 = int(np.exp(log_pred_2026))

pred_2026


18651

# citation predictions

In [8]:
# affiliation predictions
df_affil = pd.read_csv("../test_filled_21.csv")

df_affil["citations"] = (
    pd.to_numeric(df_affil["non_self_citations"], errors="coerce")
    .fillna(0)
)


In [9]:
def parse_list(x):
    if pd.isna(x):
        return []
    try:
        parsed = ast.literal_eval(x)
        return parsed if isinstance(parsed, list) else []
    except Exception:
        return []

df_affil["author_list"] = df_affil["authors"].apply(parse_list)

def parse_affiliations(x):
    if pd.isna(x):
        return []
    try:
        raw = ast.literal_eval(x)
        if not isinstance(raw, list):
            return []

        affils = []
        for item in raw:
            if not isinstance(item, str):
                continue
            # Split multiple affiliations per author
            for aff in item.split(";"):
                aff = aff.strip()
                if aff:
                    affils.append(aff)
        return affils
    except Exception:
        return []

df_affil["affiliation_list"] = df_affil["affiliations"].apply(parse_affiliations)



In [10]:
author_rows = []

for _, row in df_affil.iterrows():
    authors = row["author_list"]
    if not authors:
        continue

    credit = row["citations"] / len(authors)

    for a in authors:
        author_rows.append({
            "author": a.strip(),
            "citations": credit
        })

author_df = pd.DataFrame(author_rows)


In [11]:
top_authors_now = (
    author_df
    .groupby("author", as_index=False)["citations"]
    .sum()
    .sort_values("citations", ascending=False)
    .head(10)
)

top_authors_now


Unnamed: 0,author,citations
16976,Eleonora Di Valentino,88.760877
63128,William J. Wolf,79.833333
56398,Shouvik Roy Choudhury,71.666667
58002,Supriya Pan,60.593333
33881,Kohei Inayoshi,52.825864
50622,Rafael C. Nunes,49.707619
2863,Abraham Loeb,49.333333
5365,Andronikos Paliathanasis,48.776667
14794,Deng Wang,44.5
22973,H.-Thomas Janka,44.0


In [12]:
author_future = (
    author_df
    .groupby("author")
    .agg(
        total_citations=("citations", "sum"),
        appearances=("citations", "count")
    )
)

author_future["citations_per_paper"] = (
    author_future["total_citations"] / author_future["appearances"]
)

future_top_authors = (
    author_future
    .sort_values("citations_per_paper", ascending=False)
    .head(10)
)

future_top_authors


Unnamed: 0_level_0,total_citations,appearances,citations_per_paper
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Shouvik Roy Choudhury,71.666667,2,35.833333
Yong Xu,31.0,1,31.0
Andrew L. Miller,24.0,1,24.0
Lu Huang,22.333333,1,22.333333
H.-Thomas Janka,44.0,2,22.0
William J. Wolf,79.833333,4,19.958333
T.A. Dzhatdoev,19.0,1,19.0
Mai Yashiki,17.0,1,17.0
Suvashis Maity,33.0,2,16.5
KM3NeT collaboration,32.0,2,16.0


In [13]:
affil_rows = []

for _, row in df_affil.iterrows():
    affils = row["affiliation_list"]
    if not affils:
        continue

    credit = row["citations"] / len(affils)

    for aff in affils:
        if not isinstance(aff, str):
            continue

        aff = aff.strip()
        if not aff:
            continue

        affil_rows.append({
            "affiliation": aff,
            "citations": credit
        })


affil_df = pd.DataFrame(affil_rows)


In [14]:
affil_df['affiliation'][0]

'Space Telescope Science Institute, 3700 San Martin Dr, Baltimore, MD 21218, USA'

In [15]:
top_affiliations_now = (
    affil_df
    .groupby("affiliation", as_index=False)["citations"]
    .sum()
    .sort_values("citations", ascending=False)
    .head(10)
)

top_affiliations_now


Unnamed: 0,affiliation,citations
33279,"Lawrence Berkeley National Laboratory, 1 Cyclo...",145.701254
14360,Department of Physics &amp,142.625485
42323,"Space Telescope Science Institute, 3700 San Ma...",141.479572
40081,"School of Mathematical and Physical Sciences, ...",127.537037
2362,'Max Planck Institute for Gravitational Physic...,101.0
27922,"Institute of Astronomy, University of Cambridg...",100.650885
30286,International Centre for Radio Astronomy Resea...,99.636364
11427,Department of Astronomy &amp,92.92632
44775,"Universit 'e Paris-Saclay, CNRS/IN2P3, IJCLab,...",91.930909
31416,Kavli Institute for Astronomy and Astrophysics...,77.865054


In [16]:
affil_future = (
    affil_df
    .groupby("affiliation")
    .agg(
        total_citations=("citations", "sum"),
        appearances=("citations", "count")
    )
)

affil_future["citations_per_paper"] = (
    affil_future["total_citations"] / affil_future["appearances"]
)

future_top_affiliations = (
    affil_future
    .sort_values("citations_per_paper", ascending=False)
    .head(10)
)

future_top_affiliations


Unnamed: 0_level_0,total_citations,appearances,citations_per_paper
affiliation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"'Max Planck Institute for Gravitational Physics (Albert Einstein Institute), D-14476 Potsdam, Germany', 'Department of Astronomy, Beijing Normal University, Beijing 100875, China', 'Kapteyn Astronomical Institute, University of Groningen, PO Box 800, NL-9700 AV, Groningen, The Netherlands', ' Institut für Experimentalphysik, University of Hamburg, Luruper Chaussee 149, 22761 Hamburg, Germany', ' IRFU, CEA, Université Paris-Saclay, F-91191 Gif-sur-Yvette, France', 'Max Planck Institute for Gravitational Physics (Albert Einstein Institute), D-14476 Potsdam, Germany'",101.0,2,50.5
"California Univ., Santa Cruz",42.0,1,42.0
"Department of Astronomy, Case Western Reserve University, 10900 Euclid Avenue, Cleveland, Ohio 44106, USA",39.0,1,39.0
"PRISMA+ Cluster of Excellence and Mainz Institute for Theoretical Physics, Johannes Gutenberg University, 55099 Mainz, Germany",31.0,1,31.0
"Indian Institute of Science Education and Research, Pune, 411008, India",30.0,1,30.0
"'Institute of Astrophysics and Space Sciences, Faculty of Sciences, University of Lisbon, P-1769-016 Lisbon, Portugal'",30.0,1,30.0
"'Astronomy Centre, University of Sussex, Falmer, Brighton, BN1 9QH, UK'",24.0,1,24.0
"Kavli Institute for Cosmology Cambridge and Institute of Astronomy, Madingley Road, Cambridge CB3 OHA, United Kingdom",23.0,1,23.0
"Department of Astronomy, California Institute of Technology, 1200 E California Blvd, Pasadena, CA, 91125, USA",22.0,1,22.0
"Division of Science Education and Institute of Fusion Science, Jeonbuk National University, Jeonju 54896, Republic of Korea",20.5,1,20.5
