In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os, sys
sys.path.append('./analysis/')

%load_ext autoreload
%autoreload 2
import utils
from utils import RECENT_DATE_THRESHOLD

### Save annotated dataframe for public release of data (only run once)

In [28]:
# Add citation data at the end of the dataframe
lm_metadata = utils.annotate_lm_metadata()
lm_metadata = lm_metadata[[c for c in lm_metadata.columns if c != 'citationCount'] + ['citationCount']]
lm_metadata = utils.convert_citations_to_ranks(lm_metadata, month_window_sizes=[3, 12])
lm_metadata.id = lm_metadata.id.apply(lambda x: f'arXiv:{x}') # add arXiv prefix to ids
display(lm_metadata.sample(5))

# # Save the dataframe (this will include all the annotations we use in our analyses)
# lm_metadata.to_json(os.path.join(utils.PROCESSED_DATA_DIR, 'lm_metadata_all_annotations.json'),
#                     orient='records', lines=True)

Papers with S2 data: 16924 out of 16979 (99.7%)


Papers with an identified affiliation: 11688 out of 16979 (68.8%)


Unnamed: 0,id,authors,title,categories,abstract,versions,first_category,v1_date,LM_related_terms,mentions_LM_keyword,cluster,domains,industry,academic,citationCount,percentile_rank_in_3_month_window,percentile_rank_in_12_month_window
4552,arXiv:2105.08021,"[Qingyun Wang, Semih Yavuz, Victoria Lin, Heng...",Stage-wise Fine-tuning for Graph-to-Text Gener...,"[cs.CL, cs.AI]",Graph-to-text generation has benefited from ...,"[{'version': 'v1', 'created': 'Mon, 17 May 202...",cs.CL,2021-05-17 17:15:29,[language model],True,Knowledge Graphs and Commonsense,"[fb.com, illinois.edu, salesforce.com]",True,True,13.0,0.569714,0.629011
9483,arXiv:2210.12574,"[Koustuv Sinha, Amirhossein Kazemnejad, Siva R...",The Curious Case of Absolute Position Embeddings,"[cs.CL, cs.LG]",Transformer language models encode the notio...,"[{'version': 'v1', 'created': 'Sun, 23 Oct 202...",cs.CL,2022-10-23 00:00:04,[language model],True,"Representations, Syntax, Semantics",[mcgill.ca],False,True,5.0,0.691117,0.584592
13412,arXiv:2305.13478,"[Joseph Marvin Imperial, Ekaterina Kochmar]",Automatic Readability Assessment for Closely R...,[cs.CL],"In recent years, the main focus of research ...","[{'version': 'v1', 'created': 'Mon, 22 May 202...",cs.CL,2023-05-22 20:42:53,[language model],True,Translation & Low-Resource Languages,"[bath.ac.uk, mbzuai.ac.ae]",False,True,0.0,0.199259,0.228022
7860,arXiv:2205.10674,"[Shushan Arakelyan, Anna Hakhverdyan, Miltiadi...",NS3: Neuro-Symbolic Semantic Code Search,[cs.LG],Semantic code search is the task of retrievi...,"[{'version': 'v1', 'created': 'Sat, 21 May 202...",cs.LG,2022-05-21 20:55:57,[language model],True,Code Generation,"[usc.edu, allamanis.com]",False,True,1.0,0.188462,0.249245
10954,arXiv:2302.04012,"[Hossein Hajipour, Thorsten Holz, Lea Schönher...",Systematically Finding Security Vulnerabilitie...,"[cs.CR, cs.AI, cs.CL, cs.LG, cs.SE]","Recently, large language models for code gen...","[{'version': 'v1', 'created': 'Wed, 8 Feb 2023...",cs.CR,2023-02-08 11:54:07,"[language model, large language model]",True,Privacy & Adversarial Risks,[],False,False,6.0,0.7257,0.863389


In [29]:
lm_metadata_v2 = utils.load_annotated_lm_metadata()

In [30]:
# Print rows of lm_metadata that have academic == False but lm_metadata_v2 has academic == True
lm_metadata[(lm_metadata.academic == False) & (lm_metadata_v2.academic == True)]

Unnamed: 0,id,authors,title,categories,abstract,versions,first_category,v1_date,LM_related_terms,mentions_LM_keyword,cluster,domains,industry,academic,citationCount,percentile_rank_in_3_month_window,percentile_rank_in_12_month_window


In [34]:
# Find domains that are in lm_metadata_v2 but not in lm_metadata
domains_in_lm_metadata_v2 = lm_metadata_v2.explode('domains').domains.unique()
domains_in_lm_metadata = lm_metadata.explode('domains').domains.unique()

domains_in_lm_metadata_v2_but_not_in_lm_metadata = set(domains_in_lm_metadata_v2) - set(domains_in_lm_metadata)
print(f'{len(domains_in_lm_metadata_v2_but_not_in_lm_metadata)} domains in lm_metadata_v2 but not in lm_metadata')
print(domains_in_lm_metadata_v2_but_not_in_lm_metadata)

domains_in_lm_metadata_but_not_in_lm_metadata_v2 = set(domains_in_lm_metadata) - set(domains_in_lm_metadata_v2)
print(f'{len(domains_in_lm_metadata_but_not_in_lm_metadata_v2)} domains in lm_metadata but not in lm_metadata_v2')
print(domains_in_lm_metadata_but_not_in_lm_metadata_v2)

4 domains in lm_metadata_v2 but not in lm_metadata
{'toronto.edu', 'cuhk.edu.cn', 'se.cuhk.edu.hk.com', 'cuhk.edu.hk'}
1 domains in lm_metadata but not in lm_metadata_v2
{'cuhk.edu'}


In [35]:
# Print all domains with cuhk.edu in them in domain_counts_v2
domains_in_lm_metadata_v2 = set(lm_metadata_v2.explode('domains').domains.unique())
for d in domains_in_lm_metadata_v2:
    if not isinstance(d, str):
        print(d)
        continue
    if isinstance(d, str) and "cuhk.edu" in d:
        print(d)

nan
cuhk.edu.hk
se.cuhk.edu.hk.com
cuhk.edu.cn


In [36]:
domain_counts_v2 = lm_metadata_v2.explode('domains').domains.value_counts()
domain_counts = lm_metadata.explode('domains').domains.value_counts()

for domain in domains_in_lm_metadata_v2_but_not_in_lm_metadata:
    print(domain)
    print(domain_counts_v2[domain])
    # print(domain_counts[domain])
    print()

toronto.edu
55

cuhk.edu.cn
30

se.cuhk.edu.hk.com
1

cuhk.edu.hk
93



In [40]:
# Display rows in lm_metadata_v2 where 'u-tokyo.ac' is in the list of domains
display(lm_metadata_v2[lm_metadata_v2.domains.apply(lambda x: 'bits-pilani.ac.in' in x)])

Unnamed: 0,id,authors,title,categories,abstract,versions,first_category,v1_date,LM_related_terms,mentions_LM_keyword,cluster,domains,industry,academic,above_pred_female_threshold,inferred_female_frac_nqg_uncertainty_threshold_0.100,citationCount,percentile_rank_in_3_month_window,percentile_rank_in_12_month_window
1676,arXiv:2003.13821,"[Ayush Jain, Dr. N. M. Meenachi, Dr. B. Venkat...",NukeBERT: A Pre-trained language model for Low...,"[cs.LG, stat.ML]",Significant advances have been made in recen...,"[{'version': 'v1', 'created': 'Mon, 30 Mar 202...",cs.LG,2020-03-30 21:10:19,"[language model, BERT]",True,Question Answering & Retrieval,"[bits-pilani.ac.in, igcar.gov.in]",False,False,False,0.0,7.0,0.242,0.314909
2174,arXiv:2006.00593,"[Rajaswa Patil, Somesh Singh, Swati Agarwal]",BPGC at SemEval-2020 Task 11: Propaganda Detec...,[cs.CL],Propaganda spreads the ideology and beliefs ...,"[{'version': 'v1', 'created': 'Sun, 31 May 202...",cs.CL,2020-05-31 19:35:53,[BERT],True,Social Media & Misinformation,[bits-pilani.ac.in],False,False,True,0.5,8.0,0.27619,0.345332
2175,arXiv:2006.00607,"[Siddhant Mahurkar, Rajaswa Patil]",LRG at SemEval-2020 Task 7: Assessing the Abil...,[cs.CL],"In this paper, we assess the ability of BERT...","[{'version': 'v1', 'created': 'Sun, 31 May 202...",cs.CL,2020-05-31 20:55:08,"[BERT, language model]",True,Emotion/Sentiment Analysis,"[bits-pilani.ac.in, vitstudent.ac.in]",False,False,False,0.0,5.0,0.184127,0.245936
2246,arXiv:2006.08870,"[Ahan M. R., Shreyas Sunil Kulkarni]",End-to-End Code Switching Language Models for ...,"[cs.CL, cs.SD, eess.AS]","In this paper, we particularly work on the c...","[{'version': 'v1', 'created': 'Tue, 16 Jun 202...",cs.CL,2020-06-16 02:11:18,"[language model, BERT]",True,Spelling & Grammar Correction,[bits-pilani.ac.in],False,False,False,0.0,2.0,0.088095,0.116117
3535,arXiv:2012.14427,"[Mohit Sewak, Sanjay K. Sahay, Hemant Rathore]",Assessment of the Relative Importance of diffe...,"[cs.CR, cs.LG]",Recurrent deep learning language models like...,"[{'version': 'v1', 'created': 'Sat, 26 Dec 202...",cs.CR,2020-12-26 18:00:37,[language model],True,Privacy & Adversarial Risks,"[microsoft.com, bits-pilani.ac.in]",True,False,False,0.0,4.0,0.236196,0.20785
3743,arXiv:2101.11891,"[Shreya Gupta, Parantak Singh, Megha Sundriyal...",LESA: Linguistic Encapsulation and Semantic Am...,[cs.CL],The conceptualization of a claim lies at the...,"[{'version': 'v1', 'created': 'Thu, 28 Jan 202...",cs.CL,2021-01-28 09:51:30,[language model],True,Biases & Harms,"[bits-pilani.ac.in, iiitd.ac.in]",False,True,True,0.5,9.0,0.446023,0.52607
3901,arXiv:2102.12254,"[Gunjan Chhablani, Abheesht Sharma, Harshit Pa...",NLRG at SemEval-2021 Task 5: Toxic Spans Detec...,"[cs.CL, cs.LG]",Toxicity detection of text has been a popula...,"[{'version': 'v1', 'created': 'Wed, 24 Feb 202...",cs.CL,2021-02-24 12:30:09,[BERT],True,Toxicity & Hate Speech,"[bits-pilani.ac.in, uncg.edu]",False,False,False,0.0,9.0,0.446023,0.52607
3902,arXiv:2102.12255,"[Abheesht Sharma, Harshit Pandey, Gunjan Chhab...",LRG at SemEval-2021 Task 4: Improving Reading ...,"[cs.CL, cs.LG]","In this article, we present our methodologie...","[{'version': 'v1', 'created': 'Wed, 24 Feb 202...",cs.CL,2021-02-24 12:33:12,"[language model, BERT]",True,Question Answering & Retrieval,[bits-pilani.ac.in],False,False,False,0.0,2.0,0.140152,0.189171
3923,arXiv:2103.00380,"[Abheesht Sharma, Harshit Pandey]",LRG at TREC 2020: Document Ranking with XLNet-...,"[cs.IR, cs.CL, cs.LG]",Establishing a good information retrieval sy...,"[{'version': 'v1', 'created': 'Sun, 28 Feb 202...",cs.IR,2021-02-28 03:04:29,[XLNet],True,Entity Extraction & RecSys,[bits-pilani.ac.in],False,False,False,0.0,2.0,0.140152,0.189171
3936,arXiv:2103.00854,"[Rajaswa Patil, Jasleen Dhillon, Siddhant Mahu...",Vy\=akarana: A Colorless Green Benchmark for S...,[cs.CL],While there has been significant progress to...,"[{'version': 'v1', 'created': 'Mon, 1 Mar 2021...",cs.CL,2021-03-01 09:07:58,"[language model, BERT]",True,Translation & Low-Resource Languages,"[bits-pilani.ac.in, vitstudent.ac.in]",False,False,False,0.25,1.0,0.089015,0.119151


In [38]:
domain_counts_v2['u-tokyo.ac'], domain_counts['u-tokyo.ac']

(48, 48)

### Load annotated data

In [2]:
lm_metadata = utils.load_annotated_lm_metadata()
display(lm_metadata.sample(5))

lm_metadata['n_authors'] = lm_metadata.authors.apply(lambda x: len(x))

Unnamed: 0,id,authors,title,categories,abstract,versions,first_category,v1_date,LM_related_terms,mentions_LM_keyword,cluster,domains,industry,academic,above_pred_female_threshold,inferred_female_frac_nqg_uncertainty_threshold_0.100,citationCount,percentile_rank_in_3_month_window,percentile_rank_in_12_month_window
15402,arXiv:2307.06540,"[Yufei Xie, Rodolfo C. Raga]",Convolutional Neural Networks for Sentiment An...,"[cs.CL, cs.LG]",This study addressed the complex task of sen...,"[{'version': 'v1', 'created': 'Thu, 13 Jul 202...",cs.CL,2023-07-13 03:02:56,[BERT],True,Emotion/Sentiment Analysis,"[students.national-u.edu.ph, national-u.edu.ph]",False,False,False,0.0,0.0,0.353985,0.228022
11070,arXiv:2302.07445,"[Jiamou Sun, Zhenchang Xing, Qinghua Lu, Xiwei...",Silent Vulnerable Dependency Alert Prediction ...,"[cs.CR, cs.SE]","Due to convenience, open-source software is ...","[{'version': 'v1', 'created': 'Wed, 15 Feb 202...",cs.CR,2023-02-15 03:32:03,[BERT],True,Privacy & Adversarial Risks,"[csiro.au, anu.edu.au]",False,True,False,0.0,1.0,0.340574,0.546834
4378,arXiv:2104.10640,"[Sushant Singh, Ausif Mahmood]",The NLP Cookbook: Modern Recipes for Transform...,"[cs.CL, cs.LG]","In recent years, Natural Language Processing...","[{'version': 'v1', 'created': 'Tue, 23 Mar 202...",cs.CL,2021-03-23 22:38:20,"[language model, BERT]",True,Efficiency & Performance,[my.bridgeport.edu],False,False,False,0.0,37.0,0.808712,0.852941
3869,arXiv:2102.09914,"[Brooke Stephenson, Thomas Hueber, Laurent Gir...",Alternate Endings: Improving Prosody for Incre...,"[cs.CL, eess.AS]",The prosody of a spoken word is determined b...,"[{'version': 'v1', 'created': 'Fri, 19 Feb 202...",cs.CL,2021-02-19 13:11:34,[language model],True,Speech Recognition,"[univ-grenoble-alpes.fr, gipsa-lab.grenoble-in...",False,True,False,0.25,10.0,0.473485,0.55615
9373,arXiv:2210.09549,"[Ruijun Li, Weihua Li, Yi Yang, Hanyu Wei, Jia...",Swinv2-Imagen: Hierarchical Vision Transformer...,"[cs.CV, cs.LG]","Recently, diffusion models have been proven ...","[{'version': 'v1', 'created': 'Tue, 18 Oct 202...",cs.CV,2022-10-18 02:50:34,[language model],True,Video & Multimodal Models,"[autuni.ac.nz, jlufe.edu.cn, utas.edu.au, aut....",False,False,,,4.0,0.639973,0.534366


## Proportion of papers with different authorship groups: pre-2023 v. 2023


In [3]:
lm_metadata_pre_2023 = lm_metadata[lm_metadata['v1_date'] < RECENT_DATE_THRESHOLD]
lm_metadata_post_2023 = lm_metadata[lm_metadata['v1_date'] >= RECENT_DATE_THRESHOLD]

### Affiliation

In [4]:
# # This sets the denominator as any paper with at least one tracked domain (deprecated)
# papers_with_domain_pre_2023 = lm_metadata_pre_2023.domains.apply(lambda x: len(x) > 0).sum()
# papers_with_domain_post_2023 = lm_metadata_post_2023.domains.apply(lambda x: len(x) > 0).sum()

# This sets the denominator as any paper from one of the institutions with >= 10 papers
papers_with_domain_pre_2023 = lm_metadata_pre_2023.apply(lambda x: x.academic or x.industry, axis=1).sum()
papers_with_domain_post_2023 = lm_metadata_post_2023.apply(lambda x: x.academic or x.industry, axis=1).sum()

# Academic
n_acad_pre_2023 = lm_metadata_pre_2023['academic'].sum()
print(f'Academic pre-2023: {n_acad_pre_2023} out of {papers_with_domain_pre_2023} ({100*n_acad_pre_2023/papers_with_domain_pre_2023 :.1f}%)')
n_acad_post_2023 = lm_metadata_post_2023['academic'].sum()
print(f'Academic post-2023: {n_acad_post_2023} out of {papers_with_domain_post_2023} ({100*n_acad_post_2023/papers_with_domain_post_2023 :.1f}%)')
print()

# Industry
n_ind_pre_2023 = lm_metadata_pre_2023['industry'].sum()
print(f'Industry pre-2023: {n_ind_pre_2023} out of {papers_with_domain_pre_2023} ({100*n_ind_pre_2023/papers_with_domain_pre_2023 :.1f}%)')
n_ind_post_2023 = lm_metadata_post_2023['industry'].sum()
print(f'Industry post-2023: {n_ind_post_2023} out of {papers_with_domain_post_2023} ({100*n_ind_post_2023/papers_with_domain_post_2023 :.1f}%)')

Academic pre-2023: 6153 out of 7372 (83.5%)
Academic post-2023: 3784 out of 4255 (88.9%)

Industry pre-2023: 2642 out of 7372 (35.8%)
Industry post-2023: 1132 out of 4255 (26.6%)


In [6]:
# Count the % papers in the top 10 US domains before and after 2023: utils.top_10_us_institution_domains
# Count the % papers in the top 10 Chinese domains before and after 2023: utils.top_10_chinese_institution_domains

# Top 10 US
n_us_pre_2023 = lm_metadata_pre_2023.domains.apply(lambda x: len(set(x).intersection(utils.top_10_us_institution_domains)) > 0).sum()
print(f'US pre-2023: {n_us_pre_2023} out of {papers_with_domain_pre_2023} ({100*n_us_pre_2023/papers_with_domain_pre_2023 :.1f}%)')
n_us_post_2023 = lm_metadata_post_2023.domains.apply(lambda x: len(set(x).intersection(utils.top_10_us_institution_domains)) > 0).sum()
print(f'US post-2023: {n_us_post_2023} out of {papers_with_domain_post_2023} ({100*n_us_post_2023/papers_with_domain_post_2023 :.1f}%)')
print()

# Top 10 China
n_china_pre_2023 = lm_metadata_pre_2023.domains.apply(lambda x: len(set(x).intersection(utils.top_10_chinese_institution_domains)) > 0).sum()
print(f'China pre-2023: {n_china_pre_2023} out of {papers_with_domain_pre_2023} ({100*n_china_pre_2023/papers_with_domain_pre_2023 :.1f}%)')
n_china_post_2023 = lm_metadata_post_2023.domains.apply(lambda x: len(set(x).intersection(utils.top_10_chinese_institution_domains)) > 0).sum()
print(f'China post-2023: {n_china_post_2023} out of {papers_with_domain_post_2023} ({100*n_china_post_2023/papers_with_domain_post_2023 :.1f}%)')

# Do Chi-square tests for increase in US papers from pre-2023 to post-2023, same for China
from scipy.stats import chi2_contingency
print()
print("Significant increase in top 10 U.S. university papers?")
print(chi2_contingency([[n_us_pre_2023, n_us_post_2023], [papers_with_domain_pre_2023 - n_us_pre_2023, papers_with_domain_post_2023 - n_us_post_2023]]))

print()
print("Significant increase in top 10 Chinese university papers?")
print(chi2_contingency([[n_china_pre_2023, n_china_post_2023], [papers_with_domain_pre_2023 - n_china_pre_2023, papers_with_domain_post_2023 - n_china_post_2023]]))

US pre-2023: 1289 out of 7372 (17.5%)
US post-2023: 715 out of 4255 (16.8%)

China pre-2023: 935 out of 7372 (12.7%)
China post-2023: 687 out of 4255 (16.1%)

Significant increase in top 10 U.S. university papers?
Chi2ContingencyResult(statistic=0.8307872522569287, pvalue=0.36204498556760867, dof=1, expected_freq=array([[1270.61907629,  733.38092371],
       [6101.38092371, 3521.61907629]]))

Significant increase in top 10 Chinese university papers?
Chi2ContingencyResult(statistic=26.657786980984238, pvalue=2.4286999226553046e-07, dof=1, expected_freq=array([[1028.41524039,  593.58475961],
       [6343.58475961, 3661.41524039]]))


### Pred. gender

In [14]:
# This is the number of papers with at least one predicted gender for one of the authors.
papers_with_pred_gender_pre_2023 = (~pd.isna(lm_metadata_pre_2023.above_pred_female_threshold)).sum()
papers_with_pred_gender_post_2023 = (~pd.isna(lm_metadata_post_2023.above_pred_female_threshold)).sum()

# Majority predicted-female
n_pred_female_pre_2023 = lm_metadata_pre_2023['above_pred_female_threshold'].sum()
print(f'Majority predicted-female pre-2023: {n_pred_female_pre_2023} out of {papers_with_pred_gender_pre_2023} ({100*n_pred_female_pre_2023/papers_with_pred_gender_pre_2023 :.1f}%)')
n_pred_female_post_2023 = lm_metadata_post_2023['above_pred_female_threshold'].sum()
print(f'Majority predicted-female post-2023: {n_pred_female_post_2023} out of {papers_with_pred_gender_post_2023} ({100*n_pred_female_post_2023/papers_with_pred_gender_post_2023 :.1f}%)')

Majority predicted-female pre-2023: 1712.0 out of 9769 (17.5%)
Majority predicted-female post-2023: 946.0 out of 5886 (16.1%)


### Predicted gender vs. industry/academic

In [68]:
papers_with_gender = lm_metadata[(~pd.isna(lm_metadata.above_pred_female_threshold))]
n_total = len(papers_with_gender)

# Compute above_pred_female_threshold for academic and industry papers separately
n_pred_female_industry = ((papers_with_gender.above_pred_female_threshold) & (papers_with_gender.industry)).sum()
n_industry = (papers_with_gender.industry).sum()
print(f'Majority pred-female on industry-affiliated papers: {n_pred_female_industry} out of {n_industry} papers ({100*n_pred_female_industry/n_industry :.1f}%)')

n_pred_female_industry_only = ((papers_with_gender.above_pred_female_threshold) & (papers_with_gender.industry) & (~papers_with_gender.academic)).sum()
n_industry_only = ((papers_with_gender.industry) & (~papers_with_gender.academic)).sum()
print(f'Majority pred-female on industry-affiliation-only papers: {n_pred_female_industry_only} out of {n_industry_only} papers ({100*n_pred_female_industry_only/n_industry_only :.1f}%)')

n_pred_female_academic = ((papers_with_gender.above_pred_female_threshold) & (papers_with_gender.academic)).sum()
n_academic = (papers_with_gender.academic).sum()
print(f'Majority pred-female on academic-affiliated papers: {n_pred_female_academic} out of {n_academic} papers ({100*n_pred_female_academic/n_academic :.1f}%)')

n_pred_female_academic_only = ((papers_with_gender.above_pred_female_threshold) & (~papers_with_gender.industry) & (papers_with_gender.academic)).sum()
n_academic_only = ((~papers_with_gender.industry) & (papers_with_gender.academic)).sum()
print(f'Majority pred-female on academic-affiliation-only papers: {n_pred_female_academic_only} out of {n_academic_only} papers ({100*n_pred_female_academic_only/n_academic_only :.1f}%)')

n_pred_female_collab = ((papers_with_gender.above_pred_female_threshold) & (papers_with_gender.industry) & (papers_with_gender.academic)).sum()
n_collab = ((papers_with_gender.industry) & (papers_with_gender.academic)).sum()
print(f'Majority pred-female on industry & academic-affiliated papers: {n_pred_female_collab} out of {n_collab} papers ({100*n_pred_female_collab/n_collab :.1f}%)')

print(chi2_contingency([[n_pred_female_industry_only, n_industry_only],
                        [n_pred_female_academic_only, n_academic_only]]))

Majority pred-female on industry-affiliated papers: 472 out of 3520 papers (13.4%)
Majority pred-female on industry-affiliation-only papers: 191 out of 1592 papers (12.0%)
Majority pred-female on academic-affiliated papers: 1648 out of 9095 papers (18.1%)
Majority pred-female on academic-affiliation-only papers: 1367 out of 7167 papers (19.1%)
Majority pred-female on industry & academic-affiliated papers: 281 out of 1928 papers (14.6%)
Chi2ContingencyResult(statistic=31.974369384949345, pvalue=1.5622022722865024e-08, dof=1, expected_freq=array([[ 269.25598527, 1513.74401473],
       [1288.74401473, 7245.25598527]]))


### Author count

In [78]:
# Map author counts to 1, 2-5, 6-9, or 10+
def map_author_counts(x):
    if x == 1: return '1'
    elif x <= 5: return '2-5'
    elif x <= 9: return '6-9'
    else: return '10+'

# Paper team size percentages sorted by keys
print('Pre-2023 paper team sizes')
print('Mean team size:', lm_metadata_pre_2023.n_authors.median())
print(lm_metadata_pre_2023.n_authors.apply(map_author_counts).value_counts().sort_index())
print(lm_metadata_pre_2023.n_authors.apply(map_author_counts).value_counts().sort_index() / len(lm_metadata_pre_2023))

print('Post-2023 paper team sizes')
print('Mean team size:', lm_metadata_post_2023.n_authors.median())
print(lm_metadata_post_2023.n_authors.apply(map_author_counts).value_counts().sort_index())
print(lm_metadata_post_2023.n_authors.apply(map_author_counts).value_counts().sort_index() / len(lm_metadata_post_2023))

Pre-2023 paper team sizes
Mean team size: 4.0
1       364
10+     436
2-5    7346
6-9    2406
Name: n_authors, dtype: int64
1      0.034496
10+    0.041319
2-5    0.696171
6-9    0.228014
Name: n_authors, dtype: float64
Post-2023 paper team sizes
Mean team size: 5.0
1       323
10+     513
2-5    3790
6-9    1801
Name: n_authors, dtype: int64
1      0.050257
10+    0.079820
2-5    0.589700
6-9    0.280224
Name: n_authors, dtype: float64


In [76]:
# Compute chi2 test on the frequency of solo-author papers

print(chi2_contingency([[lm_metadata_pre_2023.n_authors.apply(lambda x: x == 1).sum(),
                            lm_metadata_post_2023.n_authors.apply(lambda x: x == 1).sum()],
                        [lm_metadata_pre_2023.n_authors.apply(lambda x: x > 1).sum(),
                            lm_metadata_post_2023.n_authors.apply(lambda x: x > 1).sum()]]))

Chi2ContingencyResult(statistic=25.151258126479604, pvalue=5.300506209523394e-07, dof=1, expected_freq=array([[  426.95235291,   260.04764709],
       [10125.04764709,  6166.95235291]]))
