
This notebook implements functions to preprocess the raw data and prepare data
for analyzing temporal trends in the usage of GPT across different factors:

1. Author Frequency 
2. Number of Authors
3. Overall Trend

This process takes approximately 1 hour to complete. 

**Note** that the parquet files are generated from a saved pkl instance of `ArticleCollection`. The details can be found in `main_generate.py` and `scripts/generate_data.sh`; the latter is the bash script I used to generate the parquet files. 

In [1]:
from base import * 
import pandas as pd 
import os 
import pickle 
from pathlib import Path 
from src.estimation import estimate_text_distribution
from src.MLE import MLE
import warnings
import numpy as np 
import matplotlib.pyplot as plt 
warnings.filterwarnings("ignore")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
## the key variables shared by the cells below

subject = "education"  # subject area; appears in input/output directory and file names
base_loc = f"inference_data/{subject}"  # root directory of the pickled per-year collections
years = [2024, 2023, 2022, 2021]  # years whose pickles are loaded

## Author Frequency

We bin the articles for each quarter into two bins, keyed on the first author of each article: 

    1. Written by first authors with two or fewer publications in that quarter 
    2. Written by first authors with more than two publications in that quarter 

In [8]:
def categorize_by_authors(collection, first_author_only=True):
    """
    maps authors to the ids of the articles they are credited for

    By default only the first author of every article is credited, so each
    article id appears under exactly one key. With ``first_author_only=False``
    every listed author is credited and an article id appears once per entry
    of its author list. Articles whose ``authors`` attribute is None or empty
    are skipped.

    Args:
        collection: (base.ArticleCollection) read through its ``all_articles``
            mapping of article id -> article
        first_author_only: (bool) credit only ``authors[0]`` (default) or
            every author of the article
    Returns:
        (dict) (str) author name -> (List[object]) List of article ids
    """

    author_dict = {}
    for ar_id, ar in collection.all_articles.items():
        authors = ar.authors
        # articles without author information cannot be attributed to anyone
        if authors is None or len(authors) == 0:
            continue

        credited = [authors[0]] if first_author_only else authors
        for author in credited:
            author_dict.setdefault(author, []).append(ar_id)

    return author_dict

def bin_by_freq(data_dict, cutoff=2):
    """
    splits the values of ``data_dict`` into two sets by the size of each entry

    An entry goes to the first set when its list holds ``cutoff`` items or
    fewer (the cutoff is inclusive) and to the second set otherwise.

    Args:
        data_dict: (dict) (object) -> List[object]
        cutoff: (int) largest list length that still lands in the first set
    Returns:
        (set, set) items from entries with at most ``cutoff`` items, items
        from entries with more than ``cutoff`` items
    """

    at_most_cutoff, above_cutoff = set(), set()
    for items in data_dict.values():
        bucket = at_most_cutoff if len(items) <= cutoff else above_cutoff
        bucket.update(items)

    return at_most_cutoff, above_cutoff

def add_to_sub_collection(sub_collection, main_collection, ids=None):
    """
    copies the chosen articles of the main collection into the sub-collection

    Each article is fetched with ``main_collection.get_article`` and stored
    under the same id through ``sub_collection.add_article``.

    Args:
        sub_collection: (ArticleCollection) the collection that receives the articles
        main_collection: (ArticleCollection) the collection the articles are read from
        ids: (List[object]) ids of the articles to copy; when None, every id
            in ``main_collection.all_articles`` is used
    """
    if ids is None:
        selected = list(main_collection.all_articles.keys())
    else:
        selected = ids

    for article_id in selected:
        article = main_collection.get_article(article_id)
        sub_collection.add_article(article, article_id)

def combine_collections(coll_list):
    """
    merges a group of collections into one newly created collection

    The new collection takes its subject from the first element of the list;
    the articles of every element are then copied into it.

    Args:
        coll_list: (List[ArticleCollection]) the collections to merge
    Returns:
        (ArticleCollection) a new collection holding the articles of all inputs
    Raises:
        ValueError: if ``coll_list`` is None or empty
    """
    if coll_list is None:
        raise ValueError("the parameter is None")
    if not len(coll_list):
        raise ValueError("empty collection list")

    # only the subject is carried over to the merged collection
    merged = ArticleCollection(subject=coll_list[0].subject)
    for part in coll_list:
        add_to_sub_collection(merged, part)

    return merged
    

In [10]:
# Export, for every quarter of every year, two parquet files: articles whose
# first author has at most `thresh` first-authored articles in that quarter
# ("_less") and articles whose first author has more than `thresh` ("_more").
thresh = 2 # equals the value in Liang et al.
dest = Path("inference_data/parquet_files/first_author_quarterly/{}".format(subject))
dest.mkdir(parents=True, exist_ok=True)
time_window = 3  # number of consecutive monthly pickles merged into one bin
for year in years:
    dest_year = dest / str(year)
    dest_year.mkdir(parents=True, exist_ok=True)
    quarterly_data = []
    q_count = 0
    for m in range(1, 13):
        full_path = f"{base_loc}/{year}/{subject}_{year}_{m}.pkl"
        # NOTE: pickle.load can execute arbitrary code; only load pickles
        # produced by this project's own data-generation scripts.
        with open(full_path, "rb") as f:
            data = pickle.load(f)
        # the file is closed before the (slow) merge and export steps run
        quarterly_data.append(data)
        if len(quarterly_data) == time_window:
            combined = combine_collections(quarterly_data)
            quarterly_data = []
            by_author = categorize_by_authors(combined)
            # the counter advances even when nothing is exported, so file
            # names keep matching the position of the quarter in the year
            q_count += 1
            name = f"{subject}_{year}_{q_count}"
            if len(by_author) > 0:
                # pass the configured threshold explicitly so that changing
                # `thresh` above actually changes the binning
                group1, group2 = bin_by_freq(by_author, cutoff=thresh)
                collection1 = ArticleCollection(subject, year)
                collection2 = ArticleCollection(subject, year)
                add_to_sub_collection(collection1, combined, group1)
                add_to_sub_collection(collection2, combined, group2)
                collection1.export_parquet(dest_year, f"{name}_less")
                collection2.export_parquet(dest_year, f"{name}_more")

100%|███████████████████████████████████████| 7242/7242 [03:05<00:00, 39.13it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_1_less.parquet


100%|█████████████████████████████████████████| 362/362 [00:06<00:00, 59.74it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_1_more.parquet


100%|███████████████████████████████████████| 1063/1063 [00:28<00:00, 37.60it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_2_less.parquet


100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 41.99it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_2_more.parquet


100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 27.12it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_3_less.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_3_more.parquet


100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 30.60it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_4_less.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2024/education_2024_4_more.parquet


100%|███████████████████████████████████████| 7578/7578 [03:19<00:00, 38.05it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_1_less.parquet


100%|█████████████████████████████████████████| 623/623 [00:12<00:00, 50.41it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_1_more.parquet


100%|█████████████████████████████████████| 10475/10475 [04:30<00:00, 38.72it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_2_less.parquet


100%|███████████████████████████████████████| 1038/1038 [00:21<00:00, 49.14it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_2_more.parquet


100%|███████████████████████████████████████| 8119/8119 [03:46<00:00, 35.78it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_3_less.parquet


100%|█████████████████████████████████████████| 440/440 [00:12<00:00, 36.17it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_3_more.parquet


100%|███████████████████████████████████████| 9715/9715 [04:18<00:00, 37.53it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_4_less.parquet


100%|█████████████████████████████████████████| 381/381 [00:07<00:00, 49.41it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2023/education_2023_4_more.parquet


100%|███████████████████████████████████████| 5776/5776 [02:39<00:00, 36.17it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_1_less.parquet


100%|█████████████████████████████████████████| 449/449 [00:10<00:00, 41.39it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_1_more.parquet


100%|███████████████████████████████████████| 7612/7612 [03:35<00:00, 35.36it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_2_less.parquet


100%|█████████████████████████████████████████| 555/555 [00:13<00:00, 40.87it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_2_more.parquet


100%|███████████████████████████████████████| 5846/5846 [02:54<00:00, 33.48it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_3_less.parquet


100%|█████████████████████████████████████████| 439/439 [00:12<00:00, 35.64it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_3_more.parquet


100%|███████████████████████████████████████| 9041/9041 [04:26<00:00, 33.99it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_4_less.parquet


100%|█████████████████████████████████████████| 695/695 [00:18<00:00, 37.85it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2022/education_2022_4_more.parquet


100%|███████████████████████████████████████| 4614/4614 [02:11<00:00, 34.96it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_1_less.parquet


100%|█████████████████████████████████████████| 562/562 [00:17<00:00, 32.39it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_1_more.parquet


100%|███████████████████████████████████████| 6198/6198 [02:56<00:00, 35.09it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_2_less.parquet


100%|███████████████████████████████████████| 1079/1079 [00:37<00:00, 29.01it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_2_more.parquet


100%|███████████████████████████████████████| 5475/5475 [02:59<00:00, 30.42it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_3_less.parquet


100%|█████████████████████████████████████████| 521/521 [00:16<00:00, 31.09it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_3_more.parquet


100%|███████████████████████████████████████| 6982/6982 [03:42<00:00, 31.34it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_4_less.parquet


100%|█████████████████████████████████████████| 728/728 [00:19<00:00, 37.44it/s]


File Exported as: inference_data/parquet_files/first_author_quarterly/education/2021/education_2021_4_more.parquet


## Number of Authors

We consider the number of authors involved in the article. To decide the threshold for binning, we do some exploratory analysis. 

In [11]:
def get_num_authors(article_coll):
    """
    collects the author count of every article in the input collection

    Articles whose ``authors`` attribute is None are left out; an empty
    author list contributes a 0.

    Args:
        article_coll: (ArticleCollection)

    Returns:
        (List[int]) number of authors in each article
    """

    author_lists = (article.authors for article in article_coll.all_articles.values())
    return [len(authors) for authors in author_lists if authors is not None]


# Pool the per-article author counts of every year in `years` and report
# their median, which is used to pick the binning threshold.
all_authors = []
for y in years:
    # build the path from base_loc so it follows `subject` instead of a
    # hard-coded subject directory
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f"{base_loc}/{y}/{subject}_{y}_whole.pkl", "rb") as f:
        data = pickle.load(f)
    all_authors += get_num_authors(data)
all_authors = np.asarray(all_authors)
md = np.median(all_authors)
print("Median: {}".format(md))

Median: 2.0


As seen from the cell above, the median number of authors is 2. So we will have two bins, 

    1. Articles with number of authors less than or equal to 2 
    2. Articles with number of authors more than 2

In [12]:
def combine_collections(coll_list):
    """
    merges a group of collections into a single collection

    A list with exactly one element is returned as that element itself (no
    copy is made). Longer lists are merged into a newly created collection
    that takes its subject from the first element.

    Args:
        coll_list: (List[ArticleCollection]) the collections to merge
    Returns:
        (ArticleCollection) the only element, or a new merged collection
    Raises:
        ValueError: if ``coll_list`` is None or empty
    """

    if coll_list is None:
        raise ValueError("the parameter is None")

    n_collections = len(coll_list)
    if n_collections == 0:
        raise ValueError("empty collection list")
    if n_collections == 1:
        return coll_list[0]

    # NOTE(review): only the subject is passed on here; whether the merged
    # collection should also carry a year is not visible from this cell.
    merged = ArticleCollection(subject=coll_list[0].subject)
    for part in coll_list:
        add_to_sub_collection(merged, part)

    return merged

def group_by_num_authors(collection, thresh=2):
    """
    divides the article collection into two groups: 
       1. articles with authors less than or equal to the thresh
       2. articles with authors more than the thresh 

    Articles whose ``authors`` attribute is None or empty belong to neither
    group and are left out of both returned collections.
    
    Args:
       collection: (ArticleCollection)
       thresh: (int) the threshold (inclusive for the first group)
    
    Returns:
        (ArticleCollection, ArticleCollection) the collection of articles in the two bins 
    """
    
    collection1 = ArticleCollection(collection.subject, collection.year)
    collection2 = ArticleCollection(collection.subject, collection.year)

    for ar_id, ar in collection.all_articles.items():
        authors = ar.authors
        # check for None before calling len(); len(None) raises TypeError
        if authors is None or len(authors) == 0:
            continue
        if len(authors) <= thresh:
            collection1.add_article(ar, ar_id)
        else:
            collection2.add_article(ar, ar_id)

    return collection1, collection2
    

In [13]:
# Export, for every time window of every year, two parquet files: articles
# with at most `thresh` authors ("_less") and articles with more ("_more").
thresh = 2 
dest = Path("inference_data/parquet_files/num_authors/{}".format(subject))
dest.mkdir(parents=True, exist_ok=True)
time_window = 1 # the time frame over which we look (number of monthly pickles per bin)
for year in years:
    dest_year = dest / str(year) 
    dest_year.mkdir(parents=True, exist_ok=True) 
    grouped_data = []
    q_count = 0  # index of the current time window within the year (used in file names)
    for m in range(1, 13):
        full_path = f"{base_loc}/{year}/{subject}_{year}_{m}.pkl"
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(full_path, "rb") as f:
            data = pickle.load(f)
        grouped_data.append(data)
        if len(grouped_data) == time_window:
            combined = combine_collections(grouped_data)
            # pass the configured threshold so the binning always matches
            # the value reported in the log line below
            sub_coll1, sub_coll2 = group_by_num_authors(combined, thresh=thresh)
            grouped_data = []
            q_count += 1
            name = f"{subject}_{year}_{q_count}"
            # "less than" in the message covers articles with <= thresh authors;
            # articles without authors are in `total` but in neither bin
            print("Year: {}, Month: {}, total: {}, less than {}: {}, more than {}:{}".format(
                            year, m, combined.get_size(), thresh, sub_coll1.get_size(), 
                            thresh, sub_coll2.get_size()))
            sub_coll1.export_parquet(dest_year, f"{name}_less")
            sub_coll2.export_parquet(dest_year, f"{name}_more")

Year: 2024, Month: 1, total: 2726, less than 2: 1604, more than 2:1108


100%|███████████████████████████████████████| 1604/1604 [00:38<00:00, 41.59it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_1_less.parquet


100%|███████████████████████████████████████| 1108/1108 [00:33<00:00, 32.89it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_1_more.parquet
Year: 2024, Month: 2, total: 2192, less than 2: 1425, more than 2:754


100%|███████████████████████████████████████| 1425/1425 [00:33<00:00, 42.29it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_2_less.parquet


100%|█████████████████████████████████████████| 754/754 [00:23<00:00, 31.88it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_2_more.parquet
Year: 2024, Month: 3, total: 2727, less than 2: 1709, more than 2:1004


100%|███████████████████████████████████████| 1709/1709 [00:38<00:00, 44.73it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_3_less.parquet


100%|███████████████████████████████████████| 1004/1004 [00:29<00:00, 33.94it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_3_more.parquet
Year: 2024, Month: 4, total: 1042, less than 2: 700, more than 2:342


100%|█████████████████████████████████████████| 700/700 [00:17<00:00, 39.70it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_4_less.parquet


100%|█████████████████████████████████████████| 342/342 [00:11<00:00, 28.73it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_4_more.parquet
Year: 2024, Month: 5, total: 6, less than 2: 5, more than 2:1


100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 24.67it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_5_less.parquet


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 18.12it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_5_more.parquet
Year: 2024, Month: 6, total: 24, less than 2: 5, more than 2:19


100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 26.96it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_6_less.parquet


100%|███████████████████████████████████████████| 19/19 [00:00<00:00, 27.81it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_6_more.parquet
Year: 2024, Month: 7, total: 0, less than 2: 0, more than 2:0


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_7_less.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_7_more.parquet
Year: 2024, Month: 8, total: 4, less than 2: 3, more than 2:1


100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 26.18it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_8_less.parquet


100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 23.36it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_8_more.parquet
Year: 2024, Month: 9, total: 0, less than 2: 0, more than 2:0


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_9_less.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_9_more.parquet
Year: 2024, Month: 10, total: 0, less than 2: 0, more than 2:0


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_10_less.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_10_more.parquet
Year: 2024, Month: 11, total: 0, less than 2: 0, more than 2:0


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_11_less.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_11_more.parquet
Year: 2024, Month: 12, total: 2, less than 2: 2, more than 2:0


100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 30.87it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_12_less.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_12_more.parquet
Year: 2023, Month: 1, total: 2548, less than 2: 1549, more than 2:983


100%|███████████████████████████████████████| 1549/1549 [00:41<00:00, 37.76it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_1_less.parquet


100%|█████████████████████████████████████████| 983/983 [00:30<00:00, 32.02it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_1_more.parquet
Year: 2023, Month: 2, total: 2424, less than 2: 1585, more than 2:776


100%|███████████████████████████████████████| 1585/1585 [00:37<00:00, 41.85it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_2_less.parquet


100%|█████████████████████████████████████████| 776/776 [00:25<00:00, 30.70it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_2_more.parquet
Year: 2023, Month: 3, total: 3322, less than 2: 2220, more than 2:1088


100%|███████████████████████████████████████| 2220/2220 [00:54<00:00, 40.87it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_3_less.parquet


100%|███████████████████████████████████████| 1088/1088 [00:35<00:00, 31.01it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_3_more.parquet
Year: 2023, Month: 4, total: 3440, less than 2: 2234, more than 2:1190


100%|███████████████████████████████████████| 2234/2234 [00:51<00:00, 43.35it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_4_less.parquet


100%|███████████████████████████████████████| 1190/1190 [00:38<00:00, 31.03it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_4_more.parquet
Year: 2023, Month: 5, total: 3505, less than 2: 2346, more than 2:1140


100%|███████████████████████████████████████| 2346/2346 [00:53<00:00, 44.26it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_5_less.parquet


100%|███████████████████████████████████████| 1140/1140 [00:35<00:00, 32.06it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_5_more.parquet
Year: 2023, Month: 6, total: 4631, less than 2: 2793, more than 2:1810


100%|███████████████████████████████████████| 2793/2793 [01:09<00:00, 40.47it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_6_less.parquet


100%|███████████████████████████████████████| 1810/1810 [01:02<00:00, 28.89it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_6_more.parquet
Year: 2023, Month: 7, total: 3210, less than 2: 1815, more than 2:1381


100%|███████████████████████████████████████| 1815/1815 [00:52<00:00, 34.78it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_7_less.parquet


100%|███████████████████████████████████████| 1381/1381 [00:43<00:00, 31.54it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_7_more.parquet
Year: 2023, Month: 8, total: 2765, less than 2: 1621, more than 2:1129


100%|███████████████████████████████████████| 1621/1621 [00:47<00:00, 34.14it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_8_less.parquet


100%|███████████████████████████████████████| 1129/1129 [00:38<00:00, 29.37it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_8_more.parquet
Year: 2023, Month: 9, total: 2622, less than 2: 1496, more than 2:1117


100%|███████████████████████████████████████| 1496/1496 [00:43<00:00, 34.77it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_9_less.parquet


100%|███████████████████████████████████████| 1117/1117 [00:38<00:00, 29.24it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_9_more.parquet
Year: 2023, Month: 10, total: 2749, less than 2: 1657, more than 2:1059


100%|███████████████████████████████████████| 1657/1657 [00:45<00:00, 36.79it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_10_less.parquet


100%|███████████████████████████████████████| 1059/1059 [00:37<00:00, 28.37it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_10_more.parquet
Year: 2023, Month: 11, total: 2925, less than 2: 1811, more than 2:1097


100%|███████████████████████████████████████| 1811/1811 [00:46<00:00, 39.32it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_11_less.parquet


100%|███████████████████████████████████████| 1097/1097 [00:38<00:00, 28.19it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_11_more.parquet
Year: 2023, Month: 12, total: 4480, less than 2: 2651, more than 2:1821


100%|███████████████████████████████████████| 2651/2651 [01:04<00:00, 41.31it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_12_less.parquet


100%|███████████████████████████████████████| 1821/1821 [00:58<00:00, 31.19it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_12_more.parquet
Year: 2022, Month: 1, total: 1975, less than 2: 1303, more than 2:660


100%|███████████████████████████████████████| 1303/1303 [00:36<00:00, 35.94it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_1_less.parquet


100%|█████████████████████████████████████████| 660/660 [00:20<00:00, 32.01it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_1_more.parquet
Year: 2022, Month: 2, total: 2031, less than 2: 1390, more than 2:635


100%|███████████████████████████████████████| 1390/1390 [00:35<00:00, 38.73it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_2_less.parquet


100%|█████████████████████████████████████████| 635/635 [00:23<00:00, 27.09it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_2_more.parquet
Year: 2022, Month: 3, total: 2244, less than 2: 1430, more than 2:807


100%|███████████████████████████████████████| 1430/1430 [00:43<00:00, 32.76it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_3_less.parquet


100%|█████████████████████████████████████████| 807/807 [00:31<00:00, 25.98it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_3_more.parquet
Year: 2022, Month: 4, total: 2323, less than 2: 1558, more than 2:758


100%|███████████████████████████████████████| 1558/1558 [00:46<00:00, 33.20it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_4_less.parquet


100%|█████████████████████████████████████████| 758/758 [00:29<00:00, 25.68it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_4_more.parquet
Year: 2022, Month: 5, total: 2171, less than 2: 1428, more than 2:734


100%|███████████████████████████████████████| 1428/1428 [00:39<00:00, 35.89it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_5_less.parquet


100%|█████████████████████████████████████████| 734/734 [00:24<00:00, 29.57it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_5_more.parquet
Year: 2022, Month: 6, total: 3698, less than 2: 2459, more than 2:1230


100%|███████████████████████████████████████| 2459/2459 [01:14<00:00, 33.18it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_6_less.parquet


100%|███████████████████████████████████████| 1230/1230 [00:44<00:00, 27.87it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_6_more.parquet
Year: 2022, Month: 7, total: 2079, less than 2: 1307, more than 2:765


100%|███████████████████████████████████████| 1307/1307 [00:39<00:00, 32.76it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_7_less.parquet


100%|█████████████████████████████████████████| 765/765 [00:26<00:00, 29.11it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_7_more.parquet
Year: 2022, Month: 8, total: 2013, less than 2: 1229, more than 2:780


100%|███████████████████████████████████████| 1229/1229 [00:41<00:00, 29.53it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_8_less.parquet


100%|█████████████████████████████████████████| 780/780 [00:28<00:00, 27.37it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_8_more.parquet
Year: 2022, Month: 9, total: 2214, less than 2: 1301, more than 2:903


100%|███████████████████████████████████████| 1301/1301 [00:42<00:00, 30.85it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_9_less.parquet


100%|█████████████████████████████████████████| 903/903 [00:32<00:00, 27.79it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_9_more.parquet
Year: 2022, Month: 10, total: 2377, less than 2: 1481, more than 2:887


100%|███████████████████████████████████████| 1481/1481 [00:47<00:00, 31.47it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_10_less.parquet


100%|█████████████████████████████████████████| 887/887 [00:32<00:00, 27.41it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_10_more.parquet
Year: 2022, Month: 11, total: 2897, less than 2: 1860, more than 2:1019


100%|███████████████████████████████████████| 1860/1860 [00:52<00:00, 35.65it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_11_less.parquet


100%|███████████████████████████████████████| 1019/1019 [00:37<00:00, 27.04it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_11_more.parquet
Year: 2022, Month: 12, total: 4506, less than 2: 2845, more than 2:1644


100%|███████████████████████████████████████| 2845/2845 [01:20<00:00, 35.42it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_12_less.parquet


100%|███████████████████████████████████████| 1644/1644 [01:02<00:00, 26.34it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_12_more.parquet
Year: 2021, Month: 1, total: 1677, less than 2: 1142, more than 2:515


100%|███████████████████████████████████████| 1142/1142 [00:37<00:00, 30.42it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_1_less.parquet


100%|█████████████████████████████████████████| 515/515 [00:19<00:00, 27.10it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_1_more.parquet
Year: 2021, Month: 2, total: 1517, less than 2: 1017, more than 2:487


100%|███████████████████████████████████████| 1017/1017 [00:33<00:00, 30.49it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_2_less.parquet


100%|█████████████████████████████████████████| 487/487 [00:17<00:00, 27.82it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_2_more.parquet
Year: 2021, Month: 3, total: 2025, less than 2: 1302, more than 2:713


100%|███████████████████████████████████████| 1302/1302 [00:41<00:00, 31.05it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_3_less.parquet


100%|█████████████████████████████████████████| 713/713 [00:26<00:00, 27.17it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_3_more.parquet
Year: 2021, Month: 4, total: 2112, less than 2: 1539, more than 2:566


100%|███████████████████████████████████████| 1539/1539 [00:47<00:00, 32.18it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_4_less.parquet


100%|█████████████████████████████████████████| 566/566 [00:20<00:00, 27.06it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_4_more.parquet
Year: 2021, Month: 5, total: 2317, less than 2: 1604, more than 2:691


100%|███████████████████████████████████████| 1604/1604 [00:50<00:00, 31.60it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_5_less.parquet


100%|█████████████████████████████████████████| 691/691 [00:27<00:00, 25.58it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_5_more.parquet
Year: 2021, Month: 6, total: 2896, less than 2: 1896, more than 2:981


100%|███████████████████████████████████████| 1896/1896 [01:09<00:00, 27.13it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_6_less.parquet


100%|█████████████████████████████████████████| 981/981 [00:38<00:00, 25.27it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_6_more.parquet
Year: 2021, Month: 7, total: 2047, less than 2: 1335, more than 2:708


100%|███████████████████████████████████████| 1335/1335 [00:45<00:00, 29.38it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_7_less.parquet


100%|█████████████████████████████████████████| 708/708 [00:25<00:00, 27.95it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_7_more.parquet
Year: 2021, Month: 8, total: 1935, less than 2: 1122, more than 2:797


100%|███████████████████████████████████████| 1122/1122 [00:38<00:00, 29.45it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_8_less.parquet


100%|█████████████████████████████████████████| 797/797 [00:29<00:00, 26.58it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_8_more.parquet
Year: 2021, Month: 9, total: 2044, less than 2: 1248, more than 2:786


100%|███████████████████████████████████████| 1248/1248 [00:43<00:00, 28.79it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_9_less.parquet


100%|█████████████████████████████████████████| 786/786 [00:28<00:00, 27.12it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_9_more.parquet
Year: 2021, Month: 10, total: 1851, less than 2: 1127, more than 2:693


100%|███████████████████████████████████████| 1127/1127 [00:36<00:00, 31.07it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_10_less.parquet


100%|█████████████████████████████████████████| 693/693 [00:25<00:00, 27.58it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_10_more.parquet
Year: 2021, Month: 11, total: 2214, less than 2: 1407, more than 2:793


100%|███████████████████████████████████████| 1407/1407 [00:47<00:00, 29.77it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_11_less.parquet


100%|█████████████████████████████████████████| 793/793 [00:30<00:00, 26.41it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_11_more.parquet
Year: 2021, Month: 12, total: 3704, less than 2: 2289, more than 2:1401


100%|███████████████████████████████████████| 2289/2289 [01:19<00:00, 28.71it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_12_less.parquet


100%|███████████████████████████████████████| 1401/1401 [00:53<00:00, 26.22it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_12_more.parquet


## Overall Trend 

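This section exports each monthly article collection, without any binning, to a parquet file so that the overall trend in the usage of GPT in scientific writing can be analyzed.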

In [14]:
# Export every per-month pickle under `base_loc/<year>/` as a parquet file
# in `dest/<year>/`, one parquet file per pickle, with no binning applied.
#
# NOTE(review): `dest` is not defined in this cell; it is whatever an earlier
# cell left in the kernel. The recorded output shows these files landing under
# ".../parquet_files/num_authors/...", i.e. the previous section's directory.
# Confirm that is intended, or define `dest` explicitly for this section.
for year in years:
    dest_year = dest / str(year)
    dest_year.mkdir(parents=True, exist_ok=True)
    year_dir = f"{base_loc}/{year}"
    for file in os.listdir(year_dir):
        # Files with "whole" in their name are excluded from the export
        # (presumably whole-year aggregates -- TODO confirm).
        if "whole" in file:
            continue
        full_path = f"{year_dir}/{file}"
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(full_path, "rb") as f:
            data = pickle.load(f)
        # Remove the ".pkl" suffix exactly. str.strip(".pkl") would instead
        # strip any of the characters '.', 'p', 'k', 'l' from BOTH ends of the
        # name and can silently mangle it (e.g. "plots_1.pkl" -> "ots_1").
        name = file[: -len(".pkl")] if file.endswith(".pkl") else file
        # The file handle is already closed here; the export does not need it.
        data.export_parquet(dest_year, name)

100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 20.08it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_5.parquet


100%|███████████████████████████████████████| 1042/1042 [00:32<00:00, 31.70it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_4.parquet


100%|███████████████████████████████████████████| 24/24 [00:01<00:00, 23.56it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_6.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_7.parquet


100%|███████████████████████████████████████| 2727/2727 [01:17<00:00, 35.12it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_3.parquet


100%|███████████████████████████████████████| 2192/2192 [01:05<00:00, 33.38it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_2.parquet


100%|███████████████████████████████████████| 2726/2726 [01:30<00:00, 30.20it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_1.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_11.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_10.parquet


100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 24.20it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_12.parquet


0it [00:00, ?it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_9.parquet


100%|█████████████████████████████████████████████| 4/4 [00:00<00:00, 22.36it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2024/education_2024_8.parquet


100%|███████████████████████████████████████| 2925/2925 [01:30<00:00, 32.41it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_11.parquet


100%|███████████████████████████████████████| 2548/2548 [01:21<00:00, 31.36it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_1.parquet


100%|███████████████████████████████████████| 2749/2749 [01:28<00:00, 30.97it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_10.parquet


100%|███████████████████████████████████████| 4480/4480 [02:23<00:00, 31.17it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_12.parquet


100%|███████████████████████████████████████| 3322/3322 [01:44<00:00, 31.76it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_3.parquet


100%|███████████████████████████████████████| 2424/2424 [01:14<00:00, 32.57it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_2.parquet


100%|███████████████████████████████████████| 4631/4631 [02:25<00:00, 31.88it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_6.parquet


100%|███████████████████████████████████████| 3210/3210 [01:46<00:00, 30.24it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_7.parquet


100%|███████████████████████████████████████| 3505/3505 [01:39<00:00, 35.29it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_5.parquet


100%|███████████████████████████████████████| 3440/3440 [01:38<00:00, 34.85it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_4.parquet


100%|███████████████████████████████████████| 2622/2622 [01:21<00:00, 32.31it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_9.parquet


100%|███████████████████████████████████████| 2765/2765 [01:28<00:00, 31.25it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2023/education_2023_8.parquet


100%|███████████████████████████████████████| 2244/2244 [01:10<00:00, 32.06it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_3.parquet


100%|███████████████████████████████████████| 2031/2031 [01:00<00:00, 33.69it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_2.parquet


100%|███████████████████████████████████████| 1975/1975 [01:01<00:00, 31.92it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_1.parquet


100%|███████████████████████████████████████| 2171/2171 [01:05<00:00, 33.24it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_5.parquet


100%|███████████████████████████████████████| 2323/2323 [01:11<00:00, 32.65it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_4.parquet


100%|███████████████████████████████████████| 3698/3698 [01:53<00:00, 32.64it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_6.parquet


100%|███████████████████████████████████████| 2079/2079 [01:09<00:00, 29.95it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_7.parquet


100%|███████████████████████████████████████| 4506/4506 [02:03<00:00, 36.47it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_12.parquet


100%|███████████████████████████████████████| 2214/2214 [01:07<00:00, 32.76it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_9.parquet


100%|███████████████████████████████████████| 2377/2377 [01:05<00:00, 36.55it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_10.parquet


100%|███████████████████████████████████████| 2897/2897 [01:20<00:00, 35.99it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_11.parquet


100%|███████████████████████████████████████| 2013/2013 [00:58<00:00, 34.50it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2022/education_2022_8.parquet


100%|███████████████████████████████████████| 1517/1517 [00:42<00:00, 36.08it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_2.parquet


100%|███████████████████████████████████████| 2025/2025 [00:59<00:00, 33.84it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_3.parquet


100%|███████████████████████████████████████| 1677/1677 [00:48<00:00, 34.64it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_1.parquet


100%|███████████████████████████████████████| 3704/3704 [01:53<00:00, 32.53it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_12.parquet


100%|███████████████████████████████████████| 2112/2112 [00:59<00:00, 35.27it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_4.parquet


100%|███████████████████████████████████████| 2317/2317 [01:04<00:00, 35.94it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_5.parquet


100%|███████████████████████████████████████| 2214/2214 [01:04<00:00, 34.59it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_11.parquet


100%|███████████████████████████████████████| 2047/2047 [01:02<00:00, 32.58it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_7.parquet


100%|███████████████████████████████████████| 2896/2896 [01:26<00:00, 33.65it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_6.parquet


100%|███████████████████████████████████████| 1851/1851 [00:51<00:00, 35.61it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_10.parquet


100%|███████████████████████████████████████| 1935/1935 [00:57<00:00, 33.68it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_8.parquet


100%|███████████████████████████████████████| 2044/2044 [01:01<00:00, 33.05it/s]


File Exported as: inference_data/parquet_files/num_authors/education/2021/education_2021_9.parquet
