In [1]:
import bz2 
import pickle
import _pickle as cPickle
import os
import pandas as pd
from itertools import chain
import numpy as np
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))



In [2]:
### Read in Files

path = os.getcwd()
folder = '\\Data'
files = os.listdir(path + folder)

files

['OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C105795698.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C111368507.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C12554922.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C144024400.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C153294291.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C184779094.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C8058405.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C97355855.pbz2']

In [43]:
# load in cPickle file for Geophysics (OpenAlex ID C8058405)
discipline = 'C8058405'



# load in cPickle file for Statistics (OpenAlex ID C105795698)
#discipline = 'C105795698'

# load in cPickle file for Oceanography (OpenAlex ID C111368507)
#discipline = 'C111368507'

# load in cPickle file for Biophysics (OpenAlex ID C12554922)
#discipline = 'C12554922'

# load in cPickle file for Meteorology (OpenAlex ID C153294291)
#discipline = 'C153294291'

# load in cPickle file for Atomic physics (OpenAlex ID C184779094)
#discipline = 'C184779094'

# load in cPickle file for Thermodynamics (OpenAlex ID C97355855)
#discipline = 'C97355855'

Data_Packet = 'Data\OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_' + discipline + '.pbz2'

f = bz2.BZ2File(Data_Packet, 'rb')
paper_id_year_df = cPickle.load(f)
corpus_dict = cPickle.load(f)
citation_df = cPickle.load(f)

# Object 1: dataframe with all the paper IDs and the year they were published: needed for the corpus_dict
geophysics_paper_id_year_df = paper_id_year_df
display(geophysics_paper_id_year_df.head())

# Object 2: dictionary where the keys are the paper IDs and the values are a list containing the extracted terms
# Structure: corpus_dict[Discipline_ID][paper_id] = [term1, term2, term3,...]: incl. eN and non-EN terms
geophysics_corpus_dict = corpus_dict

# sample call for single work
display(geophysics_corpus_dict.get('C8058405').get('https://openalex.org/W2765252368'))

# full call for all works
#display(next(iter(geophysics_corpus_dict.items())))

# Object 3: dataframe that's an edgelist between receiver RORs and sender RORs per year (= research organization registry)
display(citation_df.head())

Unnamed: 0,work_id,publication_year,Discipline
0,https://openalex.org/W2765252368,2017,C8058405
1,https://openalex.org/W2135405592,2009,C8058405
2,https://openalex.org/W2116007522,1971,C8058405
3,https://openalex.org/W2908600692,2019,C8058405
4,https://openalex.org/W3165125549,2021,C8058405


['substorm expansion',
 'wave frequencies',
 'small substorm',
 'expansion phase',
 'small substorm expansion',
 'substorm expansion phase',
 'substorm onset',
 'characteristics of the onset',
 'physics of substorm',
 'frequencies concurrent']

Unnamed: 0,Sender_ROR,Receiver_ROR,Year,Citations,Discipline
0,https://ror.org/00hj8s172,https://ror.org/00hj8s172,1966,1.0,C8058405
1,https://ror.org/042nb2s44,https://ror.org/00hj8s172,1966,1.0,C8058405
2,https://ror.org/016st3p78,https://ror.org/02acart68,1967,0.090909,C8058405
3,https://ror.org/027m9bs27,https://ror.org/02acart68,1967,0.25,C8058405
4,https://ror.org/02acart68,https://ror.org/02acart68,1967,0.5,C8058405


In [44]:
### Preprocess Data for single field

def reformat(dictionary):
    '''
    Takes the Object 2 dictionary and casts it into a dataframe
    '''
    # remove OpenAlex ID
    field_dictionary = dictionary[1]
    # reformat to list
    field_list = list(map(list, field_dictionary.items()))
    # cast into dataframe and rename columns
    field_df = pd.DataFrame(field_list)
    field_df = field_df.rename(columns = {0: 'work_id', 1: 'terms'})
    
    return field_df

def add_year(field_df, paper_id_year_df):
    '''
    Takes the Object 2 dataframe and merges it with corresponding publication years from Object 1
    '''
    # merge the two dataframes
    merged = pd.merge(field_df, paper_id_year_df, on ='work_id', how ='inner')
    # reorder columns
    columns = merged.columns.tolist()
    columns = columns[-1:] + columns[-2:-1] + columns[:-2]
    reordered = merged[columns]
    
    return reordered

def lowercase(dataframe, column):
    '''
    takes a dataframe and lowercases everything within a specified column (column contents must be in a list of strings)
    '''
    dataframe[column] = dataframe[column].apply(lambda lst: [word.lower() for word in lst])
    return dataframe

def counts_per_document(reordered):
    '''
    Add number of documents, terms, unique terms, words, and unique words per document to the dataframe
    '''
    pd.set_option('mode.chained_assignment',None)
    
    reordered.loc[:,'NoD_pD'] = 1
    reordered.loc[:,'NoT_pD'] = [len(cell) for cell in reordered['terms']]
    reordered.loc[:,'NoUT_pD'] = [len(set(cell)) for cell in reordered['terms']]
    reordered.loc[:,'NoW_pD'] = [sum([len(term.split()) for term in cell]) for cell in reordered['terms']]
    reordered.loc[:,'NoUW_pD'] = [len(set([item for sublist in [term.split() for term in cell]
                                           for item in sublist])) for cell in reordered['terms']]
    
    #display(reordered.describe())
    return reordered

def counts_per_year(reordered):
    '''
    Add number of documents, terms, unique terms, words, and unique words per year to the dataframe
    '''
    # aggregate documents per year and concatenate the list(s) of words
    words = reordered.groupby('publication_year', as_index=False)['stemmed_tokens'].agg(lambda x: list(chain.from_iterable(x)))
    # aggregate documents per year and count the number of documents
    documents = reordered.groupby('publication_year', as_index=False).size()
    # put the two dataframes together
    grouped = pd.concat([words, documents['size']], axis = 1)
    
    # get counts of terms and words per year
    grouped = grouped.rename(columns = {'size':'NoD'})
    #grouped.loc[:,'NoT'] = [len(cell) for cell in grouped['terms']]
    #grouped.loc[:,'NoUT'] = [len(set(cell)) for cell in grouped['terms']]
    #grouped.loc[:,'NoW'] = [sum([len(term.split()) for term in cell]) for cell in grouped['words']]
    #grouped.loc[:,'NoUW'] = [len(set([item for sublist in [term.split() for term in cell]
                                      #for item in sublist])) for cell in grouped['words']]
    grouped.loc[:,'NoS'] = [sum([len(term.split()) for term in cell]) for cell in grouped['stemmed_tokens']]
    grouped.loc[:,'NoUS'] = [len(set([item for sublist in [term.split() for term in cell]
                                      for item in sublist])) for cell in grouped['stemmed_tokens']]
    
    return grouped

def split_string(dataframe, column):
    '''
    Split strings into substrings for a given column in the dataframe, creating the new column 'words'
    '''
    dataframe['words'] = dataframe[column].apply(lambda lst: [word for line in lst for word in line.split()])
    return dataframe

def remove_stopwords(dataframe, column):
    '''
    Remove stopwords from a list of words
    '''
    dataframe[column] = dataframe[column].apply(lambda lst: [word for word in lst if word not in stopwords])
    return dataframe

def wordcounter(wordlist, n):
    '''
    Counts terms/words within a list of strings, returns top n terms/words over time
    Idea: Use output as illustrative example of how field progresses (validate with field-specific paper on paradigm shift)
    '''
    counts = {}
    for word in wordlist:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    
    # convert dictionary to list of tuples
    lst_counts = [(key, value) for key, value in counts.items()]
    #sort in descending order
    lst_counts.sort(key = lambda x: x[1], reverse=True)
    
    return lst_counts[:n]

def wordcounter_column(dataframe, column, n):
    '''
    Apply wordcounter() function to the entire column of a dataframe, returns a new column with top n items per year
    '''
    # define the new column name and fill it with nan values
    if n != 1:
        new_column = 'top ' + str(n) + ' ' + column
    else:
        new_column = 'top ' + str(n) + ' ' + column[:-1]
    dataframe[new_column] = np.nan
    
    # loop through each row to get most frequent words
    for index, row in dataframe.iterrows():
        dataframe.iloc[index,dataframe.columns.get_loc(new_column)] = [wordcounter(row[column], n)]    
    
    # above line throws an error if outer brackets are removed, the followinf code flattens the nested list
    # dataframe[new_column] =  dataframe[new_column].apply(np.ravel)
    
    return dataframe

def wordcounter_abs_and_perc(dataframe, column, n, percentage):
    '''
    UPDATED VERSION OF WORDCOUNTER_COLUMN
    
    Apply wordcounter() function to the entire column of a dataframe, returns a new column with either
    top n items per year or top n percent of items per year
    
    Function takes in a dataframe, a column name ('words' or 'terms'), n (either as whole number of cases or as percentage,
    and boolean percentage ('YES' or 'NO' to indicate if number is top n or top n percent))
    '''    
    # define the new column name conditional on percentage
    if percentage == 'YES':
        # get number of unique words/terms based on given percentage
        new_counter = 'NoU' + str(column[0]).capitalize() + ' (t' + str(n) + '%)'        
        new_column = 't' + str(n) + '% of ' + column
    elif n!= 1:
        new_column = 't' + str(n) + ' ' + column
    else:
        new_column = 't' + str(n) + ' ' + column[:-1]
        
    # populate new_counter column with an integer of terms, if percentage given
    if percentage == 'NO':
        pass
    #elif column == 'terms':
        #dataframe[new_counter] = dataframe['NoUT'].multiply((n/100)).round().astype(np.int64)
    #elif column == 'words':
        #dataframe[new_counter] = dataframe['NoUW'].multiply((n/100)).round().astype(np.int64)
    elif column == 'stemmed_tokens':
        dataframe[new_counter] = dataframe['NoUS'].multiply((n/100)).round().astype(np.int64)
        
    # fill other column with nan values
    dataframe[new_column] = np.nan
    
    # loop through each row to get most frequent words
    for index, row in dataframe.iterrows():        
        # condition for top n % of terms
        if percentage == 'YES':
            NoUX = dataframe.iloc[index,dataframe.columns.get_loc(new_counter)]
            # account for edge case of NoUT being 0
            if NoUX >= 1:
                dataframe.iloc[index,dataframe.columns.get_loc(new_column)] = [wordcounter(row[column], NoUX)]
            else:
                dataframe.iloc[index,dataframe.columns.get_loc(new_column)] = np.nan
        # condition for top n terms
        else:
            dataframe.iloc[index,dataframe.columns.get_loc(new_column)] = [wordcounter(row[column], n)]
            
        # above line throws an error if outer brackets are removed, the following code flattens the nested list
        # dataframe[new_column] =  dataframe[new_column].apply(np.ravel)
        
    return dataframe

In [45]:
import nltk
from nltk.stem.snowball import SnowballStemmer

regex = r'[^a-z\s]'

def tokenize(text):
    '''
    Cleans, tokenizes + stems Pandas series of strings    
    Returns pandas series of lists of tokens
    '''
    # Clean text with regex
    clean = text.str.lower().str.replace(regex, '', regex=True)

    # Anonymous tokenizer + stemmer functions
    stop = nltk.corpus.stopwords.words('english')
    tokenize = lambda text: [i for i in nltk.word_tokenize(text) if i not in stop]
    stemmer = lambda tokens: [SnowballStemmer('english').stem(token) for token in tokens]

    # Tokenize and stem clean text
    tokens = clean.apply(tokenize)
    stemmed_tokens = tokens.apply(stemmer)
    
    return stemmed_tokens

In [46]:
a = lowercase(add_year(reformat(next(iter(geophysics_corpus_dict.items()))), geophysics_paper_id_year_df), 'terms')

# remove rows with empty lists
b = a[a['terms'].map(lambda d: len(d)) > 0]

# reformat row as strings
b = b.astype({'terms':'string'})

# split terms into words and stem
b['stemmed_tokens'] = tokenize(b['terms'])

# get ocunts per year for documents, words ,adn unique words (now no more terms because of stemming)
b = counts_per_year(b)

# top 10, 50, 100, 500 unique stemmed tokens
c = wordcounter_abs_and_perc(b, 'stemmed_tokens', 10, 'NO')
c = wordcounter_abs_and_perc(c, 'stemmed_tokens', 50, 'NO')
c = wordcounter_abs_and_perc(c, 'stemmed_tokens', 100, 'NO')
c = wordcounter_abs_and_perc(c, 'stemmed_tokens', 500, 'NO')

# top 1%, 10%, 20%, 25 % of unique stemmed tokens
c = wordcounter_abs_and_perc(c, 'stemmed_tokens', 1, 'YES')
c = wordcounter_abs_and_perc(c, 'stemmed_tokens', 10, 'YES')
c = wordcounter_abs_and_perc(c, 'stemmed_tokens', 20, 'YES')
c = wordcounter_abs_and_perc(c, 'stemmed_tokens', 25, 'YES')

# flatten list, calculate len, and divide by 2 since list contains words and their count (should max at 100)
#c.loc[:,'t100 tokens count'] = c['t100 words'].apply(np.ravel).apply(len).div(2).astype(np.int64)
# select relevant keys and set publication_year to index
#d = c[['publication_year', 'NoD', 'NoT', 'NoUT', 'NoW', 'NoUW', 't100 words count', 't100 terms count', 'NoUW (t25%)',
      #'NoUT (t25%)']]

# fill NaN values with 0 (cases where terms do not meet threshold for meaningful results for a given top percentage)
c.fillna(0)

# convert publication year back to integer
c = c.astype({'publication_year':'int'})

d = c.set_index('publication_year')

# create complete index without missing years
new_index = list(range(int(min(d.index)), int(max(d.index)) + 1))

# create empty dataframe with complete index
e = pd.DataFrame(np.nan, index = new_index, columns = d.columns)

e.index.name = 'publication_year'

f = e.combine_first(d)
f.reset_index(inplace=True)

In [47]:
f

Unnamed: 0,publication_year,stemmed_tokens,NoD,NoS,NoUS,t10 stemmed_tokens,t50 stemmed_tokens,t100 stemmed_tokens,t500 stemmed_tokens,NoUS (t1%),t1% of stemmed_tokens,NoUS (t10%),t10% of stemmed_tokens,NoUS (t20%),t20% of stemmed_tokens,NoUS (t25%),t25% of stemmed_tokens
0,1832,"[mean, motion, mean, motion, motion, planet, m...",1.0,25.0,11.0,"[[(motion, 7), (mean, 6), (planet, 2), (earth,...","[[(motion, 7), (mean, 6), (planet, 2), (earth,...","[[(motion, 7), (mean, 6), (planet, 2), (earth,...","[[(motion, 7), (mean, 6), (planet, 2), (earth,...",0.0,,1.0,"[[(motion, 7)]]",2.0,"[[(motion, 7), (mean, 6)]]",3.0,"[[(motion, 7), (mean, 6), (planet, 2)]]"
1,1833,,,,,,,,,,,,,,,,
2,1834,,,,,,,,,,,,,,,,
3,1835,,,,,,,,,,,,,,,,
4,1836,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,2018,"[unit, boundari, geostatist, integr, geotechn,...",180.0,3952.0,887.0,"[[(field, 90), (seismic, 67), (model, 57), (ma...","[[(field, 90), (seismic, 67), (model, 57), (ma...","[[(field, 90), (seismic, 67), (model, 57), (ma...","[[(field, 90), (seismic, 67), (model, 57), (ma...",9.0,"[[(field, 90), (seismic, 67), (model, 57), (ma...",89.0,"[[(field, 90), (seismic, 67), (model, 57), (ma...",177.0,"[[(field, 90), (seismic, 67), (model, 57), (ma...",222.0,"[[(field, 90), (seismic, 67), (model, 57), (ma..."
187,2019,"[lunar, crater, terrestri, crater, crater, dia...",279.0,6103.0,1113.0,"[[(model, 94), (field, 85), (structur, 72), (w...","[[(model, 94), (field, 85), (structur, 72), (w...","[[(model, 94), (field, 85), (structur, 72), (w...","[[(model, 94), (field, 85), (structur, 72), (w...",11.0,"[[(model, 94), (field, 85), (structur, 72), (w...",111.0,"[[(model, 94), (field, 85), (structur, 72), (w...",223.0,"[[(model, 94), (field, 85), (structur, 72), (w...",278.0,"[[(model, 94), (field, 85), (structur, 72), (w..."
188,2020,"[defect, zone, observ, possibl, possibl, defec...",540.0,11527.0,1572.0,"[[(field, 190), (wave, 168), (model, 141), (ma...","[[(field, 190), (wave, 168), (model, 141), (ma...","[[(field, 190), (wave, 168), (model, 141), (ma...","[[(field, 190), (wave, 168), (model, 141), (ma...",16.0,"[[(field, 190), (wave, 168), (model, 141), (ma...",157.0,"[[(field, 190), (wave, 168), (model, 141), (ma...",314.0,"[[(field, 190), (wave, 168), (model, 141), (ma...",393.0,"[[(field, 190), (wave, 168), (model, 141), (ma..."
189,2021,"[ionospher, propag, receiv, amplitud, statist,...",442.0,9556.0,1406.0,"[[(wave, 156), (field, 154), (model, 124), (ma...","[[(wave, 156), (field, 154), (model, 124), (ma...","[[(wave, 156), (field, 154), (model, 124), (ma...","[[(wave, 156), (field, 154), (model, 124), (ma...",14.0,"[[(wave, 156), (field, 154), (model, 124), (ma...",141.0,"[[(wave, 156), (field, 154), (model, 124), (ma...",281.0,"[[(wave, 156), (field, 154), (model, 124), (ma...",352.0,"[[(wave, 156), (field, 154), (model, 124), (ma..."


In [40]:
f.to_csv('geophysics_df.csv')

In [None]:
'''
EXPLANATION

publication_year denotes the year of publication, starting with the earliest available data. For years after the first
observation for which data is not available, all entries are coded as NaN

stemmed_tokens is the full list of stemmed tokens, including tokens that were used multiple times

NoD counts then umber of documents
NoS counts the number of stemmed tokens
NoUS counts the number of unique stemmed tokens

t10 stemmed_tokens, t50 stemmed_tokens, t100 stemmed_tokens, and t500 stemmed_tokens represent the top 10, 50, 100, and 500
tokens by frequency, in the format of [('token', frequency count), ('token2', frequency count), etc.]. For years with a
number of unique stemmed tokens below the top n, the full number of tokens will be displayed
(e.g., 11 tokens for t50 stemmed_tokens in 1832)

NoUS (t1%), NoUS (t10%), NoUS (t20%), and NoUS (t25%) count the number of unique stemmed tokens in the top 1, 10, 20, and 25
percent. This can be 0 for smaller corpus sizes (e.g., top 1% for 11 unique stemmed tokens is 0, hence NoUS (t1%) is 0)

t1% of stemmed_tokens, t10% of stemmed_tokens, t20% of stemmed_tokens, and t25% of stemmed_tokens list the actual tokens
with their frequency of occurence, same as t10 stemmed_tokens (and subsequent columns) did before 
'''