In [1]:
import bz2 
import pickle
import _pickle as cPickle
import os
import pandas as pd
from itertools import chain
import numpy as np
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the object imported from nltk.corpus (line above) to a plain
# set of English stopword strings; remove_stopwords() reads this module-level set. Because the name is
# reused, running this line a second time without re-running the import fails (a set has no .words()).
stopwords = set(stopwords.words('english'))



In [2]:
### Read in Files

# List the contents of the 'Data' folder inside the current working directory.
# os.sep replaces the hard-coded backslash: on Windows `folder` is still '\\Data',
# on other platforms it becomes '/Data', so `path + folder` is valid everywhere.
path = os.getcwd()
folder = os.sep + 'Data'
files = os.listdir(path + folder)

files

['OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C105795698.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C111368507.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C12554922.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C153294291.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C184779094.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C8058405.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C97355855.pbz2',
 'Test']

In [18]:
# load in cPickle file for Geophysics (OpenAlex ID C8058405)
discipline = 'C8058405'
# os.path.join avoids the invalid '\O' escape sequence of a 'Data\OUTPUT...' literal and uses the
# separator of the current platform (the resulting string is unchanged on Windows)
Data_Packet = os.path.join('Data', 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_' + discipline + '.pbz2')

# The archive holds three pickled objects stored one after another; they are read back in that order.
# The with-block closes the compressed file as soon as all three objects are loaded.
# NOTE(review): unpickling can execute arbitrary code - only load archives from a trusted source.
with bz2.BZ2File(Data_Packet, 'rb') as f:
    paper_id_year_df = cPickle.load(f)
    corpus_dict = cPickle.load(f)
    citation_df = cPickle.load(f)

# Object 1: dataframe with all the paper IDs and the year they were published: needed for the corpus_dict
geophysics_paper_id_year_df = paper_id_year_df
display(geophysics_paper_id_year_df.head())

# Object 2: dictionary where the keys are the paper IDs and the values are a list containing the extracted terms
# Structure: corpus_dict[Discipline_ID][paper_id] = [term1, term2, term3,...]: incl. EN and non-EN terms
geophysics_corpus_dict = corpus_dict

# sample call for single work (looked up under the discipline ID defined above)
display(geophysics_corpus_dict.get(discipline).get('https://openalex.org/W2765252368'))

# full call for all works
#display(next(iter(geophysics_corpus_dict.items())))

# Object 3: dataframe that's an edgelist between sender RORs and receiver RORs per year
# (ROR = Research Organization Registry)
display(citation_df.head())

Unnamed: 0,work_id,publication_year,Discipline
0,https://openalex.org/W2765252368,2017,C8058405
1,https://openalex.org/W2135405592,2009,C8058405
2,https://openalex.org/W2116007522,1971,C8058405
3,https://openalex.org/W2908600692,2019,C8058405
4,https://openalex.org/W3165125549,2021,C8058405


['substorm expansion',
 'wave frequencies',
 'small substorm',
 'expansion phase',
 'small substorm expansion',
 'substorm expansion phase',
 'substorm onset',
 'characteristics of the onset',
 'physics of substorm',
 'frequencies concurrent']

Unnamed: 0,Sender_ROR,Receiver_ROR,Year,Citations,Discipline
0,https://ror.org/00hj8s172,https://ror.org/00hj8s172,1966,1.0,C8058405
1,https://ror.org/042nb2s44,https://ror.org/00hj8s172,1966,1.0,C8058405
2,https://ror.org/016st3p78,https://ror.org/02acart68,1967,0.090909,C8058405
3,https://ror.org/027m9bs27,https://ror.org/02acart68,1967,0.25,C8058405
4,https://ror.org/02acart68,https://ror.org/02acart68,1967,0.5,C8058405


In [9]:
### Preprocess Data for single field

def reformat(dictionary):
    '''
    Takes the Object 2 entry of a single discipline and casts it into a dataframe

    dictionary: a (discipline_id, {work_id: [term1, term2, ...]}) pair, e.g. next(iter(corpus_dict.items()));
                only element [1] (the mapping of work IDs to terms) is used
    Returns a dataframe with one row per work and the columns 'work_id' and 'terms'
    '''
    # remove OpenAlex ID, keep the {work_id: terms} mapping
    field_dictionary = dictionary[1]
    # cast into dataframe; naming the columns at construction keeps 'work_id' and 'terms' present
    # even when the mapping is empty, so a later merge on 'work_id' still finds its key
    field_df = pd.DataFrame(list(field_dictionary.items()), columns=['work_id', 'terms'])

    return field_df

def add_year(field_df, paper_id_year_df):
    '''
    Attach the Object 1 columns (e.g. publication year) to the Object 2 dataframe via an inner merge on 'work_id'

    The last two columns of the merged frame are moved to the front in reversed order; all other columns follow
    in their merged order
    '''
    combined = field_df.merge(paper_id_year_df, on='work_id', how='inner')

    # last column first, second-to-last column second, then the remaining columns
    names = list(combined.columns)
    new_order = names[-2:][::-1] + names[:-2]

    return combined[new_order]

def lowercase(dataframe, column):
    '''
    Lowercase every string in the lists stored in the given column (each cell must be a list of strings);
    the dataframe is modified in place and returned
    '''
    def to_lower(strings):
        return [text.lower() for text in strings]

    dataframe[column] = dataframe[column].apply(to_lower)
    return dataframe

def counts_per_document(reordered):
    '''
    Append per-document counts to the dataframe (modified in place and returned):
    NoD_pD (always 1), NoT_pD (terms), NoUT_pD (unique terms), NoW_pD (words), NoUW_pD (unique words)
    '''
    # switches off pandas' chained-assignment warning; this is a global option and stays set after the call
    pd.set_option('mode.chained_assignment', None)

    def word_total(terms):
        # words = whitespace-separated parts of each term
        return sum(len(term.split()) for term in terms)

    def unique_word_total(terms):
        return len({word for term in terms for word in term.split()})

    reordered.loc[:, 'NoD_pD'] = 1
    reordered.loc[:, 'NoT_pD'] = [len(terms) for terms in reordered['terms']]
    reordered.loc[:, 'NoUT_pD'] = [len(set(terms)) for terms in reordered['terms']]
    reordered.loc[:, 'NoW_pD'] = [word_total(terms) for terms in reordered['terms']]
    reordered.loc[:, 'NoUW_pD'] = [unique_word_total(terms) for terms in reordered['terms']]

    return reordered

def counts_per_year(reordered):
    '''
    Aggregate the documents per publication year and return a new dataframe with one row per year:
    the concatenated 'terms' lists plus NoD (documents), NoT (terms), NoUT (unique terms), NoW (words)
    and NoUW (unique words)
    '''
    def word_total(terms):
        return sum(len(term.split()) for term in terms)

    def unique_word_total(terms):
        return len({word for term in terms for word in term.split()})

    by_year = reordered.groupby('publication_year', as_index=False)
    # one long list of terms per year
    words = by_year['terms'].agg(lambda x: list(chain.from_iterable(x)))
    # number of documents per year (column 'size')
    documents = by_year.size()

    # both frames have one row per year; put them side by side and name the document count
    grouped = pd.concat([words, documents['size']], axis=1).rename(columns={'size': 'NoD'})

    # counts of terms and words per year
    grouped.loc[:, 'NoT'] = [len(terms) for terms in grouped['terms']]
    grouped.loc[:, 'NoUT'] = [len(set(terms)) for terms in grouped['terms']]
    grouped.loc[:, 'NoW'] = [word_total(terms) for terms in grouped['terms']]
    grouped.loc[:, 'NoUW'] = [unique_word_total(terms) for terms in grouped['terms']]

    return grouped

def split_string(dataframe, column):
    '''
    Break every string in the lists of the given column into whitespace-separated words and store the
    flattened result in the new column 'words'; the dataframe is modified in place and returned
    '''
    def to_words(lines):
        collected = []
        for line in lines:
            collected.extend(line.split())
        return collected

    dataframe['words'] = dataframe[column].apply(to_words)
    return dataframe

def remove_stopwords(dataframe, column):
    '''
    Drop every word contained in the module-level `stopwords` set from the word lists of the given column;
    the dataframe is modified in place and returned
    '''
    def without_stopwords(words):
        return list(filter(lambda word: word not in stopwords, words))

    dataframe[column] = dataframe[column].apply(without_stopwords)
    return dataframe

def wordcounter(wordlist, n):
    '''
    Count how often each term/word occurs in a list of strings and return the n most frequent ones as a list
    of (item, count) tuples in descending order of count; items with equal counts keep first-seen order
    Idea: Use output as illustrative example of how field progresses (validate with field-specific paper on paradigm shift)
    '''
    tally = {}
    for item in wordlist:
        tally[item] = tally.get(item, 0) + 1

    # stable sort: highest count first, ties stay in insertion order
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    return ranked[:n]

def wordcounter_column(dataframe, column, n):
    '''
    Apply wordcounter() function to the entire column of a dataframe, returns a new column with top n items per row

    dataframe: dataframe holding `column`; it is modified in place and returned
    column:    name of a column whose cells are lists of strings (e.g. 'terms' or 'words')
    n:         number of most frequent items to keep per row

    The new column is named 'top <n> <column>'; for n == 1 the last character of the column name is dropped
    (e.g. 'top 1 term'). Each cell is a one-element list wrapping the list of (item, count) tuples returned by
    wordcounter(); the nested list can be flattened with dataframe[new_column].apply(np.ravel)
    '''
    # define the new column name
    if n != 1:
        new_column = 'top ' + str(n) + ' ' + column
    else:
        new_column = 'top ' + str(n) + ' ' + column[:-1]

    # Build the whole column in one pass. Series.apply keeps the frame's own index, so this also works when
    # the index is not 0..n-1 (index labels are no longer used as row positions), and no float NaN column
    # has to be created first only to be overwritten with lists.
    dataframe[new_column] = dataframe[column].apply(lambda items: [wordcounter(items, n)])

    return dataframe

def wordcounter_abs_and_perc(dataframe, column, n, percentage):
    '''
    UPDATED VERSION OF WORDCOUNTER_COLUMN

    Apply wordcounter() function to the entire column of a dataframe, returns a new column with either
    top n items per row or top n percent of unique items per row

    dataframe:  dataframe holding `column`; for percentage == 'YES' it must also hold 'NoUT' (column == 'terms')
                or 'NoUW' (column == 'words'). The dataframe is modified in place and returned
    column:     name of a column whose cells are lists of strings ('words' or 'terms')
    n:          number of items (percentage == 'NO') or percentage of unique items (percentage == 'YES')
    percentage: 'YES' to read n as a percentage, 'NO' to read n as an absolute number

    New columns:
        'NO':  't<n> <column>' (for n == 1 the last character of the column name is dropped)
        'YES': 'NoU<capitalised first letter of column> (t<n>%)' holding the rounded number of items to keep,
               and 't<n>% of <column>' holding the items; rows where that number is below 1 get NaN

    Each filled cell is a one-element list wrapping the list of (item, count) tuples returned by wordcounter()

    Raises ValueError if percentage is neither 'YES' nor 'NO', or if percentage == 'YES' and column is neither
    'terms' nor 'words'
    '''
    if percentage not in ('YES', 'NO'):
        raise ValueError("percentage must be 'YES' or 'NO', got " + repr(percentage))

    if percentage == 'YES':
        # column that holds the number of unique items per row
        if column == 'terms':
            unique_count_column = 'NoUT'
        elif column == 'words':
            unique_count_column = 'NoUW'
        else:
            raise ValueError("column must be 'terms' or 'words' when percentage is 'YES', got " + repr(column))

        new_counter = 'NoU' + str(column[0]).capitalize() + ' (t' + str(n) + '%)'
        new_column = 't' + str(n) + '% of ' + column

        # number of unique words/terms that make up the given percentage, as an integer
        dataframe[new_counter] = dataframe[unique_count_column].multiply((n/100)).round().astype(np.int64)

        # account for edge case of the rounded number being 0: such rows stay NaN
        results = [[wordcounter(items, limit)] if limit >= 1 else np.nan
                   for items, limit in zip(dataframe[column], dataframe[new_counter])]
    else:
        if n != 1:
            new_column = 't' + str(n) + ' ' + column
        else:
            new_column = 't' + str(n) + ' ' + column[:-1]

        results = [[wordcounter(items, n)] for items in dataframe[column]]

    # Assign the whole column at once on the frame's own index. This replaces the iterrows() + iloc loop,
    # which used index labels as row positions (wrong for any index other than 0..n-1) and wrote lists into
    # a float NaN column.
    dataframe[new_column] = pd.Series(results, index=dataframe.index, dtype=object)

    return dataframe

In [13]:
# generate sample data for visualization
# a: one row per publication year with the concatenated, lowercased terms and the NoD/NoT/NoUT/NoW/NoUW counts
a = counts_per_year(lowercase(add_year(reformat(next(iter(geophysics_corpus_dict.items()))), geophysics_paper_id_year_df),
                              'terms')) 
# b: adds the 'words' column (terms split into single words, stopwords removed)
# NOTE: split_string, remove_stopwords and wordcounter_abs_and_perc modify the frame they receive and return
# it, so a, b and c all refer to the same DataFrame object
b = remove_stopwords(split_string(a, 'terms'), 'words')

# top 100 unique words and terms, top 25 % of unique words and terms
c = wordcounter_abs_and_perc(b, 'words', 100, 'NO')
c = wordcounter_abs_and_perc(c, 'terms', 100, 'NO')
c = wordcounter_abs_and_perc(c, 'words', 25, 'YES')
c = wordcounter_abs_and_perc(c, 'terms', 25, 'YES')

# flatten list, calculate len, and divide by 2 since list contains words and their count (should max at 100)
c.loc[:,'t100 words count'] = c['t100 words'].apply(np.ravel).apply(len).div(2).astype(np.int64)
c.loc[:,'t100 terms count'] = c['t100 terms'].apply(np.ravel).apply(len).div(2).astype(np.int64)

# select the numeric count columns only (the list-valued columns are left out)
d = c[['publication_year', 'NoD', 'NoT', 'NoUT', 'NoW', 'NoUW', 't100 words count', 't100 terms count', 'NoUW (t25%)',
      'NoUT (t25%)']]

# reformat entire dataframe to integer datatype
d = d.astype(int)

# use the publication year as index so the frame can be aligned on years
d = d.set_index('publication_year')

# create complete index without missing years (every year from the first to the last publication year)
new_index = list(range(int(min(d.index)), int(max(d.index)) + 1))

# create empty (all-NaN) dataframe with complete index
e = pd.DataFrame(np.nan, index = new_index, columns = d.columns)

e.index.name = 'publication_year'

# fill the NaN frame with the values from d; years that do not occur in d keep NaN
# NOTE: this rebinds `f`, the name also used for the BZ2File handle in the data-loading cell
f = e.combine_first(d)
f.reset_index(inplace=True)

### Function Test
# re-runs the same pipeline on corpus_dict / paper_id_year_df and displays the top 2 terms per year
x = counts_per_year(lowercase(add_year(reformat(next(iter(corpus_dict.items()))), paper_id_year_df),'terms')) 
y = remove_stopwords(split_string(x, 'terms'), 'words')

wordcounter_column(y, 'terms', 2)

# TODO: then write to csv

Unnamed: 0,publication_year,terms,NoD,NoT,NoUT,NoW,NoUW,words,top 2 terms
0,1832,"[mean motion, mean motions, motions of the pla...",1,10,10,40,16,"[mean, motion, mean, motions, motions, planets...","[[(mean motion, 1), (mean motions, 1)]]"
1,1869,"[secular change, change between the date, meas...",1,10,10,37,18,"[secular, change, change, date, measure, table...","[[(secular change, 1), (change between the dat..."
2,1879,"[external disturbing, bodily tides, considerat...",1,10,10,28,13,"[external, disturbing, bodily, tides, consider...","[[(external disturbing, 1), (bodily tides, 1)]]"
3,1884,"[high pressure, high pressure steam, pressure ...",1,10,10,28,11,"[high, pressure, high, pressure, steam, pressu...","[[(high pressure, 1), (high pressure steam, 1)]]"
4,1902,"[diurnal inequality, times of occurrence, note...",1,10,10,32,16,"[diurnal, inequality, times, occurrence, note,...","[[(diurnal inequality, 1), (times of occurrenc..."
...,...,...,...,...,...,...,...,...,...
76,2018,"[unit boundaries, geostatistical integration, ...",180,1786,1708,4802,1094,"[unit, boundaries, geostatistical, integration...","[[(magnetic field, 10), (geomagnetic field, 6)]]"
77,2019,"[lunar craters, terrestrial craters, craters w...",294,2756,2594,7425,1404,"[lunar, craters, terrestrial, craters, craters...","[[(upper mantle, 9), (magnetic field, 7)]]"
78,2020,"[defective zones, observed possible, possible ...",563,5276,4769,14086,2049,"[defective, zones, observed, possible, possibl...","[[(magnetic field, 24), (geomagnetic field, 16)]]"
79,2021,"[ionospheric propagation, receiver amplitude, ...",465,4327,3927,11633,1832,"[ionospheric, propagation, receiver, amplitude...","[[(magnetic field, 17), (electrical resistivit..."
