# Silvan's Code from preprocessing.ipynb

In [3]:
# imports
import bz2 
import pickle
import _pickle as cPickle
import os
import pandas as pd
from itertools import chain
import numpy as np
from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
import numpy as np

In [4]:
### Read in Files

# List every data packet stored in the Data directory that sits directly
# under the current working directory.
path = os.getcwd()
folder = '/Data'
files = os.listdir(f"{path}{folder}")

# Bare expression so the notebook renders the listing as the cell output.
files

['OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C184779094.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C97355855.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C12554922.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C111368507.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C105795698.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C153294291.pbz2',
 'OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_C8058405.pbz2']

In [5]:
# load in cPickle file for Geophysics (OpenAlex ID C8058405)
discipline = 'C8058405'
# I had to change the \ to a / because of Linux file paths
Data_Packet = 'Data/OUTPUT_Python_OpenAlex_Citation_and_Abstract_Data_' + discipline + '.pbz2'

# NOTE(security): unpickling can execute arbitrary code, so only open packets
# that come from a trusted source.
# Three objects are read from the packet one after another; the order of the
# load calls determines which object ends up in which variable. The `with`
# block closes the compressed file as soon as the last object has been read.
with bz2.BZ2File(Data_Packet, 'rb') as f:
    paper_id_year_df = cPickle.load(f)
    corpus_dict = cPickle.load(f)
    citation_df = cPickle.load(f)

# Object 1: dataframe with all the paper IDs and the year they were published: needed for the corpus_dict
geophysics_paper_id_year_df = paper_id_year_df
display(geophysics_paper_id_year_df.head())

# Object 2: dictionary where the keys are the paper IDs and the values are a list containing the extracted terms
# Structure: corpus_dict[Discipline_ID][paper_id] = [term1, term2, term3,...]: incl. EN and non-EN terms
geophysics_corpus_dict = corpus_dict

# sample call for single work (look the discipline up through `discipline`
# so that switching to another packet only requires editing one line)
display(geophysics_corpus_dict.get(discipline).get('https://openalex.org/W2765252368'))

# full call for all works
#display(next(iter(geophysics_corpus_dict.items())))

# Object 3: dataframe that's an edgelist between receiver RORs and sender RORs per year (= research organization registry)
display(citation_df.head())

Unnamed: 0,work_id,publication_year,Discipline
0,https://openalex.org/W2765252368,2017,C8058405
1,https://openalex.org/W2135405592,2009,C8058405
2,https://openalex.org/W2116007522,1971,C8058405
3,https://openalex.org/W2908600692,2019,C8058405
4,https://openalex.org/W3165125549,2021,C8058405


['substorm expansion',
 'wave frequencies',
 'small substorm',
 'expansion phase',
 'small substorm expansion',
 'substorm expansion phase',
 'substorm onset',
 'characteristics of the onset',
 'physics of substorm',
 'frequencies concurrent']

Unnamed: 0,Sender_ROR,Receiver_ROR,Year,Citations,Discipline
0,https://ror.org/00hj8s172,https://ror.org/00hj8s172,1966,1.0,C8058405
1,https://ror.org/042nb2s44,https://ror.org/00hj8s172,1966,1.0,C8058405
2,https://ror.org/016st3p78,https://ror.org/02acart68,1967,0.090909,C8058405
3,https://ror.org/027m9bs27,https://ror.org/02acart68,1967,0.25,C8058405
4,https://ror.org/02acart68,https://ror.org/02acart68,1967,0.5,C8058405


# Landauer and Dumais Replication

In [6]:
# Constants
# Publication year used to select the papers that go into the term/document matrix.
YEAR = 2021
# Number of dimensions requested from the dimensionality-reduction step.
EMBEDDING_DIMS = 10
# Column names of the paper-id/year dataframe: publication year and paper ID.
YEAR_COL = "publication_year"
ID_COL = "work_id"

In [75]:
# This function processes training data, establishing number IDs for each vocabulary word,
# converting word sequence into ID sequence (input_as_ids), and providing dict
# to map from word to its ID (word2id), and list to map from ID back to word (id2word)
def process_training_data(tokens):
    """Build a vocabulary for ``tokens`` and encode the sequence as integer IDs.

    Taken from pset 2.

    tokens: Iterable
        sequence of hashable tokens; it is consumed exactly once, so one-shot
        iterables such as generators are supported

    Returns a tuple ``(input_as_ids, word2id, id2word)``:
        input_as_ids: list of int, the ID of every token in input order
        word2id: dict mapping each distinct token to its ID
        id2word: list mapping each ID back to its token

    IDs are assigned in order of first appearance, starting at 0.
    """
    word2id = {}
    id2word = []
    input_as_ids = []
    # Vocabulary construction and encoding share a single pass: a second loop
    # over `tokens` would see nothing if `tokens` is an iterator/generator.
    for word in tokens:
        word_id = word2id.get(word)
        if word_id is None:
            word_id = len(id2word)
            word2id[word] = word_id
            id2word.append(word)
        input_as_ids.append(word_id)
    return input_as_ids, word2id, id2word


def generate_mat(tokens, df, paper_terms=None):
    """
    Build a binary term/document occurrence matrix.

    tokens: Iterable
        set of individual tokens for corpus; each distinct token gets one row,
        in order of first appearance
    df: pd.DataFrame
        any subset of the paper_id_year_df; its "work_id" column defines the
        columns of the matrix
    paper_terms: dict, optional
        mapping ``work_id -> list of terms``. Defaults to the notebook-level
        ``geophysics_corpus_dict`` so existing two-argument calls keep working.

    Returns an ``np.ndarray`` of shape ``(num_unique_tokens, num_docs)`` where
    entry ``[i, j]`` is 1 if token ``i`` is one of the terms of paper ``j`` and
    0 otherwise. Papers without an entry in ``paper_terms`` yield an all-zero
    column.
    """
    if paper_terms is None:
        # Same source the function has always read from; resolved at call time.
        paper_terms = geophysics_corpus_dict

    # Row index of every distinct token, numbered in first-seen order.
    word2id = {}
    for word in tokens:
        word2id.setdefault(word, len(word2id))

    work_ids = df["work_id"]
    mat = np.zeros((len(word2id), len(work_ids)))

    # Walk each paper's own term list once instead of testing every vocabulary
    # token against every paper; terms outside the vocabulary are skipped.
    for doc_idx, work_id in enumerate(work_ids):
        for term in paper_terms.get(work_id) or ():
            token_idx = word2id.get(term)
            if token_idx is not None:
                mat[token_idx, doc_idx] = 1

    return mat

def svd_dim_reduction(mat):
    """
    Placeholder for the Landauer and Dumais style reduction (still TODO).

    The singular value decomposition of ``mat`` is computed, but none of its
    factors are used yet, so the function always returns ``None``.
    """
    _left_vectors, _singular_values, _right_vectors_h = np.linalg.svd(mat)
    return None

def pca_dim_reduction(mat):
    """
    Principal component analysis of ``mat``, treating each row as a variable
    and each column as an observation.

    adapted from https://machinelearningmastery.com/calculate-principal-component-analysis-scratch-python/

    mat: np.Array
        2-D matrix of shape (num_variables, num_observations)

    Returns a tuple ``(projected, values)``:
        projected: array of shape (num_observations, num_variables); column k
            holds the observations projected onto the k-th principal axis
        values: real eigenvalues of the covariance matrix, sorted from largest
            to smallest; ``values[k]`` belongs to column k of ``projected``
    """
    # want to reduce on the rows, so take the transpose: one row per observation
    observations = np.asarray(mat, dtype=float).T
    # center every variable (column) on its mean
    centered = observations - observations.mean(axis=0)
    # covariance between variables; np.cov collapses the single-variable case
    # to a 0-d array, so restore the matrix shape before decomposing
    covariance = np.atleast_2d(np.cov(centered.T))
    # The covariance matrix is symmetric, so use eigh: unlike eig it always
    # returns real eigenvalues/eigenvectors, even with repeated eigenvalues.
    values, vectors = np.linalg.eigh(covariance)
    # eigh reports eigenvalues in ascending order; put the strongest first
    order = np.argsort(values)[::-1]
    values = values[order]
    vectors = vectors[:, order]
    # project the centered data onto the principal axes
    return centered.dot(vectors), values

def reduce_to_n_dimensions(mat, n):
    """
    Keep the ``n`` most significant principal components of ``mat``.

    mat: np.Array
        matrix being reduced
    n: int
        number of resulting dimensions; values larger than the number of
        available components are clamped to that number

    Returns an array with one row per kept component (at most ``n`` rows) and
    one column per column of ``mat``. Kept rows appear in the same relative
    order in which ``pca_dim_reduction`` produced them.

    Raises ValueError if ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative, got {}".format(n))

    pca_mat, eigenvalues = pca_dim_reduction(mat)
    # one row per principal component
    components = pca_mat.T
    # significance of a component is the magnitude of its eigenvalue
    significance = np.abs(eigenvalues)
    keep = min(n, len(significance))
    # Rank components by decreasing magnitude and take the first `keep`
    # indices. Selecting by rank (rather than by "strictly greater than the
    # (n+1)-th value") returns exactly `keep` rows even when magnitudes tie,
    # and cannot index past the end when n >= number of components.
    ranked = np.argsort(-significance, kind="stable")
    selected = np.sort(ranked[:keep])
    return components[selected]



In [76]:
# Papers published in YEAR. Both sides are compared as strings so the filter
# works whether the year column holds strings or integers.
df_year = geophysics_paper_id_year_df[geophysics_paper_id_year_df[YEAR_COL].astype(str) == str(YEAR)]
# Derive the per-paper term mapping from the untouched `corpus_dict` rather
# than from `geophysics_corpus_dict` itself, so re-running this cell gives the
# same result instead of looking the discipline up inside an already-unwrapped dict.
geophysics_corpus_dict = corpus_dict.get(discipline)

# Vocabulary: every distinct term used by the selected papers.
tokens = set()
for work_id in df_year[ID_COL]:
    # in-place update avoids rebuilding the whole set for every paper
    tokens.update(geophysics_corpus_dict.get(work_id))

term_doc_mat = generate_mat(tokens, df_year)
mat = reduce_to_n_dimensions(term_doc_mat, EMBEDDING_DIMS)
# each column of `mat` is the embedding of one paper in df_year
