In [103]:
import pandas
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy as sp
import pickle

In [5]:
# Load the data pickle: read the pickled object stored in 'cleaned_articles.pkl' into `df`.
# Later cells read its 'tokens', 'publication' and 'author' columns.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you produced or trust.
df = pandas.read_pickle('cleaned_articles.pkl')

In [6]:
# Prepare the data for Doc2Vec: wrap each entry of df['tokens'] in a
# TaggedDocument whose single tag is the entry's positional index.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['tokens'])]

# Train a Doc2Vec model
# NOTE(review): no seed is passed and several workers are used, so repeated
# runs are presumably not bit-for-bit reproducible -- confirm whether that matters.
model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=24)

# Define a function to vectorize a document
def vectorize_doc(row):
    """Infer a Doc2Vec vector for a single row.

    Args:
        row: Row-like object (e.g. a DataFrame row) exposing a 'tokens' entry.

    Returns:
        Whatever ``model.infer_vector`` returns for ``row['tokens']``.
    """
    # Progress reporting is left to the caller (tqdm below), so this function
    # has no stdout side effect and does not need `row.name`.
    return model.infer_vector(row['tokens'])

# Apply the function to each row in the DataFrame.
# tqdm.pandas() registers `progress_apply`, which behaves like `apply` but
# renders a progress bar instead of hand-printed row counters.
tqdm.pandas(desc='doc2vec')
df['doc2vec'] = df.progress_apply(vectorize_doc, axis=1)

126693

In [104]:
# One-hot encode the 'publication' and 'author' columns and pickle each fitted encoder.

def _fit_and_pickle_onehot(column, encoder_path):
    """Fit a fresh OneHotEncoder on one column and pickle the fitted encoder.

    Args:
        column: pandas Series holding the categorical values.
        encoder_path: File path the fitted encoder is pickled to.

    Returns:
        Tuple ``(encoder, encoded)``: the fitted encoder and the result of
        ``fit_transform`` on the column reshaped to a single feature column.
    """
    encoder = OneHotEncoder()
    # The encoder expects 2-D input, hence the reshape to (n_rows, 1).
    encoded = encoder.fit_transform(column.values.reshape(-1, 1))
    with open(encoder_path, 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file)
    return encoder, encoded


publication_encoder, publications = _fit_and_pickle_onehot(
    df['publication'], 'publication_encoder.pkl'
)

author_encoder, authors = _fit_and_pickle_onehot(
    df['author'], 'author_encoder.pkl'
)

In [120]:
# Persist every feature set to disk in NumPy / SciPy binary formats.

# Stack the per-row doc2vec vectors into one 2-D array and write it out.
array = np.stack(df['doc2vec'].to_numpy())
with open('doc2vec.npy', 'wb') as f:
    np.save(f, array)

# The one-hot publication matrix is stored in SciPy's .npz format.
sp.sparse.save_npz('publications_encoded.npz', publications)

# Raw label columns are dumped unchanged, publications first, then authors.
for column, path in (('publication', 'publications.npy'), ('author', 'authors.npy')):
    np.save(path, df[column].to_numpy())