In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
from nltk.tokenize import word_tokenize
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import torch

In [2]:
# Read the pickled articles DataFrame that the rest of the notebook works on.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
articles_pickle_path = '../data/cleaned_articles.pkl'
df = pandas.read_pickle(articles_pickle_path)

In [12]:
# Prepare the data for Doc2Vec

# Hyper-parameters gathered in one place so they are easy to find and tune.
DOC2VEC_VECTOR_SIZE = 768  # presumably chosen to match BERT-base's 768-dim vectors -- TODO confirm
DOC2VEC_WINDOW = 5
DOC2VEC_MIN_COUNT = 1      # min_count=1 keeps tokens that occur only once
DOC2VEC_WORKERS = 12
DOC2VEC_SEED = 42          # fixed seed so re-runs start from the same initial weights

# Tokenize the text
df['tokens'] = df['clean_content'].apply(word_tokenize)

# Tag every document with its position in the frame (0, 1, 2, ...)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['tokens'])]

# Train a Doc2Vec model
# NOTE: the seed removes the randomness of the initialisation. gensim only
# guarantees a fully deterministic run with workers=1 (and a fixed
# PYTHONHASHSEED); multi-threaded training still depends on thread scheduling.
model = Doc2Vec(
    documents,
    vector_size=DOC2VEC_VECTOR_SIZE,
    window=DOC2VEC_WINDOW,
    min_count=DOC2VEC_MIN_COUNT,
    workers=DOC2VEC_WORKERS,
    seed=DOC2VEC_SEED,
)

# Infer an embedding for a single DataFrame row
def vectorize_doc(row):
    """Return the vector inferred by the notebook-level ``model`` for one row.

    Args:
        row: a row object exposing ``row['tokens']`` (the token list) and
            ``row.name`` (the row label), e.g. as passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        Whatever ``model.infer_vector`` returns for the row's tokens.
    """
    # Rewrite the same console line with the current row label as a
    # lightweight progress indicator.
    print(f'{row.name}\r', end='')
    tokens = row['tokens']
    vector = model.infer_vector(tokens)
    return vector

# Run vectorize_doc over every row (axis=1) and store the results in a new column
doc2vec_vectors = df.apply(vectorize_doc, axis=1)
df['doc2vec'] = doc2vec_vectors

14482

In [13]:
# Encode the authors with label encoding
author_values = df['author'].values.ravel()
author_encoder = LabelEncoder()
authors = author_encoder.fit_transform(author_values)

# Pickle the fitted encoder next to the other artefacts
encoder_path = '../pickles/author_encoder.pkl'
with open(encoder_path, 'wb') as f:
    pickle.dump(author_encoder, f)

In [14]:
# Save all the features to binarized numpy files

# Build the matrix *before* opening the output file: opening with 'wb'
# truncates it, so a failure inside np.stack would otherwise leave an empty,
# unreadable doc2vec.npy behind.
array = np.stack(df['doc2vec'].values)
with open('../data/doc2vec.npy', 'wb') as f:
    np.save(f, array)

np.save('../data/authors_encoded.npy', authors)
# NOTE(review): if df['author'] has object dtype, np.save stores it via pickle
# and np.load will need allow_pickle=True to read it back -- confirm with the
# code that consumes authors.npy.
np.save('../data/authors.npy', df['author'].to_numpy())

In [3]:
torch.cuda.empty_cache()

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained BERT tokenizer and model
# NOTE: this rebinds the notebook-level name `model`, which the Doc2Vec cell
# also assigns and which `vectorize_doc` looks up when it is called. Re-run the
# Doc2Vec cell before calling `vectorize_doc` again after this cell has run.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# The texts to embed come from the 'clean_content' column of df
content_list = df['clean_content'].tolist()

# Texts are tokenized and encoded one batch at a time; sequences longer than
# max_length tokens are truncated and shorter ones are padded within the batch
batch_size = 128
max_length = 512
embeddings_list = []

# Process data in batches
for i in tqdm(range(0, len(content_list), batch_size), desc="Processing batches"):
    batch = content_list[i:i+batch_size]
    tokens_batch = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)

    # Forward pass with gradient tracking disabled
    with torch.no_grad():
        outputs = model(**tokens_batch)

    # Keep the hidden state at position 0 of every sequence (the [CLS] token)
    # and move it to host memory as a numpy array. A separate name is used so
    # that `cls_embeddings` only ever refers to the final, full matrix.
    batch_cls = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    embeddings_list.append(batch_cls)

# Concatenate embeddings from all batches
if embeddings_list:
    cls_embeddings = np.concatenate(embeddings_list, axis=0)
else:
    # np.concatenate raises ValueError on an empty list; with no input texts
    # produce an empty (0, hidden_size) matrix instead of crashing.
    cls_embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)

Processing batches: 100%|██████████| 114/114 [08:47<00:00,  4.63s/it]


In [5]:
# Write the concatenated [CLS] embeddings to disk in NumPy's binary .npy format
bert_output_path = '../data/bert.npy'
np.save(bert_output_path, cls_embeddings)