In [57]:
import pandas as pd
from nltk.corpus import stopwords
from transformers import BertTokenizer

# --- Notebook-wide configuration -------------------------------------------
# Length limit constant; presumably the maximum token sequence length for the
# BERT model loaded below -- confirm against where MAX_LEN is consumed.
MAX_LEN = 512

# English stop-word list from NLTK, held as a set for fast membership tests.
STOP_WORDS = set(stopwords.words('english'))

# WordPiece tokenizer for the pretrained, lower-casing BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [58]:
# Read the three article CSV files; the first column of each file is used
# as the row index.
article_paths = [
    '../data/articles1.csv',
    '../data/articles2.csv',
    '../data/articles3.csv',
]
df1, df2, df3 = [pd.read_csv(path, index_col=0) for path in article_paths]

# Stack the three frames row-wise into a single DataFrame.
df = pd.concat([df1, df2, df3])

In [59]:
# Columns removed from the frame, and columns that must be non-null for a
# row to be kept.
unused_columns = ['title', 'date', 'year', 'month', 'url']
required_columns = ['publication', 'author']

# Drop the unused columns, then discard every row where publication or
# author is NaN.
df = (
    df
    .drop(columns=unused_columns)
    .dropna(subset=required_columns, how='any')
)

In [60]:
# Only keep articles from the top 20 authors, ranked by number of articles
# (value_counts sorts by descending frequency, head(20) takes the first 20).
top_authors = df['author'].value_counts().head(20).index

# Renumber the surviving rows 0..n-1. reset_index(drop=True) discards the old
# index directly; the previous reset_index().drop('index', axis=1) form first
# materialised it as a column and then dropped it by the literal name 'index',
# which raises KeyError whenever the index carries a name.
df = df[df['author'].isin(top_authors)].reset_index(drop=True)

In [61]:
# Store a lower-cased copy of the article text in a new 'clean_content'
# column. The vectorised .str accessor is used instead of
# apply(lambda x: x.lower()) so that a missing content value (NaN) is
# propagated as NaN rather than raising AttributeError on a float.
df['clean_content'] = df['content'].str.lower()

In [62]:
# Persist the filtered, lower-cased articles as a pickle file.
cleaned_articles_path = '../data/cleaned_articles.pkl'
df.to_pickle(cleaned_articles_path)