In [3]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string

from transformers import pipeline
from bertopic import BERTopic
from umap import UMAP

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [4]:
cp_df = pd.read_csv('cp_articles 2.csv')
cp_df = cp_df.dropna(subset='content')
cp_df = cp_df.drop_duplicates(subset='content')
cp_df.reset_index(inplace=True)

In [5]:
def preprocess_text(text):
    """
    A function to preprocess text by performing the following steps:
    1. Convert to lowercase
    2. Remove punctuation and whitespace
    3. Remove stopwords
    4. Lemmatize words
    """
    nlp = spacy.load('en_core_web_sm')
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and whitespace
    doc = nlp(text)
    tokens = [token for token in doc if not token.is_punct and not token.is_space]

    # Remove stopwords
    tokens = [token for token in tokens if not token.is_stop and str(token) not in ['company', 'year', 'market']]

    # Lemmatize words
    tokens = [token.lemma_ for token in tokens]

    # Join tokens back into a string
    text = ' '.join(tokens)

    return text

In [15]:
def bert_model(series, return_df=False, return_obj=False):

    umap_obj = UMAP()
    bert_model = BERTopic(umap_model=umap_obj)
    topics, probability =  bert_model.fit_transform(series)
    docTopics_df = bert_model.get_document_info(series)
    if return_df:
        return docTopics_df
    if return_obj:
        return bert_model

In [16]:
content = cp_df[['content']]
content = content.reset_index()
content['preproces_content'] = content['content'].apply(preprocess_text)
content['preproces_content'] = content['preproces_content'].astype('str')



In [23]:
content.to_csv('content.csv')

In [17]:

obj_model = bert_model(content['preproces_content'], return_obj=True)

In [31]:
doc_df = bert_model(content['preproces_content'], return_df=True)

In [32]:
doc_df['Name'].unique()

array(['-1_company_us_investment_investor',
       '0_company_dividend_index_growth', '6_bond_fund_portfolio_high',
       '3_china_company_trade_global', '4_economy_us_market_volatility',
       '1_plan_retirement_participant_sponsor',
       '2_rate_inflation_fed_bond', '7_bond_tax_municipal_muni',
       '5_election_house_senate_president'], dtype=object)