In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import re
import string

from transformers import pipeline
from bertopic import BERTopic
from umap import UMAP

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [73]:
cp_df = pd.read_csv('cp_articles 2.csv')
cp_df = cp_df.dropna(subset='content')
cp_df = cp_df.drop_duplicates(subset='content')
cp_df.reset_index(inplace=True)

In [74]:
cp_df.sample(3)

Unnamed: 0.1,index,Unnamed: 0,section,link,title,topic,date,content
241,309,309,markets_economy,https://www.capitalgroup.com/advisor/insights/...,Why is the dollar weaker? Look towards Europe,Currencies,"September 5, 2017",The strength of the U.S. dollar in recent year...
37,57,57,equity,https://www.capitalgroup.com/advisor/insights/...,China: 3 views on what’s next for its economy ...,China,"July 22, 2020",KEY TAKEAWAYS \nChina’s economic rebound from ...
145,201,201,markets_economy,https://www.capitalgroup.com/advisor/insights/...,The economy in 2023: Where we differ from mark...,Markets & Economy,"December 28, 2022",The wisdom of crowds can be powerful. But in t...


# Preprocessing

In [75]:
from spacy.lang.en.stop_words import STOP_WORDS

In [76]:
dir(spacy)

['Any',
 'Config',
 'Dict',
 'Errors',
 'Iterable',
 'Language',
 'Path',
 'Union',
 'Vocab',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'about',
 'attrs',
 'blank',
 'cli',
 'compat',
 'displacy',
 'errors',
 'explain',
 'git_info',
 'glossary',
 'info',
 'kb',
 'lang',
 'language',
 'lexeme',
 'load',
 'logger',
 'lookups',
 'matcher',
 'ml',
 'morphology',
 'parts_of_speech',
 'pipe_analysis',
 'pipeline',
 'prefer_gpu',
 'registry',
 'require_cpu',
 'require_gpu',
 'schemas',
 'scorer',
 'strings',
 'symbols',
 'sys',
 'tokenizer',
 'tokens',
 'training',
 'ty',
 'util',
 'vectors',
 'vocab']

In [77]:
content = cp_df[['content']].fillna("")

In [117]:

def lemmatization(text, allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    
    return ' '.join(tokens)

# By Chat GPT -----------------------------------------------------------------------------------------------------------
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """
    A function to preprocess text by performing the following steps:
    1. Convert to lowercase
    2. Remove punctuation and whitespace
    3. Remove stopwords
    4. Lemmatize words
    """
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and whitespace
    doc = nlp(text)
    tokens = [token for token in doc if not token.is_punct and not token.is_space]
    
    # Remove stopwords
    tokens = [token for token in tokens if not token.is_stop and str(token) not in ['company', 'year', 'market']]
    
    # Lemmatize words
    tokens = [token.lemma_ for token in tokens]
    
    # Join tokens back into a string
    text = ' '.join(tokens)
    
    return text


# By Sina -------------------------------------------------------------------------------------------------------------

def text_preproces(text, add_to_stopwords=None, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    # Remove Punctuation
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)
    
    # Converting to Lowercase
    text = text.lower()
    
    # Word Tokenize Text
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    
   
    # lemmatization
    tokens = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    
     # Remove Stopwords
    if add_to_stopwords is None:
        add_to_stopwords = []
    
    stopwords_list = list(stopwords.words('english')) + add_to_stopwords
    tokens = [token for token in tokens if token not in stopwords_list]
    
    
    # Converting to Text
    text = " ".join(tokens)
    
    return text

In [118]:
content['preproces_content'] = content['content'].apply(preprocess_text)

In [119]:
content

Unnamed: 0,index,content,preproces_content,tfidf
0,0,As a portfolio manager with New Perspective Fu...,portfolio manager new perspective fund ® inves...,"[[russell, 0.077614180198083], [lse, 0.2800603..."
1,1,"Growth stocks took a beating in 2022, no quest...",growth stock take beating 2022 question crucia...,"[[llc, 0.04162219859414107], [indice, 0.042232..."
2,2,Is there more fuel in the tank for energy stoc...,fuel tank energy stock question investor mind ...,"[[cap, 0.04838476158881427], [refining, 0.0519..."
3,3,The death of the 60/40 portfolio has been grea...,death 60/40 portfolio greatly exaggerated ask ...,"[[thoughtful, 0.04977678249803588], [payer, 0...."
4,4,As nearly three years of lockdowns come to an ...,nearly year lockdown come end china 2023 turni...,"[[french, 0.0448698525369694], [reset, 0.06017..."
...,...,...,...,...
298,298,"Move over, Florida. Mountaintop hideaways, tro...",florida mountaintop hideaway tropical island s...,"[[headache, 0.05014546969715025], [preparation..."
299,299,"Thirty-five years of loyal service, a gold wat...",thirty year loyal service gold watch champagne...,"[[businessperson, 0.05584040183927782], [hustl..."
300,300,KEY TAKEAWAYS \nA cash balance plan allows emp...,key takeaway cash balance plan allow employee ...,"[[unanticipated, 0.04135678349288946], [interi..."
301,301,KEY TAKEAWAYS \nPeople are carrying more debt ...,key takeaway people carry debt retirement youn...,"[[biola, 0.04714675060971049], [troy, 0.047146..."


In [120]:
umap_obj = UMAP()

In [121]:
bert_model = BERTopic(umap_model=umap_obj)

In [122]:
content = content.reset_index()
content['preproces_content'] = content['preproces_content'].astype('str')

In [123]:
topics, probability =  bert_model.fit_transform(content['preproces_content'])
docTopics_df = bert_model.get_document_info(content['preproces_content'])

In [124]:
docTopics_df['Name'].unique()

array(['1_china_company_us_global', '0_company_dividend_index_investment',
       '-1_us_investment_high_index', '3_rate_inflation_fed_bond',
       '4_bond_income_high_tax', '6_us_rate_economic_growth',
       '5_election_house_senate_president',
       '2_plan_retirement_participant_sponsor'], dtype=object)

# TF IDF

In [125]:
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(content['preproces_content'].tolist())
feature_names = vectorizer.get_feature_names_out()

In [126]:
tfidf_list = []
for doc in range(len(content['preproces_content'])):
    articletfidfs = []
    feat_index = matrix[doc, :].nonzero()[1]
    for val in feat_index:
        name = feature_names[val]

        tfidf = matrix[doc, val]
        articletfidfs.append([name, tfidf])
    tfidf_list.append(articletfidfs)

In [127]:
values = []

for a in tfidf_list:
    for b in a:
        values.append(b[1])
tfidfvalues = pd.Series(values).describe()

tfidf_list_2 = []
for doc in tfidf_list:
    newdoc = []
    for term in doc:
        if term[1] > tfidfvalues[6]:
            newdoc.append(term)
    tfidf_list_2.append(newdoc)

content['tfidf'] = tfidf_list_2


In [128]:
content['tfidf']

0      [[russell, 0.0787735051315795], [lse, 0.284243...
1      [[llc, 0.04224022513450568], [indice, 0.042859...
2      [[cap, 0.04856211460531024], [refining, 0.0521...
3      [[thoughtful, 0.05011207128740939], [payer, 0....
4      [[french, 0.045186831464992014], [reset, 0.060...
                             ...                        
298    [[headache, 0.05014745179070967], [preparation...
299    [[businessperson, 0.05587499607964495], [hustl...
300    [[unanticipated, 0.04136654023956189], [interi...
301    [[biola, 0.04714543515593733], [troy, 0.047145...
302    [[relabel, 0.062128951011570496], [clutter, 0....
Name: tfidf, Length: 303, dtype: object

In [129]:
# Let's create a score
# First the probability of topic matching article
articleprob = docTopics_df[['Topic', 'Probability']]

top_terms = bert_model.get_topics()
termweights = []

for row in tqdm(articleprob.to_dict('records')):
    topicnum = row['Topic']
    prob = row['Probability']
    article_terms = []
    for term in top_terms[topicnum]:
        w = term[0]
        term_prob = term[1]
        final_prob = term_prob * prob

        article_terms.append([w, final_prob])
    termweights.append(article_terms)

articleprob['termweights'] = termweights

100%|██████████| 303/303 [00:00<00:00, 180112.54it/s]


In [130]:
articleprob

Unnamed: 0,Topic,Probability,termweights
0,1,0.738699,"[[china, 0.045418494610023834], [company, 0.02..."
1,0,0.951031,"[[company, 0.03876206350123591], [dividend, 0...."
2,-1,0.000000,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i..."
3,-1,0.000000,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i..."
4,1,1.000000,"[[china, 0.06148443800857849], [company, 0.036..."
...,...,...,...
298,2,0.776396,"[[plan, 0.10066670667923533], [retirement, 0.0..."
299,-1,0.000000,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i..."
300,2,1.000000,"[[plan, 0.12965901998870935], [retirement, 0.0..."
301,-1,0.000000,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i..."


In [131]:
cp_df['bert_topic_name'] = docTopics_df['Name']
cp_df['bert_topic_number'] = docTopics_df['Topic']
cp_df['Top_n_words'] = docTopics_df['Top_n_words']
cp_df['termweights'] = articleprob['termweights']
cp_df['preproces_content'] = content['preproces_content']
cp_df['tfidf'] =  content['tfidf']

In [136]:
cp_df.to_csv('final.csv')

In [133]:
def integrated_topics_content(bert_df, topic_id):
    s1 = bert_df[bert_df['bert_topic_number'] == topic_id]
    topic = s1['bert_topic_number'].tolist()[0]
    content = ' '.join(s1['content'])
    topic_name = s1['bert_topic_name'].tolist()[0]
    top_n_words = s1['Top_n_words'].tolist()[0]
    num_article = s1.shape[0]
    dict_ = {
        "topic": topic,
        "num_article": num_article,
        "top_n_words": top_n_words,
        "topic_name": topic_name,
        "content": content,
    }
    return dict_

In [134]:
pd.DataFrame([integrated_topics_content(cp_df, i) for i in cp_df['bert_topic_number'].unique()])

Unnamed: 0,topic,num_article,top_n_words,topic_name,content
0,1,37,china - company - us - global - trade - market...,1_china_company_us_global,As a portfolio manager with New Perspective Fu...
1,0,73,company - dividend - index - investment - stoc...,0_company_dividend_index_investment,"Growth stocks took a beating in 2022, no quest..."
2,-1,75,us - investment - high - index - investor - ra...,-1_us_investment_high_index,Is there more fuel in the tank for energy stoc...
3,3,34,rate - inflation - fed - bond - interest - yie...,3_rate_inflation_fed_bond,Remember in 2011 when Netflix raised prices by...
4,4,24,bond - income - high - tax - yield - municipal...,4_bond_income_high_tax,KEY TAKEAWAYS \nCapital Group’s portfolio eval...
5,6,12,us - rate - economic - growth - bond - recessi...,6_us_rate_economic_growth,KEY TAKEAWAYS \nStocks have been hit with shar...
6,5,12,election - house - senate - president - party ...,5_election_house_senate_president,The buzz surrounding the U.S. midterm election...
7,2,36,plan - retirement - participant - sponsor - em...,2_plan_retirement_participant_sponsor,KEY TAKEAWAYS \nThe estimated $2 trillion CARE...


In [138]:
cp_df


Unnamed: 0.1,index,Unnamed: 0,section,link,title,topic,date,content,bert_topic_name,bert_topic_number,Top_n_words,termweights,preproces_content,tfidf
0,0,0,equity,https://www.capitalgroup.com/advisor/insights/...,Multinational companies can thrive in tough times,Global Equities,"March 2, 2023",As a portfolio manager with New Perspective Fu...,1_china_company_us_global,1,china - company - us - global - trade - market...,"[[china, 0.045418494610023834], [company, 0.02...",portfolio manager new perspective fund ® inves...,"[[russell, 0.0787735051315795], [lse, 0.284243..."
1,1,1,equity,https://www.capitalgroup.com/advisor/insights/...,3 views on the future of growth investing,U.S. Equities,"February 23, 2023","Growth stocks took a beating in 2022, no quest...",0_company_dividend_index_investment,0,company - dividend - index - investment - stoc...,"[[company, 0.03876206350123591], [dividend, 0....",growth stock take beating 2022 question crucia...,"[[llc, 0.04224022513450568], [indice, 0.042859..."
2,2,2,equity,https://www.capitalgroup.com/advisor/insights/...,5 trends driving energy markets in 2023,Energy,"February 15, 2023",Is there more fuel in the tank for energy stoc...,-1_us_investment_high_index,-1,us - investment - high - index - investor - ra...,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i...",fuel tank energy stock question investor mind ...,"[[cap, 0.04856211460531024], [refining, 0.0521..."
3,4,4,equity,https://www.capitalgroup.com/advisor/insights/...,3 reasons why 60/40 portfolios may make a come...,Asset Allocation,"February 2, 2023",The death of the 60/40 portfolio has been grea...,-1_us_investment_high_index,-1,us - investment - high - index - investor - ra...,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i...",death 60/40 portfolio greatly exaggerated ask ...,"[[thoughtful, 0.05011207128740939], [payer, 0...."
4,5,5,equity,https://www.capitalgroup.com/advisor/insights/...,China: What could drive its markets in 2023,China,"January 31, 2023",As nearly three years of lockdowns come to an ...,1_china_company_us_global,1,china - company - us - global - trade - market...,"[[china, 0.06148443800857849], [company, 0.036...",nearly year lockdown come end china 2023 turni...,"[[french, 0.045186831464992014], [reset, 0.060..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,382,382,retirement,https://www.capitalgroup.com/advisor/insights/...,Thinking of retiring abroad? Consider these 5 ...,Retirement Planning,"September 26, 2017","Move over, Florida. Mountaintop hideaways, tro...",2_plan_retirement_participant_sponsor,2,plan - retirement - participant - sponsor - em...,"[[plan, 0.10066670667923533], [retirement, 0.0...",florida mountaintop hideaway tropical island s...,"[[headache, 0.05014745179070967], [preparation..."
299,383,383,retirement,https://www.capitalgroup.com/advisor/insights/...,How millennials can plan for retirement in the...,Millennials,"September 5, 2017","Thirty-five years of loyal service, a gold wat...",-1_us_investment_high_index,-1,us - investment - high - index - investor - ra...,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i...",thirty year loyal service gold watch champagne...,"[[businessperson, 0.05587499607964495], [hustl..."
300,384,384,retirement,https://www.capitalgroup.com/advisor/insights/...,3 reasons employers should consider a cash bal...,Defined Benefit,"May 30, 2017",KEY TAKEAWAYS \nA cash balance plan allows emp...,2_plan_retirement_participant_sponsor,2,plan - retirement - participant - sponsor - em...,"[[plan, 0.12965901998870935], [retirement, 0.0...",key takeaway cash balance plan allow employee ...,"[[unanticipated, 0.04136654023956189], [interi..."
301,385,385,retirement,https://www.capitalgroup.com/advisor/insights/...,How does household debt affect retirement?,Retirement Planning,"May 11, 2017",KEY TAKEAWAYS \nPeople are carrying more debt ...,-1_us_investment_high_index,-1,us - investment - high - index - investor - ra...,"[[us, 0.0], [investment, 0.0], [high, 0.0], [i...",key takeaway people carry debt retirement youn...,"[[biola, 0.04714543515593733], [troy, 0.047145..."
