In [1]:
import requests
from bs4 import BeautifulSoup

from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import json
import re
from pprint import pprint
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

<img src='https://1000logos.net/wp-content/uploads/2021/05/Vanguard-logo.png' width=350>

In [7]:
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

In [8]:
class VanguardInsights:
    
    def __init__(self, all_insight_link):
        self.all_insight_link = all_insight_link
        service = Service(executable_path=ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service)
            
    @staticmethod
    def extract_content_with_id(link):
        response = requests.get(link)
        soup = BeautifulSoup(response.text)
        content = ' '.join([i.text for i in soup.find_all("div", {"id": "iw_placeholder1585759247342"})]).strip()
        return content
            
    def get_article_insight(self):
        
        self.driver.get(self.all_insight_link)
        sleep(10)

        all_pages = []

        for i in range(18):

            title_tags = self.driver.find_elements(By.XPATH,'//*[(@id = "insights-archive-section")]//h3')
            title_contents = self.driver.find_elements(By.XPATH, '//*[(@id = "insights-archive-section")]//*[contains(concat( " ", @class, " " ), concat( " ", "p2", " " ))]')
            date_perspectives = self.driver.find_elements(By.XPATH,'//*[contains(concat( " ", @class, " " ), concat( " ", "eyebrow", " " ))]')
            perspectives = self.driver.find_elements(By.XPATH, '//*[contains(concat( " ", @class, " " ), concat( " ", "type", " " ))]')
            # dates = browser.find_elements(By.CSS_SELECTOR, '.content-eyebrow .eyebrow')
            tags = self.driver.find_elements(By.CSS_SELECTOR, "[id='insights-archive'] [class='tags']")
            tags = [i.find_elements(By.CSS_SELECTOR, "[id='insights-archive'] [class='tags'] [class='pill tag']") for i in tags]
            links = self.driver.find_elements(By.CSS_SELECTOR, '.detail-link')



            title_tag = [i.text for i in title_tags] 
            title_content = [i.text for i in title_contents]
            date_perspective = [i.text for i in date_perspectives]
            date = [''.join(re.findall(r'[A-Z]+\s\d+,\s\d+', i)) for i in date_perspective[1:]]
            perspective = [i.text for i in perspectives]
            #date = [i.text for i in dates]
            tag = [[j.text for j in i]for i in tags]
            links_list = [link.get_attribute('href') for link in links]

            list_of_dict = []
            for tg,tc,d,p,t,l in zip(title_tag, title_content, date, perspective, tag, links_list):
                economy_market_dict = {
                                    'company': 'Vanguard', 
                                    'topic': 'Insight', 
                                    'articel_title': tg, 
                                    'abstract' : tc,
                                    'date' : d,
                                    'perspective' : p,
                                    'tag': ','.join(t),
                                    'link':l,
                                    'content':self.extract_content_with_id(l),
                }
                list_of_dict.append(economy_market_dict)


            all_pages.append(list_of_dict)
            next_page = self.driver.find_element(By.CSS_SELECTOR,'#insights-archive-section > div.pagination > div.pagination__arrows > span.icon.icon-right')
            next_page.click()

        self.driver.quit()
        vanguard_insight_list = []
        for p_8 in all_pages:
            for p in p_8:
                vanguard_insight_list.append(p)
                
        return vanguard_insight_list


    @staticmethod
    def save_to_json(file_path, list_of_dict):
        with open(file_path, 'a') as f:
            for dict_ in list_of_dict:
                json.dump(dict_,f,indent=4)



In [9]:
vanguard_insight_obj = VanguardInsights('https://advisors.vanguard.com/insights/all')
vanguard_insight_list = vanguard_insight_obj.get_article_insight()

In [10]:
df = pd.DataFrame(vanguard_insight_list)

In [11]:
df['tag'].unique()

array(['Fixed Income', 'U.S.,International', 'ETF Investing',
       'Fixed Income,U.S.', 'Portfolio Construction',
       "Model Portfolios,Portfolio Construction,Advisor's Alpha®,Behavioral Coaching",
       "Portfolio Construction,Advisor's Alpha®,Wealth Management",
       'ESG Strategies,Equities', 'Personalized Indexing',
       'U.S.,ETF Investing,Fixed Income', 'U.S.',
       'Model Portfolios,Fixed Income', 'U.S.,Behavioral Coaching',
       'U.S.,Fixed Income', 'Fixed Income,Portfolio Construction',
       'Wealth Management', "Advisor's Alpha®",
       'Fixed Income,Equities,Portfolio Construction', 'International',
       'Fixed Income,U.S.,Bear Markets',
       "Advisor's Alpha®,Behavioral Coaching,Portfolio Construction",
       'Equities', "Advisor's Alpha®,Behavioral Coaching,Bear Markets",
       "Advisor's Alpha®,Behavioral Coaching",
       "Advisor's Alpha®,Behavioral Coaching,U.S.", '',
       'Equities,Personalized Indexing',
       'Equities,Portfolio Constructio

In [12]:
df['tag2'] = df['tag'].apply(lambda x: x.split(',')[0])

In [1]:
df

NameError: name 'df' is not defined

# Create Search Engine

### Application of tf-idf for Searching Text

In order to understand how to use tf-idf, I am going to make use of this technique in a text searching application. I will use a dataset of Vanguard Insight which I have scrape it from Vanguard website.  

### Implementation

In [22]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [15]:
#export
def preprocess(title, body=None):
    """ Preprocess the input, i.e. lowercase, remove html tags, special character and digits."""
    text = ''
    if body is None:
        text = title
    else:
        text = title + body
    # to lower case
    text = text.lower()

    # remove tags
    text = re.sub("</?.*?>"," <> ", text)
    
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+"," ", text).strip()
    return text
    
def create_tfidf_features(corpus, max_features=5000, max_df=0.95, min_df=2):
    """ Creates a tf-idf matrix for the `corpus` using sklearn. """
    tfidf_vectorizor = TfidfVectorizer(decode_error='replace', strip_accents='unicode', analyzer='word', 
                                       stop_words='english', ngram_range=(1, 1), max_features=max_features, 
                                       norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
                                       max_df=max_df, min_df=min_df)
    X = tfidf_vectorizor.fit_transform(corpus)
    print('tfidf matrix successfully created.')
    return X, tfidf_vectorizor

def calculate_similarity(X, vectorizor, query, top_k=5):
    """ Vectorizes the `query` via `vectorizor` and calculates the cosine similarity of 
    the `query` and `X` (all the documents) and returns the `top_k` similar documents."""
    
    # Vectorize the query to the same length as documents
    query_vec = vectorizor.transform(query)
    # Compute the cosine similarity between query_vec and all the documents
    cosine_similarities = cosine_similarity(X,query_vec).flatten()
    # Sort the similar documents from the most similar to less similar and return the indices
    most_similar_doc_indices = np.argsort(cosine_similarities, axis=0)[:-top_k-1:-1]
    return (most_similar_doc_indices, cosine_similarities)

def show_similar_documents(df, cosine_similarities, similar_doc_indices):
    """ Prints the most similar documents using indices in the `similar_doc_indices` vector."""
    counter = 1
    for index in similar_doc_indices:
        print('Top-{}, Similarity = {}'.format(counter, cosine_similarities[index]))
        print('body: {}, '.format(df[index]))
        print()
        counter += 1

In [16]:
sample_index = np.random.randint(len(df))
sample = df.loc[sample_index,['articel_title', 'content']]
print('title: {}, \ncontent: {}'.format(sample['articel_title'],sample['content']))

title: Fragile—handle with care, 
content: Updated April 29, 2022
Investors with bond-centric portfolios are likely feeling fragile after unusually poor first-quarter returns. Here are a few tips for handling them with care.

The first-quarter 2022 market downturn squeezed equity and fixed income asset classes alike. This less common market event of both negative stock and bond returns likely affected all investors, but its impact may have gained special notice from conservative, bond-centric investors because they also have the highest loss aversion.
You might find it beneficial to reach out proactively to your most conservatively invested clients, as they may be less accustomed to simultaneous stock and bond drawdowns of the magnitude we just witnessed.
As their advisor, you've spent a great deal of time building a strong relationship with your clients to gain their trust and develop a thoughtful financial plan. One unfavorable quarter should not trigger a change to that plan if it w

In [17]:
# Preprocess the corpus
data = [preprocess(title, body) for title, body in zip(df['articel_title'], df['content'])]

In [20]:
print('creating tfidf matrix...')
# Learn vocabulary and idf, return term-document matrix
X,v = create_tfidf_features(data)
features = v.get_feature_names_out()
len(features)

creating tfidf matrix...
tfidf matrix successfully created.


4031

In [23]:
user_question = ['What is the Market trend?']
search_start = time.time()
sim_vecs, cosine_similarities = calculate_similarity(X, v, user_question)
search_time = time.time() - search_start
print("search time: {:.2f} ms".format(search_time * 1000))
print()
show_similar_documents(data, cosine_similarities, sim_vecs)

search time: 6.47 ms

Top-1, Similarity = 0.14142986846146868
body: vanguard economic and market outlook, 

Top-2, Similarity = 0.0838665262702267
body: opportunities in emerging market bondspatient investors are likely to be rewarded as we expect yields to rally that is decline in emerging markets em that s the key finding of higher inflation is creating an opportunity in emerging markets an analysis produced by vanguard fixed income group s fig nishan pradhan liza ermolenko zoe odenwalder nick eisinger and daniel shaykevich reasons for optimism the authors cite two major reasons why they think the peak in em inflation is near and why the resulting disinflation will create attractive buying opportunities differences in goods and services consumption patterns have contributed to higher em inflation recent inflation in emerging markets has been more supply driven than in developed markets this makes em inflation more likely to fall when supply side pressures abate in general central ban

# Connect to Data Base

In [3]:
import psycopg2 

In [4]:
def db_query(username, password, host, port, db_name, command, read_query=None):
        """ run query in PostgreSQL data """

        try:

            with psycopg2.connect(database=db_name,
                                  user=username,
                                  password=password,
                                  host=host,
                                  port=port) as conn:

                with conn.cursor() as curs:
                    conn.autocommit = True
                    curs.execute(command)
                    if read_query:
                        data = curs.fetchall()

            if read_query:
                return data

        except (Exception, psycopg2.DatabaseError) as error:
            print(error)
            
# obj_data_base_class = DataBaseClass('guest', 'Aa12345', 'localhost', 5432, 'insights_db')

In [5]:
vanguard_command = """select * from insights_data"""

vanguard_data = db_query(
    username='guest',
    password='Aa12345',
    host='localhost',
    port=5432,
    db_name='insights_db',
    command=vanguard_command,
    read_query=True
)

In [16]:
vanguard_col_command = """select column_name from information_schema.columns where table_name='insights_data'"""

vanguard_col = db_query(
    username='guest',
    password='Aa12345',
    host='localhost',
    port=5432,
    db_name='insights_db',
    command=vanguard_col_command,
    read_query=True
)

In [17]:
vanguard_col = [tup[0] for tup in vanguard_col]

In [18]:
vanguard_col

['company',
 'topic',
 'article_title',
 'abstract',
 'date',
 'perspective',
 'tag',
 'link',
 'content']

In [19]:
vanguard_df = pd.DataFrame(vanguard_data, columns=vanguard_col)

In [20]:
vanguard_df.head()

Unnamed: 0,company,topic,article_title,abstract,date,perspective,tag,link,content
0,Vanguard,Insight,Vanguard Expands Tax-Exempt Bond Lineup with n...,Vanguard announces Vanguard Short-Term Tax-Exe...,"DECEMBER 21, 2022",PRODUCT NEWS,Fixed Income,https://advisors.vanguard.com/insights/article...,Vanguard today filed an initial registration s...
1,Vanguard,Insight,Vanguard Economic and Market Outlook 2023,Key takeaways from the 2023 Vanguard Economic ...,"DECEMBER 11, 2022",VANGUARD PERSPECTIVE,"U.S.,International",https://advisors.vanguard.com/insights/article...,Vanguard Economic and Market Outlook for 2023:...
2,Vanguard,Insight,Tallying the total cost of owning an ETF,"With the rise of commission-free ETF trading, ...","DECEMBER 08, 2022",EXPERT PERSPECTIVE,ETF Investing,https://advisors.vanguard.com/insights/article...,With the rise of commission-free ETF trading a...
3,Vanguard,Insight,Muni yields haven't been this strong for 10 years,Many of your clients can benefit from the high...,"DECEMBER 07, 2022",VANGUARD PERSPECTIVE,"Fixed Income,U.S.",https://advisors.vanguard.com/insights/article...,Looking for more yield? Munis are at a decade-...
4,Vanguard,Insight,Ahead of the curve: Munis prepare for economic...,Vanguard municipal bond portfolio managers tal...,"DECEMBER 07, 2022",EXPERT PERSPECTIVE,Fixed Income,https://advisors.vanguard.com/insights/article...,"In this wide-ranging Q&A discussion, Vanguard ..."


# Creat Text Summarization Approaches

### with gensim

In [21]:
from gensim.summarization import summarize

In [22]:
short_content = summarize(vanguard_df.content[0])

In [26]:
len(short_content)

1220

In [25]:
len(vanguard_df.content[0])

4449

### with sumy

In [30]:
import sumy
from sumy import summarizers

In [31]:
dir(summarizers)

['AbstractSummarizer',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_summarizer',
 'absolute_import',
 'division',
 'print_function',
 'unicode_literals']

### LexRank

In [None]:
from sumy.summarizers.lex_rank import LexRankSummarizer
summarizer_lex = LexRankSummarizer()

# Summarize using sumy LexRank
summary= summarizer_lex(parser.document, 2)
lex_summary=""
for sentence in summary:
lex_summary+=str(sentence)
print(lex_summary)

print(text_summary)

