In [48]:
import requests
from bs4 import BeautifulSoup
import re
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [None]:
nltk.download("stopwords")

def scrape_text_from_url(url):
    response = requests.get(url)
    response.raise_for_status()  
    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.get_text(separator=" ", strip=True)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/snehsuresh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    words = [word for word in text.split() if word not in stop_words]

    # Filter out tokens that can be converted to integers (purely numeric strings)
    def is_numeric(word):
        try:
            int(word) 
            return True  
        except ValueError:
            return False  
    words = [word for word in words if not is_numeric(word)]
    return words

In [None]:

def calculate_word_frequency(words):
    return Counter(words)

def calculate_tfidf(documents):
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()
    return feature_names, tfidf_scores

In [None]:
def analyze_webpage(url):
    text = scrape_text_from_url(url)
    words = preprocess_text(text)
    word_freq = calculate_word_frequency(words)

    print("Word Frequency:", word_freq)

    documents = [text]
    feature_names, tfidf_scores = calculate_tfidf(documents)

    print("\nTF-IDF Scores:")

    word_score_pairs = list(zip(feature_names, tfidf_scores[0]))

    sorted_word_score_pairs = sorted(word_score_pairs, key=lambda x: x[1], reverse=True)


    for word, score in sorted_word_score_pairs:
        if score > 0:
            print(f"{word}: {score:.3f}")

In [53]:
url = "https://www.jpmorgan.com/insights/outlook/economic-outlook/cpi-report-august-2024#:~:text=The%20August%202024%20Consumer%20Price%20Index%20(CPI)%20rose%20by%200.2,from%202.9%25%20YoY%20in%20July.&text=This%20marks%20the%20smallest%20annual,the%20Federal%20Reserve's%202%25%20target."
analyze_webpage(url)

Word Frequency: Counter({'services': 38, 'cpi': 26, 'jp': 24, 'investment': 23, 'morgan': 22, 'us': 21, 'products': 21, 'inflation': 20, 'august': 18, 'prices': 18, 'core': 16, 'chase': 16, 'report': 15, 'wealth': 15, 'market': 14, 'information': 14, 'mom': 13, 'may': 13, 'read': 12, 'rose': 11, 'insurance': 11, 'jpmorgan': 11, 'outlook': 10, 'fed': 10, 'index': 10, 'rise': 10, 'strategists': 10, 'management': 10, 'fell': 10, 'material': 10, 'jpms': 10, 'yoy': 9, 'view': 9, 'ibid': 9, 'strategies': 8, 'shelter': 8, 'please': 7, 'food': 7, 'energy': 7, 'change': 7, 'year': 7, 'co': 7, 'risks': 7, 'provided': 7, 'agency': 7, 'target': 6, 'advisors': 6, 'clients': 6, 'september': 6, 'price': 6, 'since': 6, 'data': 6, 'doesnt': 6, 'headline': 6, 'financial': 6, 'strategy': 6, 'important': 6, 'views': 6, 'asset': 6, 'including': 6, 'securities': 6, 'bank': 6, 'eye': 6, 'feds': 5, 'contact': 5, 'advisor': 5, 'explore': 5, 'banking': 5, 'rates': 5, 'next': 5, 'consumer': 5, 'increase': 5, 'dr