In [10]:
%pip install rake_nltk wordcloud matplotlib

Note: you may need to restart the kernel to use updated packages.


In [11]:
import requests
from bs4 import BeautifulSoup
import string
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rake_nltk import Rake
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [12]:
# Download the resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shyam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shyam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shyam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
def wikiscrape(url):
    # Get URL
    page = requests.get(url)

    # Scrape webpage
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find all occurrences of p in HTML
    # Includes HTML tags
    paragraphs = soup.find_all('p')

    # Remove square brackets and their contents using regular expressions
    pattern = re.compile(r'\[[^\]]*\]')
    content = ""
    for p in paragraphs:
        cleaned_text = re.sub(pattern, '', p.get_text())
        content += cleaned_text.lower()

    return content

In [14]:
def lemmatize_and_get_forms(word):
    lemmatizer = WordNetLemmatizer()
    forms = set()
    forms.add(lemmatizer.lemmatize(word))
    for synset in nltk.corpus.wordnet.synsets(word):
        for lemma in synset.lemmas():
            forms.add(lemma.name())
    return forms

In [15]:
def rake(text, ignore_words, keep_words):
    # Remove punctuation except hyphen
    translator = str.maketrans("", "", string.punctuation.replace("-", ""))
    text = text.translate(translator)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Reconstruct the modified text
    modified_text = " ".join(tokens)

    # Initialize RAKE
    r = Rake()

    # Extract keywords using RAKE
    r.extract_keywords_from_text(modified_text)

    # Get the ranked keyword phrases
    ranked_keywords = r.get_ranked_phrases()

    # Filter out keywords that contain ignore words
    filtered_keywords = [keyword for keyword in ranked_keywords if not any(word in keyword for word in ignore_words)]

    # Filter out keywords that do not contain any of the keep words or their forms
    if keep_words:
        final_forms = set()
        for word in keep_words:
            word_forms = lemmatize_and_get_forms(word)
            print(f"Word: {word} ; Word-forms: {word_forms}")
            final_forms.update(word_forms)
            filtered_keywords = [keyword for keyword in filtered_keywords if any(word in keyword for word in final_forms)]

        print(f"\nAll Word Forms: {final_forms}\n")


    return filtered_keywords[:20]

In [16]:
def generate_word_cloud(keywords):
    # Join the keywords into a single string
    text = " ".join(keywords)

    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)

    # Display the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()


In [17]:
"""def rake(text, ignore_words, keep_words):
    # Remove punctuation except hyphen
    translator = str.maketrans("", "", string.punctuation.replace("-", ""))
    text = text.translate(translator)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Reconstruct the modified text
    modified_text = " ".join(tokens)

    # Initialize RAKE
    r = Rake()

    # Extract keywords using RAKE
    r.extract_keywords_from_text(modified_text)

    # Get the ranked keyword phrases
    ranked_keywords = r.get_ranked_phrases()

    # Filter out keywords that contain ignore words
    filtered_keywords = [keyword for keyword in ranked_keywords if not any(word in keyword for word in ignore_words)]

    # Filter out keywords that do not contain any of the keep words or their forms
    if keep_words:
        final_forms = set()
        for word in keep_words:
            word_forms = lemmatize_and_get_forms(word)
            print(f"Word: {word} ; Word-forms: {word_forms}")
            final_forms.update(word_forms)

        print(f"\nAll Word Forms: {final_forms}\n")

    if final_forms:
        filtered_keywords = [keyword for keyword in filtered_keywords if any(word in keyword for word in final_forms)]

    return filtered_keywords[:20]"""

'def rake(text, ignore_words, keep_words):\n    # Remove punctuation except hyphen\n    translator = str.maketrans("", "", string.punctuation.replace("-", ""))\n    text = text.translate(translator)\n\n    # Tokenize the text\n    tokens = word_tokenize(text)\n\n    # Remove stopwords\n    stop_words = set(stopwords.words(\'english\'))\n    tokens = [token for token in tokens if token.lower() not in stop_words]\n\n    # Reconstruct the modified text\n    modified_text = " ".join(tokens)\n\n    # Initialize RAKE\n    r = Rake()\n\n    # Extract keywords using RAKE\n    r.extract_keywords_from_text(modified_text)\n\n    # Get the ranked keyword phrases\n    ranked_keywords = r.get_ranked_phrases()\n\n    # Filter out keywords that contain ignore words\n    filtered_keywords = [keyword for keyword in ranked_keywords if not any(word in keyword for word in ignore_words)]\n\n    # Filter out keywords that do not contain any of the keep words or their forms\n    if keep_words:\n        final_

In [18]:
while True:
    # Input URL
    last_part = input("Enter the last part of the Wikipedia URL (e.g., 'hotflashes'): ")
    if last_part == '1':
        break

    url = "https://en.wikipedia.org/wiki/" + last_part

    # Customized stopwords specific to your problem
    custom_stopwords = ["a", "an", "the"]

    # Words to ignore in the final keyword list
    ignore_words_input = input("Enter the words to ignore in the final keyword list, separated by hyphens: ")
    ignore_words = ignore_words_input.split("-") if ignore_words_input != '1' else []

    # Words to keep in the final keyword list
    keep_words_input = input("Enter the words to keep in the final keyword list, separated by hyphens: ")
    keep_words = keep_words_input.split("-") if keep_words_input != '1' else []

    # Calling wikiscrape()
    article_text = wikiscrape(url)

    # Apply RAKE algorithm
    filtered_keywords = rake(article_text, ignore_words, keep_words)

    # Print the filtered keywords
    if filtered_keywords:
        print("Filtered Keywords:")
        count = 1
        for keyword in filtered_keywords:
            print(f"{count}. {keyword}")
            count += 1
    else:
        print("No keywords found.")

    # Generate and display the word cloud
    generate_word_cloud(filtered_keywords)

    print()

print("Exiting the program.")

Filtered Keywords:
1. time high vast majority resources spent interdiction law enforcement instead public health united states number nonviolent drug offenders prison exceeds 100000 total incarcerated population eu despite fact eu 100 million citizens despite drug legislation perhaps large organized criminal drug cartels operate worldwide advocates decriminalization argue drug prohibition makes drug dealing lucrative business leading much associated criminal activity states us late focused facilitating safe use opposed eradicating example 2022 new jersey made effort expand needle exchange programs throughout state passing bill legislature gives control decisions regarding types programs states department health state level bill significant new jersey could used model states possibly follow well bill partly reaction issues occurring local level city governments within state new jersey late one example atlantic city government came lawsuit halted enactment said programs within city suit 

AttributeError: 'TransposedFont' object has no attribute 'getbbox'