## Keyword Extraction Using NLP

Installing and Importing Libraries

In [13]:
!pip install requests spacy beautifulsoup4 gensim nltk PyPDF2 python-docx

import requests
from bs4 import BeautifulSoup
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
import nltk
import PyPDF2
import docx

# Fetch the NLTK English stop-word list used by the topic-extraction step.
nltk.download('stopwords')
# Load spaCy's small English pipeline once; preprocess_text() reuses this object.
nlp = spacy.load("en_core_web_sm")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text Extraction from URL, PDF, and Word Files

In [14]:
# Functions to extract text
# Function to extract text from a webpage
def extract_text_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The text of every ``<p>`` tag joined by single spaces, or ``None``
        when the request fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into exceptions so they are reported below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Timeouts and connection errors are RequestException subclasses too.
        print(f"Error fetching URL: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    return " ".join(p.text for p in soup.find_all('p'))

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the menu in this
            notebook passes a file opened in binary mode.

    Returns:
        The extracted page texts joined by newlines (empty string for a
        PDF without pages).
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next. `or ""` keeps the join safe should a page
    # yield no text (None) instead of a string.
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to extract text from a word file
def extract_text_from_word(word_file):
    """Return the paragraph texts of a Word document, one per line.

    Args:
        word_file: Source handed to ``docx.Document``; the menu in this
            notebook passes a file opened in binary mode.

    Returns:
        The text of each entry in ``Document.paragraphs`` joined by newlines.
    """
    document = docx.Document(word_file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)


Preprocessing and Key Phrase Extraction

In [15]:
# Preprocessing function to clean and tokenize text

# Preprocessing: keep the lemmas of alphabetic, non-stop-word tokens
def preprocess_text(text):
    """Reduce raw text to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words or containing non-alphabetic characters are dropped
    and the remaining tokens are replaced by their lemma.

    Args:
        text: Raw text to clean.

    Returns:
        The kept lemmas joined by single spaces.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

# Function to extract key phrases using TF-IDF
def extract_key_phrases(text, top_n=10):
    """Return the highest-scoring 3- to 5-word phrases of ``text``.

    The text is vectorized as a single document with ``TfidfVectorizer``
    (n-grams of 3 to 5 words, English stop words removed) and the n-grams are
    ranked by score, highest first. The sort is stable, so phrases with equal
    scores keep the order in which the vectorizer lists them.

    Args:
        text: Text to analyse (typically already preprocessed).
        top_n: Maximum number of phrases to return.

    Returns:
        A list of at most ``top_n`` phrases; an empty list when the text is
        blank or too short to contain any 3-word n-gram.
    """
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(ngram_range=(3, 5), stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError (empty vocabulary) when no n-gram
        # survives, e.g. fewer than three non-stop words in the text.
        return []

    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray().flatten()
    ranked = sorted(zip(feature_names, tfidf_scores), key=lambda pair: pair[1], reverse=True)
    return [phrase for phrase, _ in ranked[:top_n]]


Topic Extraction Using LDA

In [16]:
# Function to extract topics using LDA
def extract_topics(text, num_topics=3, num_words=5, random_state=42):
    """Fit an LDA model on ``text`` and return its topics.

    The text is split on whitespace, NLTK English stop words are removed
    (case-insensitively), and the remaining tokens form a one-document corpus
    for gensim's ``LdaModel``.

    Args:
        text: Text to analyse (typically already preprocessed).
        num_topics: Number of topics to fit.
        num_words: Number of words shown per topic.
        random_state: Seed passed to ``LdaModel`` so repeated runs on the
            same text give the same topics.

    Returns:
        The result of ``LdaModel.print_topics(num_words=num_words)``, or an
        empty list when no tokens remain after stop-word removal.
    """
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in text.split() if word.lower() not in stop_words]
    if not tokens:
        # An LDA model cannot be trained on a corpus without terms.
        return []

    dictionary = corpora.Dictionary([tokens])
    corpus = [dictionary.doc2bow(tokens)]

    # Training our LDA model; seeded because LDA initialisation is random.
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=random_state,
    )
    return lda_model.print_topics(num_words=num_words)


Key phrases and topics extraction

In [17]:
# Function to extract key phrases and topics from a URL, PDF, or Word source
def extract_from_source(source_type, source, top_n=10, num_topics=3, num_words=5):
    """Extract text from a source and print its key phrases and topics.

    Args:
        source_type: ``"url"``, ``"pdf"`` or ``"word"``; anything else prints
            an error and returns.
        source: The URL string, or the PDF/Word file to read.
        top_n: Number of key phrases to print.
        num_topics: Number of LDA topics to fit.
        num_words: Number of words printed per topic.

    Returns:
        None. All results and error messages are printed.
    """
    extractors = {
        "url": extract_text_from_url,
        "pdf": extract_text_from_pdf,
        "word": extract_text_from_word,
    }
    extractor = extractors.get(source_type)
    if extractor is None:
        print("Invalid source type!")
        return

    try:
        text = extractor(source)
    except Exception as exc:
        # The PDF/Word readers raise library-specific errors on corrupt or
        # unsupported files; report them like the URL path does instead of
        # letting them abort the caller.
        print(f"Error reading {source_type} source: {exc}")
        text = None

    if not text:
        print("Failed to extract text from the source.")
        return

    cleaned_text = preprocess_text(text)
    if not cleaned_text.strip():
        # Nothing but stop words / non-alphabetic tokens: the TF-IDF and LDA
        # steps have no vocabulary to work with.
        print("No usable text left after preprocessing.")
        return

    # Extract and display key phrases
    print("\nExtracting key phrases:")
    key_phrases = extract_key_phrases(cleaned_text, top_n)
    print(f"Key Phrases: {key_phrases}")

    # Extract and display topics
    print("\nExtracting topics:")
    topics = extract_topics(cleaned_text, num_topics, num_words)
    for idx, topic in enumerate(topics):
        print(f"Topic {idx + 1}: {topic}")


Main Function and User Interface

In [18]:
# Main function to handle the user interface
def main():
    """Run the interactive text menu until the user chooses to exit.

    Repeatedly prints the menu, reads a choice, and runs the extraction
    pipeline for a URL, a PDF file or a Word file. Surrounding whitespace in
    the typed choice, URL and paths is ignored.
    """
    while True:
        print("\nSelect the input method:")
        print("1. URL")
        print("2. PDF file")
        print("3. Word file")
        print("4. Exit")
        choice = input("Enter your choice: ").strip()
        if choice == "1":
            url = input("Enter the URL: ").strip()
            extract_from_source("url", url)
        elif choice == "2":
            pdf_file_path = input("Enter the path to the PDF file: ").strip()
            _process_local_file("pdf", pdf_file_path)
        elif choice == "3":
            word_file_path = input("Enter the path to the Word file: ").strip()
            _process_local_file("word", word_file_path)
        elif choice == "4":
            print("Program Exit.")
            break
        else:
            print("Invalid choice. Please select a valid option")


def _process_local_file(source_type, file_path):
    """Open ``file_path`` in binary mode and run the extraction pipeline on it."""
    try:
        with open(file_path, "rb") as file_obj:
            extract_from_source(source_type, file_obj)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except OSError as exc:
        # Directories, permission problems, etc. must not end the menu loop.
        print(f"Could not open file: {file_path} ({exc})")

# Start the interactive menu only when this code runs as the main program.
if __name__ == "__main__":
    main()



Select the input method:
1. URL
2. PDF file
3. Word file
4. Exit
Enter your choice: 1
Enter the URL: https://www.nbcnews.com/

Extracting key phrases:
Key Phrases: ['access feature free', 'access feature free account', 'access feature free account valid', 'account valid zip', 'account valid zip code', 'account valid zip code nbc', 'advertisement access feature', 'advertisement access feature free', 'advertisement access feature free account', 'alert time advertisement']

Extracting topics:
Topic 1: (0, '0.056*"Advertisement" + 0.056*"account" + 0.056*"ZIP" + 0.056*"alert" + 0.056*"News"')
Topic 2: (1, '0.124*"NBC" + 0.086*"feature" + 0.049*"Sections" + 0.049*"UNIVERSAL" + 0.049*"Access"')
Topic 3: (2, '0.056*"NBC" + 0.056*"feature" + 0.056*"News" + 0.056*"new" + 0.056*"Profile"')

Select the input method:
1. URL
2. PDF file
3. Word file
4. Exit
Enter your choice: 2
Enter the path to the PDF file: /content/Network Security (ch-8).pdf

Extracting key phrases:
Key Phrases: ['bob public k