In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_keywords(documents, top_n=10, ngram_range=(1, 2)):
    """
    Extracts the most relevant keywords from each document using TF-IDF.

    Parameters:
        documents (list): List of text documents (any iterable of strings works).
        top_n (int): Maximum number of keywords to return per document.
            Values <= 0 produce an empty keyword list for every document.
        ngram_range (tuple): Range of n-grams to consider (unigrams, bigrams, etc.).

    Returns:
        dict: Dictionary with document index (0-based) as key and a list of
            keywords as value, ordered from highest to lowest TF-IDF score.
            Only terms with a non-zero score for that document are returned,
            so a list may hold fewer than top_n entries.

    Raises:
        ValueError: If documents is a single string instead of a collection of
            strings. The underlying vectorizer may also raise ValueError when
            no terms survive tokenization and stop-word removal.
    """
    if isinstance(documents, str):
        # A bare string would otherwise be split into single characters.
        raise ValueError("documents must be an iterable of strings, not a single string")

    # Materialize once so generators are not exhausted by fit_transform and
    # the number of documents is known up front.
    documents = list(documents)

    # Nothing to rank. This also avoids the `[-0:]` slice further down, which
    # would select every feature instead of none.
    if not documents or top_n <= 0:
        return {i: [] for i in range(len(documents))}

    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)

    # Compute TF-IDF matrix (one row per document, one column per term)
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Get feature names (words/phrases), aligned with the matrix columns
    feature_names = vectorizer.get_feature_names_out()

    # Extract top keywords for each document
    keyword_dict = {}

    for i in range(tfidf_matrix.shape[0]):
        tfidf_scores = tfidf_matrix[i].toarray().flatten()
        # argsort is ascending: keep the last top_n indices, then reverse so the
        # best-scoring term comes first.
        top_indices = np.argsort(tfidf_scores)[-top_n:][::-1]
        # A zero score means the term does not occur in this document; such
        # terms are padding, not keywords, so they are dropped.
        keyword_dict[i] = [
            feature_names[idx] for idx in top_indices if tfidf_scores[idx] > 0
        ]

    return keyword_dict

# Sample corpus: two spaceflight sentences and one about baseball
documents = [
    "The space shuttle launched successfully and reached orbit.",
    "The baseball game was intense, with the team scoring in the last inning.",
    "Astronauts conduct spacewalks to repair satellites in orbit.",
]

# Rank the five best-scoring terms for every document
keywords = extract_keywords(documents, top_n=5)

# Report the results using 1-based document numbers
for doc_id in keywords:
    words = keywords[doc_id]
    print(f"Document {doc_id + 1} Keywords: {', '.join(words)}")


Document 1 Keywords: reached orbit, space, launched successfully, reached, shuttle
Document 2 Keywords: team scoring, game, intense team, intense, scoring
Document 3 Keywords: astronauts, satellites orbit, astronauts conduct, repair, spacewalks repair


In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the spaCy "en_core_web_sm" pipeline once at module level. The resulting
# `nlp` object is shared by preprocess_text() and extract_named_entities() below.
# The "en_core_web_sm" package must be installed for this call to succeed.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """
    Reduces a raw document to a space-separated string of lemmas.

    The text is run through the module-level `nlp` pipeline. Tokens flagged as
    stop words, punctuation, or number-like are discarded; every remaining
    token contributes its lemma.

    Parameters:
        text (str): Input text document.

    Returns:
        str: The kept lemmas joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip anything that carries no keyword value.
        if tok.is_stop or tok.is_punct or tok.like_num:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

def extract_keywords(documents, top_n=10, ngram_range=(1, 2)):
    """
    Extracts the most relevant keywords from each document using TF-IDF with lemmatization.

    Every document is first passed through preprocess_text(), and the TF-IDF
    scores are computed on the preprocessed text.

    Parameters:
        documents (list): List of text documents (any iterable of strings works).
        top_n (int): Maximum number of keywords to return per document.
            Values <= 0 produce an empty keyword list for every document.
        ngram_range (tuple): Range of n-grams to consider (e.g., unigrams, bigrams).

    Returns:
        dict: Dictionary with document index (0-based) as key and a list of
            keywords as value, ordered from highest to lowest TF-IDF score.
            Only terms with a non-zero score for that document are returned,
            so a list may hold fewer than top_n entries.

    Raises:
        ValueError: If documents is a single string instead of a collection of
            strings. The underlying vectorizer may also raise ValueError when
            no terms survive preprocessing and tokenization.
    """
    if isinstance(documents, str):
        # A bare string would otherwise be preprocessed character by character.
        raise ValueError("documents must be an iterable of strings, not a single string")

    # Materialize once so the number of documents is known up front.
    documents = list(documents)

    # Nothing to rank: skip the (comparatively expensive) preprocessing. This
    # also avoids the `[-0:]` slice further down, which would select every
    # feature instead of none.
    if not documents or top_n <= 0:
        return {i: [] for i in range(len(documents))}

    # Preprocess documents (lemmatization, stopword removal)
    processed_docs = [preprocess_text(doc) for doc in documents]

    # Initialize TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=ngram_range)

    # Compute TF-IDF matrix (one row per document, one column per term)
    tfidf_matrix = vectorizer.fit_transform(processed_docs)

    # Get feature names (words/phrases), aligned with the matrix columns
    feature_names = vectorizer.get_feature_names_out()

    # Extract top keywords for each document
    keyword_dict = {}

    for i in range(tfidf_matrix.shape[0]):
        tfidf_scores = tfidf_matrix[i].toarray().flatten()
        # argsort is ascending: keep the last top_n indices, then reverse so the
        # best-scoring term comes first.
        top_indices = np.argsort(tfidf_scores)[-top_n:][::-1]
        # A zero score means the term does not occur in this document; such
        # terms are padding, not keywords, so they are dropped.
        keyword_dict[i] = [
            feature_names[idx] for idx in top_indices if tfidf_scores[idx] > 0
        ]

    return keyword_dict

def extract_named_entities(text):
    """
    Extracts the distinct named-entity strings found in a text document.

    The text is run through the module-level `nlp` pipeline and the surface
    text of every entity span in `doc.ents` is collected.

    Parameters:
        text (str): Input text document.

    Returns:
        list: Unique entity strings, in order of first appearance in the text.
    """
    doc = nlp(text)
    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # would give an arbitrary order that can change between kernel restarts
    # (string hashing is randomized), making the printed output irreproducible.
    return list(dict.fromkeys(ent.text for ent in doc.ents))

# Sample corpus containing organizations, events and team names
documents = [
    "NASA successfully launched a new satellite into orbit for space research.",
    "The World Series baseball game was exciting, with the Yankees winning in extra innings.",
    "Astronauts at the International Space Station conducted experiments on zero gravity.",
]

# Rank the five best-scoring (lemmatized) terms for every document
keywords = extract_keywords(documents, top_n=5)

# Named entities are taken from the raw text, keyed by document position
named_entities = {
    idx: extract_named_entities(text) for idx, text in enumerate(documents)
}

# Report keywords and entities per document, separated by a rule
for doc_id in keywords:
    words = keywords[doc_id]
    print(f"Document {doc_id + 1} Keywords: {', '.join(words)}")
    print(f"Document {doc_id + 1} Named Entities: {', '.join(named_entities[doc_id])}")
    print("-" * 80)


Document 1 Keywords: nasa successfully, orbit space, satellite, satellite orbit, orbit
Document 1 Named Entities: NASA
--------------------------------------------------------------------------------
Document 2 Keywords: yankees win, exciting, innings, yankees, game exciting
Document 2 Named Entities: World Series, Yankees
--------------------------------------------------------------------------------
Document 3 Keywords: astronaut, experiment, astronaut international, international space, international
Document 3 Named Entities: the International Space Station, zero
--------------------------------------------------------------------------------
