In [14]:
!pip install nltk scikit-learn



In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Download necessary NLTK data files

In [2]:
# Data packages needed by stopwords, word_tokenize, pos_tag and ne_chunk.
# Recent NLTK releases (3.9 and later) look up the "*_tab" / "*_eng" variants,
# while older releases use the original packages, so both sets are requested.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAGHAVA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAGHAVA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\RAGHAVA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\RAGHAVA\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\RAGHAVA\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [14]:
print("Enter a paragraph describing a topic: ")
# Trim surrounding whitespace and fail early on blank input; otherwise the
# failure only surfaces later, when the TF-IDF vectorizer is fitted on an
# empty document.
paragraph = input().strip()
if not paragraph:
    raise ValueError("No paragraph was entered; please provide some text to analyse.")

Enter a paragraph describing a topic: 
The forest was alive with the quiet hum of twilight, shadows stretching long and dark across the mossy ground as the sun dipped below the horizon. Ancient trees stood like silent sentinels, their gnarled branches twisting against the dimming sky, creating a cathedral of green and gold. A soft rustle of leaves broke the stillness, betraying the fleeting presence of a deer. It moved with effortless grace, its delicate form barely disturbing the undergrowth before disappearing into the thicket, as if it were a whispered secret shared only with the forest. The air was thick with the earthy scent of damp wood and blooming wildflowers, mingling with the cool bite of the evening, which carried with it the faint promise of rain. High above, the canopy swayed gently in a breeze that barely touched the forest floor, causing scattered rays of dying sunlight to flicker like fleeting embers. Somewhere in the treetops, a solitary bird sang a melody—sweet and me

# Step 1: Tokenization and Stopwords Removal

In [15]:
# Split the paragraph into word/punctuation tokens.
tokens = word_tokenize(paragraph)

# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep alphanumeric tokens only, lower-case them, then drop stop words.
clean_tokens = [
    lowered
    for lowered in (token.lower() for token in tokens if token.isalnum())
    if lowered not in stop_words
]

# Step 2: Named Entity Recognition

In [16]:
def extract_named_entities(pos_tags):
    """Return the text of every named-entity chunk found in ``pos_tags``.

    Args:
        pos_tags: Tagged tokens, passed unchanged to ``ne_chunk``; the first
            element of each tagged item is used as the word.

    Returns:
        A list of strings, one per chunk, each made of the chunk's words
        joined by single spaces, in the order the chunks occur.
    """
    chunk_tree = ne_chunk(pos_tags, binary=False)
    # Only Tree nodes are treated as named entities; other items are skipped.
    return [
        " ".join(leaf[0] for leaf in node)
        for node in chunk_tree
        if isinstance(node, Tree)
    ]

# Tag the original tokens rather than clean_tokens: the tagger and NE chunker
# rely on capitalisation and surrounding words, both of which are lost once the
# text has been lower-cased and stripped of stop words.
pos_tags = pos_tag(tokens)
named_entities = extract_named_entities(pos_tags)

# Step 3: TF-IDF Scoring

In [17]:
# Rebuild the paragraph from alphanumeric, non-stop-word tokens so that only
# content words reach the vectorizer.
clean_paragraph = " ".join(
    word
    for word in tokens
    if word.isalnum() and not word.lower() in stop_words
)

# The corpus is this single document, so the recorded scores come out
# proportional to each term's count (see the 3:2:1 pattern in the output).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([clean_paragraph])

# Map each vocabulary term to its score in the only row of the matrix.
tfidf_scores = {
    term: weight
    for term, weight in zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0])
}

# Step 4: Word Frequency Analysis

In [18]:
# Frequency distribution of the cleaned, lower-cased tokens.
freq_dist = nltk.FreqDist(clean_tokens)

# Rank terms by TF-IDF score (highest first) and discard non-alphabetic terms
# *before* truncating, so numeric tokens cannot use up the ten available slots.
key_phrases = [
    term
    for term, _score in sorted(tfidf_scores.items(), key=lambda item: item[1], reverse=True)
    if term.isalpha()
][:10]

# First three distinct named entities in order of appearance; dict.fromkeys
# removes repeats so the same entity is not listed twice in a title.
top_named_entities = list(dict.fromkeys(named_entities))[:3]
top_keywords = key_phrases[:5]  # Five highest-scoring keywords

# Step 5: Dynamically Generate the Title

In [19]:
# Placeholder list for title fragments built from the top named entities and
# keywords. NOTE(review): no later visible cell reads this variable.
title_parts = list()

In [25]:
# Choose the title template from whatever was extracted:
#   entity + keywords -> "<entity>: <up to three keywords>"
#   entity only       -> entity with a fixed tagline
#   keywords only     -> keywords with a fixed tagline
#   nothing           -> generic fallback title
if top_named_entities:
    if top_keywords:
        generated_title = f"{top_named_entities[0]}: {', '.join(top_keywords[:3])}"
    else:
        generated_title = f"{top_named_entities[0]}: Leading Innovation"
elif top_keywords:
    generated_title = f"{', '.join(top_keywords[:3])}: A Revolutionary Approach"
else:
    generated_title = "Innovative Developments in the Field"

In [26]:
# Show the primary title.
print("\nGenerated Title:")
print(generated_title)

# Alternative titles need both an entity and a keyword; otherwise say so.
print("\nAdditional Title Options:")
if not (top_named_entities and top_keywords):
    print("No meaningful additional title options could be generated.")
else:
    print(
        f"1. {top_named_entities[0]} and {top_keywords[0]}: Shaping the Future",
        f"2. Exploring {', '.join(top_keywords[:3])}",
        f"3. {', '.join(top_named_entities)} and their Vision",
        sep="\n",
    )


Generated Title:
forest, barely, fleeting: A Revolutionary Approach

Additional Title Options:
No meaningful additional title options could be generated.


# Step 6: Display Word Frequency

In [27]:
# List the ten most frequent cleaned tokens with their counts.
print("\nWord Frequency Analysis:")
for token, count in freq_dist.most_common(10):
    print(f"{token}: {count}")


Word Frequency Analysis:
forest: 3
quiet: 2
like: 2
stillness: 2
fleeting: 2
barely: 2
alive: 1
hum: 1
twilight: 1
shadows: 1


# Step 7: Display Key Phrases and Scores for Insight

In [28]:
# Print the ten highest-scoring terms, formatted to four decimal places.
print("\nKey Phrases by TF-IDF Scores:")
for term, weight in sorted(tfidf_scores.items(), key=lambda item: item[1], reverse=True)[:10]:
    print(f"{term}: {weight:.4f}")


Key Phrases by TF-IDF Scores:
forest: 0.2611
barely: 0.1741
fleeting: 0.1741
like: 0.1741
quiet: 0.1741
stillness: 0.1741
across: 0.0870
air: 0.0870
alive: 0.0870
ancient: 0.0870
