In [None]:
python -m spacy download en_core_web_sm

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy
from gensim import corpora, models

# Download required NLTK data.
# 'stopwords' backs stopwords.words('english'); 'punkt' / 'punkt_tab' back
# word_tokenize and sent_tokenize. Recent NLTK releases look up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# requested to keep the tokenizers working across NLTK versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Text preprocessing
def preprocess_text(text):
    """Convert raw text into a list of stemmed, lowercase, non-stopword tokens.

    Steps, in order:
      1. Strip ``<...>`` tag-like spans (non-greedy; ``.`` does not cross
         line breaks, so a tag split over several lines is left in place).
      2. Lowercase the text and tokenize it with NLTK's ``word_tokenize``.
      3. Discard tokens found in NLTK's English stopword list.
      4. Stem every remaining token with the Porter stemmer.

    No other filtering is applied: any token that is not a stopword is
    stemmed and kept.

    Args:
        text: The raw input string.

    Returns:
        A list of stemmed token strings, in their original order.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    words = word_tokenize(without_tags.lower())

    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stemmed_words = []
    for word in words:
        if word in english_stopwords:
            continue
        stemmed_words.append(stemmer.stem(word))

    return stemmed_words

# Named Entity Recognition (NER)
# Load the en_core_web_sm spaCy pipeline once at module level so that
# extract_entities can reuse the same object on every call. The model must
# already be installed (python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    """Return the named entities that the module-level spaCy pipeline finds.

    Args:
        text: The raw string to analyze.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in
        ``doc.ents``, in the order the pipeline reports them.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# Topic Modeling
def topic_modeling(texts, num_topics=10, random_state=None):
    """Fit an LDA topic model on tokenized documents and return its topics.

    Args:
        texts: Iterable of documents, each a sequence of string tokens
            (for example the output of ``preprocess_text``).
        num_topics: Number of topics to fit. Defaults to 10, the value that
            was previously hard-coded.
        random_state: Optional seed forwarded to ``LdaMulticore`` so repeated
            runs can be made reproducible. ``None`` (the default) keeps
            gensim's own seeding.

    Returns:
        Whatever ``LdaMulticore.print_topics`` returns for the fitted model,
        covering every fitted topic.
    """
    # texts is traversed twice (dictionary, then bag-of-words corpus). A
    # one-shot iterator such as a generator would be exhausted after the
    # first pass and silently produce an empty corpus, so materialize it.
    texts = list(texts)

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train LDA model
    lda_model = models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
    )

    # print_topics shows at most 20 topics by default; request as many as
    # were fitted so a larger num_topics is not truncated.
    return lda_model.print_topics(num_topics=num_topics)

# Example usage
# Sample news text used to exercise the pipeline end to end.
article_text = """If the 73-year-old Modi wins, it would only be the second time an Indian leader has retained power for a third term after Jawaharlal Nehru, the country’s first prime minister.

Some three hours into the counting, early leads reported by the Election Commission showed Modi’s Hindu nationalist Bharatiya Janata Party comfortably ahead of the main opposition Congress party.

The preliminary figures showed the BJP ahead in 239 constituencies out of 542 and winning one uncontested race. Congress was leading in 96 constituencies.

The Election Commission does not release data on the percentage of votes tallied, but counting was to go on throughout the day and early figures were expected to change."""  # Replace with your news article text

# Preprocess text: strip tags, lowercase, tokenize, drop stopwords, stem.
preprocessed_text = preprocess_text(article_text)

# Extract entities from the raw (unprocessed) text, not the stemmed tokens.
entities = extract_entities(article_text)
print("Entities:", entities)

# Topic modeling
# topic_modeling expects a list of token lists; here the whole article is
# passed as a single document.
# NOTE(review): with only one document the fitted topics are unlikely to be
# meaningful; consider passing one token list per article or paragraph.
# NOTE(review): LdaMulticore presumably starts worker processes; when run as
# a plain script this may need an `if __name__ == "__main__":` guard on
# some platforms. Confirm against the gensim documentation.
documents = [preprocessed_text]  # Use the preprocessed article text
topics = topic_modeling(documents)
print("Topics:", topics)