Step 1: Load the dataset using pandas


In [2]:
import pandas as pd
# Read the news-headline workbook into a DataFrame and preview the first rows
news_path = '/content/LDA-Data.xlsx'
df = pd.read_excel(news_path)
print(df.head())

                             News
0   Virat scored century in match
1            BJP won in elections
2  Bumra took 5 wicket in a match
3  Congress form state government


Step 2: Text preprocessing


In [3]:
import re
import pandas as pd

# Re-read the Excel workbook into `df` before preprocessing
source_path = '/content/LDA-Data.xlsx'
df = pd.read_excel(source_path)

# Regular expressions used by preprocess_text, compiled once rather than on
# every call.
# URLs are matched case-insensitively and only at a word boundary, so
# "HTTP://..." is removed while ordinary words such as "httpd" are kept.
_URL_RE = re.compile(r'\bhttps?://\S+|\bwww\.\S+', flags=re.IGNORECASE)
_HTML_TAG_RE = re.compile(r'<.*?>')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+", flags=re.UNICODE)
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Normalize a raw text string for tokenization.

    Steps, in order: strip URLs, HTML tags, @mentions and #hashtags,
    lowercase, strip emojis, delete every character that is not a-z, 0-9 or
    whitespace, then collapse whitespace runs to single spaces and trim.

    Args:
        text: The raw text. Any non-string value (for example NaN from an
            empty cell) is treated as missing.

    Returns:
        The cleaned string, or "" when `text` is not a string.

    Notes:
        - Punctuation is deleted, not replaced by a space, so hyphenated
          words are fused ("semi-supervised" -> "semisupervised").
        - The HTML pattern removes everything between a "<" and the next ">"
          on the same line, including non-tag text such as "a < b > c".
        - Hashtags and mentions are dropped together with their word.
    """
    if not isinstance(text, str):
        return ""

    # Markup-like fragments go first, while their delimiters are still present
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)

    # Lowercase before the a-z filter so uppercase letters are not discarded
    text = text.lower()

    text = _EMOJI_RE.sub('', text)

    # Keep only lowercase letters, digits and whitespace
    text = _NON_ALNUM_RE.sub('', text)

    # Collapse whitespace left behind by the removals above
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean every entry of the 'News' column and keep the result next to the original
df['processed_news'] = df['News'].map(preprocess_text)

# Show raw and cleaned text side by side
preview_columns = ['News', 'processed_news']
print(df[preview_columns].head())

                             News                  processed_news
0   Virat scored century in match   virat scored century in match
1            BJP won in elections            bjp won in elections
2  Bumra took 5 wicket in a match  bumra took 5 wicket in a match
3  Congress form state government  congress form state government


Word tokenization


In [4]:
import nltk
from nltk.tokenize import word_tokenize

# Download the tokenizer resources used by word_tokenize
for resource in ('punkt', 'punkt_tab'):  # punkt_tab is required in newer NLTK versions
    nltk.download(resource)

# Split each cleaned sentence into a list of word tokens
df['tokenized_news'] = df['processed_news'].apply(word_tokenize)
print(df[['processed_news', 'tokenized_news']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                   processed_news                          tokenized_news
0   virat scored century in match     [virat, scored, century, in, match]
1            bjp won in elections               [bjp, won, in, elections]
2  bumra took 5 wicket in a match  [bumra, took, 5, wicket, in, a, match]
3  congress form state government     [congress, form, state, government]


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Stop-word removal


In [5]:
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus so stopwords.words() can read it
nltk.download('stopwords')

# English stopwords held in a set for fast membership checks
stop_words = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return the tokens whose lowercase form is not an English stopword."""
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept


# Filter every token list in tokenized_news
df['filtered_news'] = df['tokenized_news'].apply(_drop_stopwords)

# Compare token lists before and after filtering
print(df[['tokenized_news', 'filtered_news']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...


                           tokenized_news                        filtered_news
0     [virat, scored, century, in, match]      [virat, scored, century, match]
1               [bjp, won, in, elections]                     [bjp, elections]
2  [bumra, took, 5, wicket, in, a, match]      [bumra, took, 5, wicket, match]
3     [congress, form, state, government]  [congress, form, state, government]


[nltk_data]   Unzipping corpora/stopwords.zip.


Lemmatization

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the corpora the WordNet lemmatizer reads from
for corpus_name in ('wordnet', 'omw-1.4'):  # omw-1.4 is optional, improves lemmatization coverage
    nltk.download(corpus_name)

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every filtered token list.
# NOTE(review): lemmatize() is called without a part-of-speech argument; in the
# preview, verb forms such as "scored" and "took" come back unchanged while the
# plural "elections" is reduced. Consider POS-aware lemmatization if verbs matter.
df['lemmatized_news'] = df['filtered_news'].map(
    lambda words: list(map(lemmatizer.lemmatize, words))
)

# Compare tokens before and after lemmatization
print(df[['filtered_news', 'lemmatized_news']].head())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


                         filtered_news                      lemmatized_news
0      [virat, scored, century, match]      [virat, scored, century, match]
1                     [bjp, elections]                      [bjp, election]
2      [bumra, took, 5, wicket, match]      [bumra, took, 5, wicket, match]
3  [congress, form, state, government]  [congress, form, state, government]


Rejoin tokens into a single string

In [7]:
# Glue each lemmatized token list back together with single spaces
df['clean_news'] = df['lemmatized_news'].map(' '.join)

# Show the token lists next to the rebuilt strings
print(df[['lemmatized_news', 'clean_news']].head())

                       lemmatized_news                      clean_news
0      [virat, scored, century, match]      virat scored century match
1                      [bjp, election]                    bjp election
2      [bumra, took, 5, wicket, match]       bumra took 5 wicket match
3  [congress, form, state, government]  congress form state government


Step 4: Bag-of-Words (BoW)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and count term occurrences per sentence in one step
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['clean_news'])

# Dense copy of the counts, used only to make the printout readable
bow_array = bow_matrix.toarray()

# NOTE: with default settings CountVectorizer only keeps tokens of two or more
# characters, which is why the token "5" does not appear in the vocabulary.
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)
print("Bag-of-Words Matrix:\n", bow_array)

Vocabulary: ['bjp' 'bumra' 'century' 'congress' 'election' 'form' 'government' 'match'
 'scored' 'state' 'took' 'virat' 'wicket']
Bag-of-Words Matrix:
 [[0 0 1 0 0 0 0 1 1 0 0 1 0]
 [1 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 1 0 1]
 [0 0 0 1 0 1 1 0 0 1 0 0 0]]


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the cleaned sentences (fresh vectorizer fitted inside this cell)
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['clean_news'])

# One row per sentence, one column per vocabulary term
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Put the source sentence in the first column for reference
bow_df.insert(loc=0, column="Sentence", value=df['clean_news'])

print(bow_df.head())

                         Sentence  bjp  bumra  century  congress  election  \
0      virat scored century match    0      0        1         0         0   
1                    bjp election    1      0        0         0         1   
2       bumra took 5 wicket match    0      1        0         0         0   
3  congress form state government    0      0        0         1         0   

   form  government  match  scored  state  took  virat  wicket  
0     0           0      1       1      0     0      1       0  
1     0           0      0       0      0     0      0       0  
2     0           0      1       0      0     1      0       1  
3     1           1      0       0      1     0      0       0  


Step 5: Apply LDA (Latent Dirichlet Allocation)

In [10]:
!pip install gensim

from gensim import corpora

# Create dictionary
dictionary = corpora.Dictionary(df['lemmatized_news'])

# Create corpus (bag-of-words representation)
corpus = [dictionary.doc2bow(text) for text in df['lemmatized_news']]

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [11]:
from gensim.models import LdaModel

# Fit a 3-topic LDA model on the bag-of-words corpus (seeded via random_state)
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=42,
    passes=10,
)

# Print the five highest-weighted words of every topic
for topic_id, topic_terms in lda_model.print_topics(num_words=5):
    print(f"Topic {topic_id}: {topic_terms}")


Topic 0: 0.199*"bjp" + 0.199*"election" + 0.050*"century" + 0.050*"virat" + 0.050*"scored"
Topic 1: 0.154*"congress" + 0.154*"form" + 0.154*"state" + 0.154*"government" + 0.039*"bjp"
Topic 2: 0.171*"match" + 0.098*"bumra" + 0.098*"5" + 0.098*"wicket" + 0.098*"took"


In [12]:
from gensim.models import LdaModel

# Fit the LDA model with the same settings (3 topics, seeded, 10 passes)
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=42,
    passes=10,
)

# Fetch the top words of each topic as (word, weight) pairs instead of a formatted string
num_words = 10  # how many words per topic
topics = lda_model.show_topics(num_topics=3, num_words=num_words, formatted=False)

# One header line per topic followed by one indented line per word
for topic_id, term_weights in topics:
    report_lines = [f"\nTopic {topic_id}:"]
    report_lines.extend(f"  {term} ({score:.3f})" for term, score in term_weights)
    print("\n".join(report_lines))



Topic 0:
  bjp (0.199)
  election (0.199)
  century (0.050)
  virat (0.050)
  scored (0.050)
  match (0.050)
  took (0.050)
  bumra (0.050)
  5 (0.050)
  wicket (0.050)

Topic 1:
  congress (0.154)
  form (0.154)
  state (0.154)
  government (0.154)
  bjp (0.039)
  virat (0.039)
  election (0.039)
  century (0.039)
  match (0.039)
  scored (0.039)

Topic 2:
  match (0.171)
  bumra (0.098)
  5 (0.098)
  wicket (0.098)
  took (0.098)
  scored (0.097)
  virat (0.097)
  century (0.097)
  election (0.024)
  bjp (0.024)


# Task 2: Preprocessing and Bag-of-Words on arXiv paper summaries

In [13]:
import pandas as pd
# Read the arXiv CSV into a DataFrame and preview the first rows
arxiv_path = '/content/arxiv_data.csv'
df = pd.read_csv(arxiv_path)
print(df.head())

                                              titles  \
0  Survey on Semantic Stereo Matching / Semantic ...   
1  FUTURE-AI: Guiding Principles and Consensus Re...   
2  Enforcing Mutual Consistency of Hard Regions f...   
3  Parameter Decoupling Strategy for Semi-supervi...   
4  Background-Foreground Segmentation for Interio...   

                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                         terms  
0           ['cs.CV', 'cs.LG']  
1  ['cs.CV', 'cs.AI', 'cs.LG']  
2           ['cs.CV', 'cs.AI']  
3                    ['cs.CV']  
4           ['cs.CV', 'cs.LG']  


In [14]:
import re
import pandas as pd

# Re-read the arXiv CSV into `df` before preprocessing
source_path = '/content/arxiv_data.csv'
df = pd.read_csv(source_path)

# Regular expressions used by preprocess_text, compiled once rather than on
# every call.
# URLs are matched case-insensitively and only at a word boundary, so
# "HTTP://..." is removed while ordinary words such as "httpd" are kept.
_URL_RE = re.compile(r'\bhttps?://\S+|\bwww\.\S+', flags=re.IGNORECASE)
_HTML_TAG_RE = re.compile(r'<.*?>')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "]+", flags=re.UNICODE)
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Normalize a raw text string for tokenization.

    Steps, in order: strip URLs, HTML tags, @mentions and #hashtags,
    lowercase, strip emojis, delete every character that is not a-z, 0-9 or
    whitespace, then collapse whitespace runs to single spaces and trim.

    Args:
        text: The raw text. Any non-string value (for example NaN from an
            empty cell) is treated as missing.

    Returns:
        The cleaned string, or "" when `text` is not a string.

    Notes:
        - Punctuation is deleted, not replaced by a space, so hyphenated
          words are fused ("semi-supervised" -> "semisupervised").
        - The HTML pattern removes everything between a "<" and the next ">"
          on the same line, including non-tag text such as "a < b > c".
        - Hashtags and mentions are dropped together with their word.
    """
    if not isinstance(text, str):
        return ""

    # Markup-like fragments go first, while their delimiters are still present
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)

    # Lowercase before the a-z filter so uppercase letters are not discarded
    text = text.lower()

    text = _EMOJI_RE.sub('', text)

    # Keep only lowercase letters, digits and whitespace
    text = _NON_ALNUM_RE.sub('', text)

    # Collapse whitespace left behind by the removals above
    return _WHITESPACE_RE.sub(' ', text).strip()

# Clean every entry of the 'summaries' column and keep the result next to the original
df['processed_news'] = df['summaries'].map(preprocess_text)

# Show raw and cleaned text side by side
preview_columns = ['summaries', 'processed_news']
print(df[preview_columns].head())

                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                                      processed_news  
0  stereo matching is one of the widely used tech...  
1  the recent advancements in artificial intellig...  
2  in this paper we proposed a novel mutual consi...  
3  consistency training has proven to be an advan...  
4  to ensure safety in automated driving the corr...  


In [15]:
import nltk
from nltk.tokenize import word_tokenize

# Download the tokenizer resources used by word_tokenize
for resource in ('punkt', 'punkt_tab'):  # punkt_tab is required in newer NLTK versions
    nltk.download(resource)

# Split each cleaned summary into a list of word tokens
df['tokenized_news'] = df['processed_news'].apply(word_tokenize)
print(df[['processed_news', 'tokenized_news']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                      processed_news  \
0  stereo matching is one of the widely used tech...   
1  the recent advancements in artificial intellig...   
2  in this paper we proposed a novel mutual consi...   
3  consistency training has proven to be an advan...   
4  to ensure safety in automated driving the corr...   

                                      tokenized_news  
0  [stereo, matching, is, one, of, the, widely, u...  
1  [the, recent, advancements, in, artificial, in...  
2  [in, this, paper, we, proposed, a, novel, mutu...  
3  [consistency, training, has, proven, to, be, a...  
4  [to, ensure, safety, in, automated, driving, t...  


In [16]:
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus so stopwords.words() can read it
nltk.download('stopwords')

# English stopwords held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not a stopword
df['filtered_news'] = df['tokenized_news'].map(
    lambda words: list(filter(lambda word: word.lower() not in stop_words, words))
)

# Compare token lists before and after filtering
print(df[['tokenized_news', 'filtered_news']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                      tokenized_news  \
0  [stereo, matching, is, one, of, the, widely, u...   
1  [the, recent, advancements, in, artificial, in...   
2  [in, this, paper, we, proposed, a, novel, mutu...   
3  [consistency, training, has, proven, to, be, a...   
4  [to, ensure, safety, in, automated, driving, t...   

                                       filtered_news  
0  [stereo, matching, one, widely, used, techniqu...  
1  [recent, advancements, artificial, intelligenc...  
2  [paper, proposed, novel, mutual, consistency, ...  
3  [consistency, training, proven, advanced, semi...  
4  [ensure, safety, automated, driving, correct, ...  


In [17]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the corpora the WordNet lemmatizer reads from
for corpus_name in ('wordnet', 'omw-1.4'):  # omw-1.4 is optional, improves lemmatization coverage
    nltk.download(corpus_name)

lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every filtered token list.
# NOTE(review): lemmatize() is called without a part-of-speech argument; in the
# preview, "proposed" comes back unchanged while "advancements" is reduced.
# Consider POS-aware lemmatization if verb forms matter.
df['lemmatized_news'] = df['filtered_news'].map(
    lambda words: list(map(lemmatizer.lemmatize, words))
)

# Compare tokens before and after lemmatization
print(df[['filtered_news', 'lemmatized_news']].head())

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                       filtered_news  \
0  [stereo, matching, one, widely, used, techniqu...   
1  [recent, advancements, artificial, intelligenc...   
2  [paper, proposed, novel, mutual, consistency, ...   
3  [consistency, training, proven, advanced, semi...   
4  [ensure, safety, automated, driving, correct, ...   

                                     lemmatized_news  
0  [stereo, matching, one, widely, used, techniqu...  
1  [recent, advancement, artificial, intelligence...  
2  [paper, proposed, novel, mutual, consistency, ...  
3  [consistency, training, proven, advanced, semi...  
4  [ensure, safety, automated, driving, correct, ...  


In [18]:
# Glue each lemmatized token list back together with single spaces
df['clean_news'] = [' '.join(tokens) for tokens in df['lemmatized_news']]

# Show the token lists next to the rebuilt strings
print(df[['lemmatized_news', 'clean_news']].head())

                                     lemmatized_news  \
0  [stereo, matching, one, widely, used, techniqu...   
1  [recent, advancement, artificial, intelligence...   
2  [paper, proposed, novel, mutual, consistency, ...   
3  [consistency, training, proven, advanced, semi...   
4  [ensure, safety, automated, driving, correct, ...   

                                          clean_news  
0  stereo matching one widely used technique infe...  
1  recent advancement artificial intelligence ai ...  
2  paper proposed novel mutual consistency networ...  
3  consistency training proven advanced semisuper...  
4  ensure safety automated driving correct percep...  


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the cleaned summaries; the result stays a sparse matrix
bow_matrix = vectorizer.fit_transform(df['clean_news'])

# The full matrix is deliberately NOT converted with toarray(): a dense copy
# needs n_documents x n_terms integers, which can exhaust memory on a corpus of
# free-text summaries, and printing it in full is unreadable anyway.
# Only a small slice is densified for display; downstream code should use the
# sparse `bow_matrix`.
PREVIEW_ROWS = 5
PREVIEW_TERMS = 20
vocabulary = vectorizer.get_feature_names_out()
bow_preview = bow_matrix[:PREVIEW_ROWS].toarray()

# Show matrix size and a sample of the vocabulary
print("Bag-of-Words shape (documents, terms):", bow_matrix.shape)
print(f"Vocabulary (first {PREVIEW_TERMS} of {len(vocabulary)}):", vocabulary[:PREVIEW_TERMS])

# Show the first rows of the Bag-of-Words matrix
print(f"Bag-of-Words Matrix (first {PREVIEW_ROWS} rows):\n", bow_preview)