# Latent Semantic Indexing/Analysis (LSI/LSA)

## Import the Libraries

In [2]:
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

## Load Data

In [4]:
ls

[0m[01;34msample_data[0m/


In [None]:
# Read the corpus: each line of articles.txt becomes one document, with
# leading/trailing whitespace (including the newline) stripped.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale.
with open("articles.txt", "r", encoding="utf-8") as fin:
  documents_list = [line.strip() for line in fin]


In [None]:
# Preview only the first few documents instead of dumping the whole corpus
# into the cell output.
documents_list[:5]

## Preprocessing and Extraction of TF-IDF Features

In [5]:
# Tokenizer built from the pattern \w+ (one or more word characters), so
# punctuation and whitespace never end up inside a token.
tokenizer = RegexpTokenizer(r'\w+')

# Backward-compatible alias: the vectorizer cell below still refers to the
# original (misspelled) name.
tonekizer = tokenizer

## Vectorize the documents using TF-IDF

In [8]:
# TF-IDF vectorizer: lowercases the text, removes English stop words and
# uses unigrams only. Tokenization is delegated to the regexp tokenizer
# defined above, so the vectorizer's own token_pattern is never used;
# passing token_pattern=None makes that explicit and avoids scikit-learn's
# "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=tonekizer.tokenize,
    token_pattern=None,
)

# Learn the vocabulary / IDF weights and build the document-term matrix.
train_data = tfidf.fit_transform(documents_list)

## Perform Topic Modeling

In [None]:
# Number of latent topics (SVD components) to extract
num_components = 10

# Truncated SVD over the TF-IDF matrix; random_state is fixed so repeated
# runs give the same decomposition.
lsa = TruncatedSVD(n_components=num_components,
                   n_iter=100,
                   random_state=42)

# Fit the model and keep the transformed output (one row per document, one
# column per component) instead of discarding it.
doc_topic_matrix = lsa.fit_transform(train_data)

# Show only the shape rather than dumping the full dense matrix.
doc_topic_matrix.shape


## Extract topics and terms

In [None]:
# Print each topic together with its highest-weighted terms.

# Vocabulary terms, aligned by position with the columns of lsa.components_
terms = tfidf.get_feature_names_out()

# How many terms to show per topic
top_n_terms = 5

for index, component in enumerate(lsa.components_):
  # Pair every term with its weight in this component and rank by weight,
  # largest first. (The original assigned `zipperd` but read `zipped`,
  # which raised a NameError.)
  ranked_terms = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)
  top_terms_list = [term for term, _weight in ranked_terms[:top_n_terms]]
  print("Topic " + str(index) + ":", top_terms_list)
