In [None]:
# Author : Shubh Garg
# Github : https://github.com/shubhhub

In [2]:
# Latent Semantic Analysis uses the following steps:
# Step 1: Preprocessing of text data, tokenizing and all
# Step 2: Formation of term document matrix, tf-idf vectorization 
# Step 3: Applying Singular Value Decomposition to factorize tfidf vector
# Step 4: Creating the summary from the highest-scoring sentences

In [3]:
# Install all the necessary python libraries
!pip install numpy scikit-learn

print("Installation Complete")

Installation Complete


In [4]:
# Import all the necessary libraries
import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score
from collections import Counter

print("All the necessary libraries have been imported successfully")

All the necessary libraries have been imported successfully


In [5]:
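# Preprocessing the input text
# preprocess_text returns a list of lower-cased sentence strings (the text is split on full stops);
# splitting each sentence into word tokens is done separately by tokenize_document
# Example:
# If my sentence is "Hi, my name is Shubh Garg. I am currently doing my BTech from JIIT."
# Then the returned sentences include "hi my name is shubh garg" and " i am currently doing my btech from jiit"
# Note the space before 'i' in the 2nd sentence: the text is split on the full stop and the pieces are not stripped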

def is_alphanumeric(ch):
    """Return True if ``ch`` is an ASCII letter (a-z, A-Z) or an ASCII digit (0-9)."""
    in_lowercase_range = 'a' <= ch <= 'z'
    in_uppercase_range = 'A' <= ch <= 'Z'
    in_digit_range = '0' <= ch <= '9'
    return in_lowercase_range or in_uppercase_range or in_digit_range

def is_space_or_underscore(ch):
    """Return True if ``ch`` is a blank space or an underscore."""
    return ch in (' ', '_')

def is_fullstop(ch):
    """Return True if ``ch`` is the full stop character ('.')."""
    full_stop = '.'
    return ch == full_stop

def is_valid_char(ch):
    """Return True if ``ch`` is a character that should be kept in the text.

    A character is kept when it is alphanumeric, a space/underscore, or a full stop.
    Note: to allow other characters (for example a hyphen), write another predicate
    such as ``is_hyphen()`` and add it to the tuple of checks below.
    """
    checks = (is_alphanumeric, is_space_or_underscore, is_fullstop)
    # any() stops at the first predicate that accepts the character
    return any(check(ch) for check in checks)

def is_upper_to_lower(ch):
    """Return the lower-case form of an ASCII upper-case letter; any other character is returned unchanged."""
    if not ('A' <= ch <= 'Z'):
        return ch
    # ord() gives the character's code point and chr() converts a code point back to a character;
    # lower-case ASCII letters sit a fixed distance after their upper-case counterparts
    case_offset = ord('a') - ord('A')
    return chr(ord(ch) + case_offset)

def preprocess_text(text):
    """Clean ``text`` and split it into sentences on full stops.

    Every character rejected by ``is_valid_char`` is removed and upper-case
    letters are converted with ``is_upper_to_lower``. The cleaned text is then
    split on '.'.

    Returns a list of sentence strings. Sentences are not stripped, so a
    sentence that followed a full stop keeps its leading space. Pieces that are
    empty or contain only whitespace (for example the piece after the final
    full stop) are dropped, because they are not sentences and would otherwise
    become blank entries downstream.
    """
    # Build the cleaned text in one pass; join avoids repeated string concatenation
    cleaned_text = "".join(is_upper_to_lower(ch) for ch in text if is_valid_char(ch))

    # Split into sentences on the full stop, skipping blank pieces
    return [sentence for sentence in cleaned_text.split('.') if sentence.strip()]

def tokenize_document(sentences):
    """Split each sentence into lower-cased word tokens.

    Each sentence is lower-cased and split on whitespace. Single-character
    words such as 'a' or 'i' are dropped as they don't give much value.
    Example: ["hi my name is shubh garg", " i am currently doing my btech from jiit"]
    becomes [["hi", "my", "name", "is", "shubh", "garg"],
             ["am", "currently", "doing", "my", "btech", "from", "jiit"]]
    (the 'i' was dropped).

    Note: stopwords could also be removed here as they also don't add much value.

    Returns a list with one list of tokens per input sentence.
    """
    tokenized_documents = []
    for sentence in sentences:
        kept_words = []
        for word in sentence.lower().split():
            if len(word) > 1:
                kept_words.append(word)
        tokenized_documents.append(kept_words)
    return tokenized_documents

print("Preprocessing functions ready to use")

Preprocessing functions ready to use


In [6]:
def calculate_tf(word, document):
    """Return the term frequency of ``word`` in ``document``.

    TF = (number of times the word is present in the document) / (total number
    of words in the document). An empty document gives 0.
    """
    occurrences = document.count(word)
    document_length = len(document)
    if document_length > 0:
        return occurrences / document_length
    return 0

def calculate_idf(word, documents):
    """Return the smoothed inverse document frequency of ``word``.

    Computed as 1 + ln((N + 1) / (1 + df)), where N is the total number of
    documents (sentences) and df is the number of documents containing the word.
    The 1 added inside the division covers the edge case df = 0, and the 1 added
    to the whole expression deals with negative values of the log.
    """
    documents_with_word = 0
    for document in documents:
        if word in document:
            documents_with_word += 1

    smoothed_ratio = (len(documents) + 1) / (1 + documents_with_word)
    return 1 + math.log(smoothed_ratio)

def create_term_document_matrix(tokenized_documents):
    """Build the TF-IDF matrix for a list of tokenized documents.

    ``tokenized_documents`` is a list of token lists (one per sentence).

    Returns a list of rows, one per document. Each row holds the TF-IDF value
    (``calculate_tf`` * ``calculate_idf``) of every unique word, with the
    columns ordered by the sorted unique words.
    """
    # Get unique words, sorted so the column order is stable
    unique_words = sorted({word for document in tokenized_documents for word in document})

    # The IDF of a word does not depend on which row is being built,
    # so compute it once per word instead of once per (document, word) pair
    idf_values = [calculate_idf(word, tokenized_documents) for word in unique_words]

    # Calculate TF-IDF values
    term_document_matrix = []
    for document in tokenized_documents:
        tf_idf_values = [
            calculate_tf(word, document) * idf
            for word, idf in zip(unique_words, idf_values)
        ]
        term_document_matrix.append(tf_idf_values)
    return term_document_matrix

print("TDM functions are ready to use")

TDM functions are ready to use


In [7]:
# LSA on the TDM is a pretty standard process
# Use SVD (Singular Value Decomposition) from the linear algebra module provided by numpy
# Computing the SVD by hand requires eigenvalues and eigenvectors; numpy's linear algebra routines do that work for us
def apply_lsa(term_document_matrix, k):
    """Return the rank-``k`` reconstruction of ``term_document_matrix``.

    The matrix is factorized with Singular Value Decomposition, only the
    first ``k`` singular values and their vectors are kept, and the three
    truncated factors are multiplied back together.

    Returns an array with the same shape as the input matrix.
    """
    left_vectors, singular_values, right_vectors_t = np.linalg.svd(
        term_document_matrix, full_matrices=False
    )

    # Truncate every factor to the top k components
    top_left = left_vectors[:, :k]
    top_sigma = np.diag(singular_values[:k])
    top_right_t = right_vectors_t[:k, :]

    # Recombine: U_k (S_k Vt_k)
    return top_left @ (top_sigma @ top_right_t)

print("LSA function is ready to use")

LSA function is ready to use


In [8]:
def compute_sentence_scores(reduced_term_document_matrix):
    """Score each sentence (row) by the dot product of the row with itself.

    This is the squared Euclidean length of each row of the matrix, i.e. the
    diagonal of ``M @ M.T``. It is computed row by row, so the full
    sentence-by-sentence product matrix is never built.
    Note: replace this with your own scoring function if you want a different
    measure (for example cosine similarity against some reference vector).

    ``reduced_term_document_matrix`` is a 2-D array (or array-like) with one
    row per sentence. Returns a 1-D array with one score per row.
    """
    matrix = np.asarray(reduced_term_document_matrix)
    # 'ij,ij->i' multiplies the matrix element-wise with itself and sums each row
    sentence_scores = np.einsum('ij,ij->i', matrix, matrix)
    return sentence_scores


# Select the highest-scoring sentences, then combine them into the summary
def select_top_sentences(sentence_scores, num_sentences):
    """Return the indices of the ``num_sentences`` highest-scoring sentences.

    The indices are ordered from the highest score to the lowest.
    """
    # argsort is ascending, so reverse it before taking the first num_sentences indices
    return np.argsort(sentence_scores)[::-1][:num_sentences]

def generate_summary(sentences, top_sentence_indices):
    """Build the summary by concatenating the selected sentences.

    The sentences are taken in the order given by ``top_sentence_indices`` and
    each one is followed by '. '. Returns an empty string when no index is given.
    """
    return ''.join(sentences[index] + '. ' for index in top_sentence_indices)
    
print("Functions to find best sentences and generate summart ready to use")

Functions to find best sentences and generate summart ready to use


In [18]:
def main(k=10):
    """Interactively summarize a text with Latent Semantic Analysis.

    Prompts for the text and for the number of summary sentences, runs the
    preprocessing -> TF-IDF matrix -> LSA -> scoring steps, and prints the
    selected sentences.

    ``k`` is the number of top singular values and vectors kept to obtain the
    reduced term-document matrix (defaults to 10, you can change it).

    Nothing is returned. If the text has no usable words, or the requested
    number of sentences is not a whole number, a message is printed and the
    function stops early.
    """
    # Load the text
    # Sample inputs for quick experiments:
    # text = "Hi, my name is Shubh Garg. I am currently doing my BTech from JIIT."
    # text = "Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that typically require human intelligence. These tasks include learning, reasoning, problem-solving, perception, language understanding, and even decision-making. AI systems are designed to emulate human cognitive functions, utilizing algorithms and data to analyze patterns and make informed decisions. There are two main types of AI: narrow or weak AI, which is designed for a specific task, and general or strong AI, which aims to replicate human cognitive abilities across various domains. Machine learning is a subset of AI that involves training algorithms to recognize patterns and make predictions based on data. Natural Language Processing (NLP) is another essential component of AI, enabling machines to understand, interpret, and generate human-like language. AI applications are widespread and impact various industries, including healthcare, finance, transportation, and entertainment. The ethical implications of AI, such as bias in algorithms and potential job displacement, have also become important considerations. As AI continues to advance, researchers are exploring ways to ensure responsible and transparent development, addressing concerns about privacy, security, and the societal impact of these technologies. The quest for achieving artificial general intelligence, where machines can perform any intellectual task that a human can, remains a long-term goal in the field of AI."
    text = input("Enter your text: ")

    # Print the text
    print("\nText:")
    print(text)

    # Preprocess the text
    sentences = preprocess_text(text)
    # The sentences are tokenized separately because the untokenized
    # 'sentences' are still needed later to build the summary
    tokenized_document = tokenize_document(sentences)

    # Sentences that still contain at least one token after preprocessing
    usable_sentences = sum(1 for tokens in tokenized_document if tokens)
    if usable_sentences == 0:
        print("\nNo usable sentences found in the text; nothing to summarize.")
        return

    # Create the term-document matrix
    term_document_matrix = create_term_document_matrix(tokenized_document)

    # Apply LSA
    reduced_term_document_matrix = apply_lsa(term_document_matrix, k)

    # Select the top sentences for the summary
    requested = input("\nHow many sentences do you want in your summary: ")
    try:
        num_sentences = int(requested)
    except ValueError:
        print("\nPlease enter a whole number of sentences.")
        return
    # Keep the request between 0 and the number of sentences that have content,
    # so a too-large or negative number cannot produce blank or unexpected entries
    num_sentences = max(0, min(num_sentences, usable_sentences))

    sentence_scores = compute_sentence_scores(reduced_term_document_matrix)
    top_sentence_indices = select_top_sentences(sentence_scores, num_sentences)
    summary = generate_summary(sentences, top_sentence_indices)

    # Print the summary
    print("\nSummary:")
    print(summary)

In [19]:
# Run the interactive summarizer only when this code is executed directly (not when it is imported)
if __name__ == "__main__":
    main()

Enter your text:  Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that typically require human intelligence. These tasks include learning, reasoning, problem-solving, perception, language understanding, and even decision-making. AI systems are designed to emulate human cognitive functions, utilizing algorithms and data to analyze patterns and make informed decisions. There are two main types of AI: narrow or weak AI, which is designed for a specific task, and general or strong AI, which aims to replicate human cognitive abilities across various domains. Machine learning is a subset of AI that involves training algorithms to recognize patterns and make predictions based on data. Natural Language Processing (NLP) is another essential component of AI, enabling machines to understand, interpret, and generate human-like language. AI applications are widespread and impact various industries, including healthcare, finance, transportation, and 


Text:
Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that typically require human intelligence. These tasks include learning, reasoning, problem-solving, perception, language understanding, and even decision-making. AI systems are designed to emulate human cognitive functions, utilizing algorithms and data to analyze patterns and make informed decisions. There are two main types of AI: narrow or weak AI, which is designed for a specific task, and general or strong AI, which aims to replicate human cognitive abilities across various domains. Machine learning is a subset of AI that involves training algorithms to recognize patterns and make predictions based on data. Natural Language Processing (NLP) is another essential component of AI, enabling machines to understand, interpret, and generate human-like language. AI applications are widespread and impact various industries, including healthcare, finance, transportation, and entertainme


How many sentences do you want in your summary:  3



Summary:
 these tasks include learning reasoning problemsolving perception language understanding and even decisionmaking.  ai applications are widespread and impact various industries including healthcare finance transportation and entertainment.  natural language processing nlp is another essential component of ai enabling machines to understand interpret and generate humanlike language. 
