In [1]:
#TO DROP UNNEEDED COLUMNS FROM DATASET
import pandas as pd

# Load the dataset
file_path = "C:\\Users\\hp\\Documents\\NLP\\Reviews.csv"
data = pd.read_csv(file_path)

# Drop specified columns
columns_to_drop = ['UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time']
data = data.drop(columns=columns_to_drop)

# Save the modified dataset to a new CSV file
new_file_path = "C:\\Users\\hp\\Documents\\NLP\\Reviews_modified.csv"
data.to_csv(new_file_path, index=False)

print("Columns dropped and modified dataset saved to Reviews_modified.csv")

Columns dropped and modified dataset saved to Reviews_modified.csv


In [2]:
# TO CREATE AND SAVE INVERTED INDEX USING TF-IDF
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Load the modified dataset
file_path = "C:\\Users\\hp\\Documents\\NLP\\test100000.csv"
data = pd.read_csv(file_path)

# Preprocess the 'Text' column for TF-IDF
text_data = data['Text'].astype(str).tolist()

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Get feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Construct Inverted Index
inverted_index = {}
for doc_id, doc in enumerate(text_data):
    terms = tfidf_vectorizer.transform([doc])
    feature_index = terms.indices
    for idx in feature_index:
        term = feature_names[idx]
        if term not in inverted_index:
            inverted_index[term] = []
        inverted_index[term].append((doc_id, tfidf_matrix[doc_id, idx]))

# Save the inverted index to a file using pickle
with open('C:\\Users\\hp\\Documents\\NLP\\inverted_index.pkl', 'wb') as file:
    pickle.dump(inverted_index, file)


In [24]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


True

In [8]:
# RETREIVAL OF DOCUMENT TEXT USING INDEX
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter

# Load the modified dataset
file_path = "C:\\Users\\hp\\Documents\\NLP\\test100000.csv"
data = pd.read_csv(file_path)

# Load the inverted index from the file
with open('C:\\Users\\hp\\Documents\\NLP\\inverted_index.pkl', 'rb') as file:
    inverted_index = pickle.load(file)

# Function to retrieve top 3 relevant documents for a query using the loaded inverted index
# Function to retrieve top 3 relevant documents for a query using the loaded inverted index
def retrieve_top_documents(query, data, inverted_index, tfidf_vectorizer, tfidf_matrix):
    # Preprocess the query
    tokens = word_tokenize(query.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]

    # Get documents containing each term from the query in the inverted index
    doc_ids_with_terms = []
    for token in tokens:
        if token in inverted_index:
            doc_ids_with_terms.extend([doc[0] for doc in inverted_index[token]])

    # Remove duplicates to keep unique document IDs
    doc_ids_with_terms = list(set(doc_ids_with_terms))

    # Calculate cosine similarity between query and documents containing query terms
    query_vector = tfidf_vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix[doc_ids_with_terms])

    # Get top 3 document indices with highest similarity scores
    top_indices = similarity_scores.argsort(axis=1)[0][-3:][::-1]

    # Get IDs, texts, and product IDs of top 3 relevant documents
    top_documents = []
    for idx in top_indices:
        doc_id = doc_ids_with_terms[idx]
        document = {
            'ID': data.iloc[doc_id]['Id'],
            'Text': data.iloc[doc_id]['Text'],
            'Product Id': data.iloc[doc_id]['ProductId'],
            'Similarity Score': similarity_scores[0][idx]
        }
        top_documents.append(document)
    
    return top_documents

# Example: Retrieve top 3 relevant documents for a query using the loaded inverted index
query_to_search = input("Enter your query: ")

# Use the existing TF-IDF vectorizer and matrix created during index creation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Text'])

relevant_docs = retrieve_top_documents(query_to_search, data, inverted_index, tfidf_vectorizer, tfidf_matrix)

# Display the top 3 relevant documents with IDs, similarity scores, and text
print(f"Top 3 Relevant Documents for Query '{query_to_search}':")
for doc in relevant_docs:
    print(f"ID: {doc['ID']}, Similarity Score: {doc['Similarity Score']}, Product Id:{doc['Product Id']}")
    print(f"Text: {doc['Text']}\n")


Enter your query: cat
Top 3 Relevant Documents for Query 'cat':
ID: 1141, Similarity Score: 0.5964083387300386, Product Id:B002OHOC6A
Text: My two cats love this cat food more than the kirkland signature brand.  It's also very cheap and affordable cat food at walmart. I would recommend this cat food over meow mix or friskies any day.  Get 9lives dry cat food.  It's one of the better brands out there.

ID: 1134, Similarity Score: 0.5428559458430748, Product Id:B002OHOC6A
Text: my cats favorite cat food he is very pickey and will not eat a lot of different cat foods but he loves this one

ID: 2943, Similarity Score: 0.377433420977121, Product Id:B0002TJAZK
Text: This is the perfect food for my cat & in fact, the only one she will eat - wet or dry.



In [1]:
import pandas as pd
import tkinter as tk
from tkinter import ttk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # Import VADER
import numpy as np
from collections import defaultdict
import time  # Import the time module


# Load the modified dataset (change the file_path accordingly)
file_path = "C:\\Users\\hp\\Documents\\NLP\\test100000.csv"
data = pd.read_csv(file_path)

# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Function to preprocess text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
    return tokens

# Function to generate extractive summary using TextRank
def generate_summary(text, num_sentences=1):
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())  # Tokenize and convert to lowercase

    # Calculate word frequency
    word_freq = Counter(words)

    # Find the most frequent term
    most_frequent_term = word_freq.most_common(1)[0][0]

    # Find the first sentence containing the most frequent term
    summary = next((sentence for sentence in sentences if most_frequent_term in word_tokenize(sentence.lower())), None)

    return summary if summary else sentences[0]  # Return the summary or default to the first sentence if not found


# Function to retrieve top documents and generate extractive summary with sentiment analysis
def retrieve_top_documents_with_summary(query):
    tokens = word_tokenize(query.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]

    # Get documents containing each term from the query in the inverted index
    doc_ids_with_terms = []
    for token in tokens:
        if token in inverted_index:
            doc_ids_with_terms.extend([doc[0] for doc in inverted_index[token]])

    # Remove duplicates to keep unique document IDs
    doc_ids_with_terms = list(set(doc_ids_with_terms))

    # Check if no relevant documents are found
    if not doc_ids_with_terms:
        return []  # Return an empty list if no relevant documents are found

    # Calculate cosine similarity between query and documents containing query terms
    query_vector = tfidf_vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix[doc_ids_with_terms])

    # Get top 5 document indices with highest similarity scores
    top_indices = similarity_scores.argsort(axis=1)[0][-5:][::-1]

    # Get IDs, texts, and product IDs of top 5 relevant documents along with summary and sentiment
    top_documents = []
    for idx in top_indices:
        doc_id = doc_ids_with_terms[idx]
        document_text = data.iloc[doc_id]['Text']

        # Calculate sentiment score for the document text
        sentiment_score = sid.polarity_scores(document_text)['compound']
        
        # Determine sentiment label based on the sentiment score
        sentiment_label = 'positive' if sentiment_score >= 0 else 'negative'

        document = {
            'ID': data.iloc[doc_id]['Id'],
            'Text': document_text,
            'Product ID': data.iloc[doc_id]['ProductId'],
            'Similarity Score': similarity_scores[0][idx],
            'Extractive Summary': generate_summary(document_text),
            'Sentiment': sentiment_label  # Add sentiment label to the document
        }
        top_documents.append(document)
    
    return top_documents


# Tkinter GUI
def retrieve_and_display_summary():
    start_time = time.time()  # Record start time
    query = query_entry.get()
    results = retrieve_top_documents_with_summary(query)
    
    # Check if no relevant documents are found
    if not results:
        tree_view.delete(*tree_view.get_children())  # Clear existing table
        time_label.config(text="No documents found")  # Display message in the time_label
        return  # Exit the function if no results are found
    
    populate_table(results)
    end_time = time.time()  # Record end time
    time_taken = int((end_time - start_time) * 1000)  # Calculate time taken in milliseconds
    time_label.config(text=f"Results fetched in {time_taken}ms")  # Display time taken in milliseconds


def populate_table(results):
    for row in tree_view.get_children():
        tree_view.delete(row)

    for idx, doc in enumerate(results):
        tree_view.insert("", "end", values=(doc['ID'], doc['Similarity Score'], doc['Product ID'], doc['Extractive Summary'], doc['Sentiment']))

root = tk.Tk()
root.title("Document Retrieval")
query_frame = ttk.Frame(root)
query_frame.pack(padx=10, pady=10, fill='x')

query_label = ttk.Label(query_frame, text="Enter query:")
query_label.pack(side='left')

query_entry = ttk.Entry(query_frame)
query_entry.pack(side='left', padx=5)

search_button = ttk.Button(root, text="Search", command=retrieve_and_display_summary)
search_button.pack(padx=10, pady=5)

columns = ('ID', 'SIMILARITY SCORE', 'PRODUCT ID', 'EXTRACTIVE SUMMARY', 'SENTIMENT')
tree_view = ttk.Treeview(root, columns=columns, show='headings')
for col in columns:
    tree_view.heading(col, text=col)
    tree_view.column(col, width=150)  # Adjust the width of columns as needed
tree_view.pack(padx=10, pady=10, expand=True, fill='both')  # Expand the Treeview widget to fill available space

# Adding a label to display the time taken
time_label = ttk.Label(root)
time_label.pack(padx=10, pady=5)

root.mainloop()


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\tkinter\__init__.py", line 1892, in __call__
    return self.func(*args)
  File "C:\Users\hp\AppData\Local\Temp\ipykernel_2248\4284988443.py", line 100, in retrieve_and_display_summary
    results = retrieve_top_documents_with_summary(query)
  File "C:\Users\hp\AppData\Local\Temp\ipykernel_2248\4284988443.py", line 49, in retrieve_top_documents_with_summary
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
  File "C:\Users\hp\AppData\Local\Temp\ipykernel_2248\4284988443.py", line 49, in <listcomp>
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
NameError: name 'string' is not defined


In [3]:
import pandas as pd
import tkinter as tk
from tkinter import ttk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
import string
import pickle
import time

# Load the modified dataset
file_path = "C:\\Users\\hp\\Documents\\NLP\\test100000.csv"
data = pd.read_csv(file_path)

# Load the inverted index from the file
with open('C:\\Users\\hp\\Documents\\NLP\\inverted_index.pkl', 'rb') as file:
    inverted_index = pickle.load(file)

# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Text'])

# Function to preprocess text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
    return tokens

# Function to generate extractive summary using TextRank
def generate_summary(text, num_sentences=1):
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())  # Tokenize and convert to lowercase

    # Calculate word frequency
    word_freq = Counter(words)

    # Find the most frequent term
    most_frequent_term = word_freq.most_common(1)[0][0]

    # Find the first sentence containing the most frequent term
    summary = next((sentence for sentence in sentences if most_frequent_term in word_tokenize(sentence.lower())), None)

    return summary if summary else sentences[0]  # Return the summary or default to the first sentence if not found

# Function to retrieve top documents and generate extractive summary with sentiment analysis
def retrieve_top_documents_with_summary(query):
    tokens = word_tokenize(query.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]

    # Get documents containing each term from the query in the inverted index
    doc_ids_with_terms = []
    for token in tokens:
        if token in inverted_index:
            doc_ids_with_terms.extend([doc[0] for doc in inverted_index[token]])

    # Remove duplicates to keep unique document IDs
    doc_ids_with_terms = list(set(doc_ids_with_terms))

    # Check if no relevant documents are found
    if not doc_ids_with_terms:
        return []  # Return an empty list if no relevant documents are found

    # Calculate cosine similarity between query and documents containing query terms
    query_vector = tfidf_vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix[doc_ids_with_terms])

    # Get top 5 document indices with highest similarity scores
    top_indices = similarity_scores.argsort(axis=1)[0][-5:][::-1]

    # Get IDs, texts, and product IDs of top 5 relevant documents along with summary and sentiment
    top_documents = []
    for idx in top_indices:
        doc_id = doc_ids_with_terms[idx]
        document_text = data.iloc[doc_id]['Text']

        # Calculate sentiment score for the document text
        sentiment_score = sid.polarity_scores(document_text)['compound']
        
        # Determine sentiment label based on the sentiment score
        sentiment_label = 'positive' if sentiment_score >= 0 else 'negative'

        document = {
            'ID': data.iloc[doc_id]['Id'],
            'Text': document_text,
            'Product ID': data.iloc[doc_id]['ProductId'],
            'Similarity Score': similarity_scores[0][idx],
            'Extractive Summary': generate_summary(document_text),
            'Sentiment': sentiment_label  # Add sentiment label to the document
        }
        top_documents.append(document)
    
    return top_documents

# Tkinter GUI
def retrieve_and_display_summary():
    start_time = time.time()  # Record start time
    query = query_entry.get()
    results = retrieve_top_documents_with_summary(query)
    
    # Clear existing table
    for row in tree_view.get_children():
        tree_view.delete(row)

    # Check if no relevant documents are found
    if not results:
        time_label.config(text="No documents found")  # Display message in the time_label
        return  # Exit the function if no results are found
    
    for idx, doc in enumerate(results):
        tree_view.insert("", "end", values=(doc['ID'], doc['Similarity Score'], doc['Product ID'], doc['Extractive Summary'], doc['Sentiment']))

    time_label.config(text=f"Results fetched")  # Display time taken in milliseconds
    end_time = time.time()  # Record end time
    time_taken = int((end_time - start_time) * 1000)  # Calculate time taken in milliseconds
    time_label.config(text=f"Results fetched in {time_taken}ms")  # Display time taken in milliseconds

root = tk.Tk()
root.title("Document Retrieval")

query_frame = ttk.Frame(root)
query_frame.pack(padx=10, pady=10, fill='x')

query_label = ttk.Label(query_frame, text="Enter query:")
query_label.pack(side='left')

query_entry = ttk.Entry(query_frame)
query_entry.pack(side='left', padx=5)

search_button = ttk.Button(root, text="Search", command=retrieve_and_display_summary)
search_button.pack(padx=10, pady=5)

columns = ('ID', 'SIMILARITY SCORE', 'PRODUCT ID', 'EXTRACTIVE SUMMARY', 'SENTIMENT')
tree_view = ttk.Treeview(root, columns=columns, show='headings')
for col in columns:
    tree_view.heading(col, text=col)
    tree_view.column(col, width=150)  # Adjust the width of columns as needed
tree_view.pack(padx=10, pady=10, expand=True, fill='both')  # Expand the Treeview widget to fill available space

# Adding a label to display the time taken
time_label = ttk.Label(root)
time_label.pack(padx=10, pady=5)

root.mainloop()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\hp\\Documents\\NLP\\test5000.csv'