# In [3]:
import pickle
import pandas as pd
# Load the modified dataset into a DataFrame.
# The code further down reads its 'Text', 'Id' and 'ProductId' columns and
# addresses rows by position (data.iloc[...]).
# NOTE(review): absolute, machine-specific Windows path; adjust it when running
# on another machine.
file_path = "C:\\Users\\hp\\Documents\\NLP\\test100000.csv"
data = pd.read_csv(file_path)



# In [4]:
# Load the inverted index from the file.
# The search code looks tokens up with `token in inverted_index` and reads
# `doc[0]` of every entry in `inverted_index[token]` as a document row number.
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# load index files that were produced locally by a trusted process.
with open('C:\\Users\\hp\\Documents\\NLP\\inverted_index.pkl', 'rb') as file:
    inverted_index = pickle.load(file)

# In [15]:
import pandas as pd
import tkinter as tk
from tkinter import ttk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
import string
import pickle
import time



# Initialize VADER sentiment analyzer.
# Created once at module level and reused for every search; its 'compound'
# score is read in retrieve_top_documents_with_summary.
sid = SentimentIntensityAnalyzer()

# TF-IDF vectorization.
# The vectorizer is fitted once over the whole 'Text' column so that a query can
# later be projected into the same vocabulary with tfidf_vectorizer.transform().
# tfidf_matrix holds one row per row of `data`, in the same order.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Text'])

# Function to preprocess text
def preprocess_text(text):
    """Return the lowercase content words of *text*.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``; a token
    is kept only if it is purely alphabetic and is not an English stop word.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for word in word_tokenize(text.lower()):
        if word.isalpha() and word not in english_stops:
            kept.append(word)
    return kept

# Function to generate an extractive summary (term-frequency heuristic, not TextRank)
def generate_summary(text, num_sentences=1):
    """Return the first sentence(s) of *text* that mention its most frequent term.

    The most frequent term is chosen among the content words returned by
    ``preprocess_text`` (alphabetic, non-stop-word tokens). Counting every raw
    token would almost always pick punctuation or a stop word, which makes the
    summary degenerate to the first sentence of the document.

    Args:
        text: Document text to summarize.
        num_sentences: Maximum number of sentences to return. Values below 1
            are treated as 1.

    Returns:
        The selected sentence(s) joined by single spaces. Falls back to the
        leading sentence(s) when no sentence contains the term, and returns
        an empty string when *text* contains no sentences at all.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Empty / whitespace-only text: nothing to summarize.
        return ""

    limit = max(1, num_sentences)

    # Prefer content words; fall back to every token if the text has none
    # (e.g. it consists only of stop words, digits or punctuation).
    words = preprocess_text(text) or word_tokenize(text.lower())
    if not words:
        return " ".join(sentences[:limit])

    # Counter.most_common keeps first-seen order among ties, so the result is
    # deterministic for a given text.
    most_frequent_term = Counter(words).most_common(1)[0][0]

    # Collect matching sentences in document order, stopping as soon as enough
    # have been found so later sentences are not tokenized needlessly.
    selected = []
    for sentence in sentences:
        if most_frequent_term in word_tokenize(sentence.lower()):
            selected.append(sentence)
            if len(selected) == limit:
                break

    if not selected:
        # Sentence-level tokenization can differ from whole-text tokenization,
        # so keep the original "default to the first sentence" behavior.
        selected = sentences[:limit]

    return " ".join(selected)

# Function to retrieve top documents and generate extractive summary with sentiment analysis
def retrieve_top_documents_with_summary(query, top_n=10):
    """Return the documents most similar to *query*, each with a summary and sentiment.

    Candidate documents are those listed in the module-level ``inverted_index``
    for at least one query token. Candidates are ranked by cosine similarity
    between the TF-IDF vector of the raw query and their rows in
    ``tfidf_matrix``.

    Args:
        query: Free-text search string.
        top_n: Maximum number of documents to return (default 10).

    Returns:
        A list of dicts ordered by descending similarity, each with the keys
        'ID', 'Text', 'Product ID', 'Similarity Score', 'Extractive Summary'
        and 'Sentiment'. The list is empty when no indexed document contains
        any query token or when *top_n* is below 1.
    """
    stop_words = set(stopwords.words('english'))
    # Keep tokens that carry at least one letter or digit. A plain
    # `token not in string.punctuation` is a substring test and lets
    # multi-character punctuation tokens such as `` or '' slip through.
    tokens = [
        token
        for token in word_tokenize(query.lower())
        if token not in stop_words and any(ch.isalnum() for ch in token)
    ]

    # Gather the unique IDs of documents containing any query term.
    # Each posting's first element is used as the document's row number.
    candidate_ids = set()
    for token in tokens:
        if token in inverted_index:
            candidate_ids.update(doc[0] for doc in inverted_index[token])

    # Sort so that ties in similarity are broken the same way on every run.
    doc_ids_with_terms = sorted(candidate_ids)

    # Nothing to rank. Also guards top_n < 1: a slice of [-0:] would return
    # every candidate instead of none.
    if not doc_ids_with_terms or top_n < 1:
        return []

    # Calculate cosine similarity between query and documents containing query terms
    query_vector = tfidf_vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix[doc_ids_with_terms])[0]

    # Positions (into doc_ids_with_terms) of the top_n highest scores, best first
    top_indices = similarity_scores.argsort()[-top_n:][::-1]

    top_documents = []
    for idx in top_indices:
        doc_id = doc_ids_with_terms[idx]
        row = data.iloc[doc_id]  # single positional lookup reused for every field
        document_text = row['Text']

        # Calculate sentiment score for the document text
        sentiment_score = sid.polarity_scores(document_text)['compound']

        # NOTE(review): a compound score of exactly 0 is labelled 'positive'.
        # VADER's documentation suggests +/-0.05 thresholds with a neutral
        # band; kept binary here so the set of labels is unchanged.
        sentiment_label = 'positive' if sentiment_score >= 0 else 'negative'

        top_documents.append({
            'ID': row['Id'],
            'Text': document_text,
            'Product ID': row['ProductId'],
            'Similarity Score': similarity_scores[idx],
            'Extractive Summary': generate_summary(document_text),
            'Sentiment': sentiment_label,
        })

    return top_documents

# Tkinter GUI
def retrieve_and_display_summary():
    """Run the search for the text in ``query_entry`` and show the results.

    Callback of the "Search" button. Clears ``tree_view``, inserts one row per
    document returned by ``retrieve_top_documents_with_summary`` and writes
    either "No documents found" or the elapsed time in milliseconds to
    ``time_label``.
    """
    # perf_counter is monotonic, so the measured interval cannot go negative
    # if the system clock is adjusted while the search runs.
    start_time = time.perf_counter()
    query = query_entry.get()
    results = retrieve_top_documents_with_summary(query)

    # Clear existing table
    tree_view.delete(*tree_view.get_children())

    # Check if no relevant documents are found
    if not results:
        time_label.config(text="No documents found")
        return

    for doc in results:
        tree_view.insert(
            "",
            "end",
            values=(
                doc['ID'],
                doc['Similarity Score'],
                doc['Product ID'],
                doc['Extractive Summary'],
                doc['Sentiment'],
            ),
        )

    # Elapsed time covers both the retrieval and the table update.
    time_taken = int((time.perf_counter() - start_time) * 1000)
    time_label.config(text=f"Results fetched in {time_taken}ms")

# Build the main window. Widgets are packed top to bottom in the order they
# appear here: query row, search button, results table, status label.
root = tk.Tk()
root.title("Document Retrieval")

# Query row: a label and a text entry laid out side by side.
query_frame = ttk.Frame(root)
query_frame.pack(padx=10, pady=10, fill='x')

query_label = ttk.Label(query_frame, text="Enter query:")
query_label.pack(side='left')

# Read by retrieve_and_display_summary() when the button is pressed.
query_entry = ttk.Entry(query_frame)
query_entry.pack(side='left', padx=5)

# Pressing the button runs the search and refreshes the table below.
search_button = ttk.Button(root, text="Search", command=retrieve_and_display_summary)
search_button.pack(padx=10, pady=5)

# Results table: one heading-only column per field shown for a document.
# The column order must match the tuple passed to tree_view.insert().
columns = ('ID', 'SIMILARITY SCORE', 'PRODUCT ID', 'EXTRACTIVE SUMMARY', 'SENTIMENT')
tree_view = ttk.Treeview(root, columns=columns, show='headings')
for col in columns:
    tree_view.heading(col, text=col)
    tree_view.column(col, width=150)  # Adjust the width of columns as needed
tree_view.pack(padx=10, pady=10, expand=True, fill='both')  # Expand the Treeview widget to fill available space

# Status label: shows the time taken or "No documents found".
time_label = ttk.Label(root)
time_label.pack(padx=10, pady=5)

# Start the Tk event loop; blocks until the window is closed.
root.mainloop()