In [1]:
from google.colab import files
import os

# Install necessary packages
!pip install -q pdfplumber openpyxl transformers sentence-transformers

import pdfplumber
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model once at module level; find_best_answer()
# uses this shared `model` to encode both the document lines and each question.
# device='cpu' pins the model to the CPU.
model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

# Text extraction function
def extract_text(file_path):
    """Return the plain text content of a PDF, TXT, or Excel file.

    The file type is chosen from the extension of the final path component
    (case-insensitive).

    Args:
        file_path: Path to the file to read.

    Returns:
        The extracted text. For PDFs, the text of every page that yields any,
        joined by newlines. For TXT, the file content decoded as UTF-8. For
        XLS/XLSX, one line per row of the sheet returned by ``pd.read_excel``,
        with cell values separated by single spaces. An empty string is
        returned for any other (or missing) extension.
    """
    # splitext only inspects the last path component, so an extension-less
    # file that happens to be named "txt" or "pdf" is not mistaken for that
    # type (a bare split('.')[-1] would return the whole name).
    ext = os.path.splitext(file_path)[1].lower().lstrip('.')
    text = ""
    if ext == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Extract each page exactly once; pages that yield no text are dropped.
            page_texts = (page.extract_text() for page in pdf.pages)
            text = "\n".join(page_text for page_text in page_texts if page_text)
    elif ext == "txt":
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    elif ext in ["xls", "xlsx"]:
        df = pd.read_excel(file_path)
        # Plain join over row tuples: same output as the row-wise apply +
        # str.cat chain, but also well-defined for a sheet with no data rows.
        rows = df.astype(str).itertuples(index=False, name=None)
        text = "\n".join(' '.join(row) for row in rows)
    return text

# Semantic search-based response
def find_best_answer(user_question, document_text, threshold=0.5):
    """Return the document line most semantically similar to the question.

    The document is split on newlines, each non-blank line is embedded with
    the module-level ``model``, and the line with the highest cosine
    similarity to the question is selected.

    Args:
        user_question: The question typed by the user.
        document_text: Full text of the document to search.
        threshold: Minimum cosine similarity (exclusive) the best line must
            reach to be returned. Defaults to 0.5.

    Returns:
        The best-matching line (stripped of surrounding whitespace), or one
        of two fixed apology strings: one when the document has no usable
        text, one when no line scores above ``threshold``.
    """
    if not document_text:
        return "Sorry, I couldn't extract any text from the document."

    # Blank / whitespace-only lines carry no content; embedding them wastes
    # work and lets an empty string compete as a candidate answer.
    document_sentences = [
        line.strip() for line in document_text.split('\n') if line.strip()
    ]
    if not document_sentences:
        return "Sorry, I couldn't extract any text from the document."

    # Encode the document lines and the user question
    document_embeddings = model.encode(document_sentences, convert_to_tensor=True, device='cpu')
    query_embedding = model.encode(user_question, convert_to_tensor=True, device='cpu')

    # Cosine similarity of the single query against every line; [0] selects
    # the row belonging to that one query.
    scores = util.pytorch_cos_sim(query_embedding, document_embeddings)[0]

    best_match_idx = int(scores.argmax())
    best_score = float(scores[best_match_idx])

    if best_score > threshold:
        return document_sentences[best_match_idx]
    # NOTE: chat_with_document() compares against this exact string to decide
    # whether to log the question, so it must stay byte-identical.
    return "Sorry, I couldn't find a relevant answer."

# Function to save unanswered questions to a file
def save_unanswered_question(question):
    """Append a question to ``unanswered_questions.txt`` in the working directory.

    Args:
        question: The user's question; written as one line followed by a
            newline.
    """
    # Explicit UTF-8 (matching extract_text) so questions containing non-ASCII
    # characters are written the same way regardless of the platform locale.
    with open('unanswered_questions.txt', 'a', encoding='utf-8') as file:
        file.write(f"{question}\n")

# Main function
def chat_with_document():
    """Run an interactive question/answer loop over the uploaded document.

    Reads questions with ``input()`` until the user types 'exit', 'bye' or
    'quit' (case-insensitive). Each question is answered by
    ``find_best_answer`` against the module-level ``document_text``; questions
    for which no relevant answer is found are appended to the unanswered
    questions log via ``save_unanswered_question``.
    """
    print("\n✅ File processed successfully!\n")
    print("📌 You can ask questions based on the uploaded document. Type 'exit' to end the chat.\n")

    while True:
        # Normalise once so the exit check, the search and the log all see
        # the same trimmed text.
        user_query = input("Enter your question (or type 'exit'): ").strip()

        # An empty line is not a question: do not run the search or record
        # a blank entry in the unanswered-questions log.
        if not user_query:
            print("Please type a question, or 'exit' to end the chat.")
            continue

        # Graceful exit condition
        if user_query.lower() in ['exit', 'bye', 'quit']:
            print("💬 Answer: Thank you for reaching out! Have a great day 😊")
            break

        # Get the answer based on the query
        answer = find_best_answer(user_query, document_text)

        # find_best_answer signals "no match" with this exact sentence; only
        # those questions are logged for later review.
        if answer == "Sorry, I couldn't find a relevant answer.":
            save_unanswered_question(user_query)

        print("\n💬 Answer:", answer)
        print("\nYou can ask again or type 'exit' to end the chat.")

# Load the uploaded file
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload widget (e.g. the dialog was dismissed).
    # Fail with a clear message instead of the bare StopIteration that
    # next(iter({})) would raise.
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and choose a PDF, TXT, or Excel file."
    )
# Only the first uploaded file is processed.
file_path = next(iter(uploaded))

# Extract text from the uploaded file; chat_with_document() reads this
# module-level variable.
document_text = extract_text(file_path)

# Run the function
if __name__ == '__main__':
    chat_with_document()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saving support_faq.txt to support_faq.txt

✅ File processed successfully!

📌 You can ask questions based on the uploaded document. Type 'exit' to end the chat.

Enter your question (or type 'exit'): cod available ?

💬 Answer: Sorry, I couldn't find a relevant answer.

You can ask again or type 'exit' to end the chat.
Enter your question (or type 'exit'): exit
💬 Answer: Thank you for reaching out! Have a great day 😊


In [2]:
# Display contents of unanswered_questions.txt
# The log only exists once at least one question has gone unanswered, so a
# missing file is an expected state rather than an error.
try:
    with open('unanswered_questions.txt', 'r', encoding='utf-8') as f:
        print(f.read())
except FileNotFoundError:
    print("No unanswered questions have been recorded yet.")

cod available ?

