In [12]:
from transformers import pipeline
from docx import Document
import re
import fitz  # PyMuPDF for PDF handling
import pandas as pd
from pptx import Presentation
from io import BytesIO
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load transformers pipelines for summarization and question-answering
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")
question_answering_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Model for similarity comparison

# Load text from different file formats
def load_text_from_file(file_path, file_type):
    if file_type == 'docx':
        doc = Document(file_path)
        return "\n".join([para.text for para in doc.paragraphs])
    elif file_type == 'pdf':
        text = ""
        pdf_doc = fitz.open(file_path)
        for page in pdf_doc:
            text += page.get_text()
        return text
    elif file_type == 'pptx':
        prs = Presentation(file_path)
        text = ""
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + "\n"
        return text
    elif file_type == 'xlsx':
        text = ""
        xls = pd.ExcelFile(file_path)
        for sheet_name in xls.sheet_names:
            sheet_data = xls.parse(sheet_name)
            text += sheet_data.to_string(index=False)
        return text
    else:
        raise ValueError("Unsupported file type")

# Set the file path and type
file_path = '/content/bahubali1 (1).docx'  # Change file path as needed
file_type = 'docx'  # Adjust the file type as per file being used (pdf, pptx, xlsx)

input_text = load_text_from_file(file_path, file_type)

# Split text into chunks with overlap
def split_text_with_overlap(text, max_chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_chunk_size - overlap):
        chunk = " ".join(words[i:i + max_chunk_size])
        chunks.append(chunk)
    return chunks

# Summarize each chunk
def generate_summary(text_chunks):
    summaries = []
    for chunk in text_chunks:
        summarized_text = summarization_pipeline(chunk, max_length=130, min_length=30, do_sample=False)
        summaries.append(summarized_text[0]['summary_text'])
    return " ".join(summaries)

# Summarize and calculate accuracy (compression ratio)
text_chunks = split_text_with_overlap(input_text)
summary = generate_summary(text_chunks)
compression_ratio = len(summary) / len(input_text)  # Compression as a ratio

# Similarity score for accuracy evaluation
def calculate_summary_accuracy(original_text, summary_text):
    original_embedding = similarity_model.encode([original_text])
    summary_embedding = similarity_model.encode([summary_text])
    similarity_score = cosine_similarity(original_embedding, summary_embedding)[0][0]
    return similarity_score * 100  # Convert to percentage

accuracy = calculate_summary_accuracy(input_text, summary)

# Display results
print("Document Summary:", summary)
print(f"Compression Ratio: {compression_ratio:.2f}%")
print(f"Accuracy : {accuracy:.2f}%")

# Initialize a variable to hold user suggestions
user_suggestion = ""

# Question validation function
def is_valid_question(question, context):
    # Remove punctuation and lowercase the question for comparison
    question_keywords = re.findall(r'\w+', question.lower())
    context_keywords = re.findall(r'\w+', context.lower())

    # Check if any question keywords are in the context
    for keyword in question_keywords:
        if keyword in context_keywords:
            return True
    return False

# Question-answering function that incorporates suggestions
def answer_question(context, question, suggestion=""):
    # Integrate the suggestion into the context to modify the answer if provided
    if suggestion:
        context = f"Consider this suggestion: {suggestion}. " + context

    # Check if the question is valid based on the context
    if not is_valid_question(question, context):
        return "Invalid question. Please ask a relevant question."

    qa_input = {
        "question": question,
        "context": context
    }
    return question_answering_pipeline(qa_input)['answer']

# Interactive loop for asking questions with suggestion-based modification
while True:
    user_question = input("\nYou can now ask a question about the text (or type 'exit' to stop): ")
    if user_question.lower() == 'exit':
        break

    # Answer the question with the latest suggestion if any
    answer = answer_question(summary, user_question, user_suggestion)
    print(f"Answer: {answer}")

    # Allow user to provide a suggestion to refine the next answer
    new_suggestion = input("\nIf you'd like to modify the answer, provide a suggestion (or press Enter to continue without): ")
    if new_suggestion:
        user_suggestion = new_suggestion
    else:
        user_suggestion = ""

Your max_length is set to 130, but your input_length is only 64. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)


Document Summary: Baahubali: The Beginning is a 2015 Indian epic action film co-written and directed by S. S. Rajamouli. The film was filmed in both Telugu and Tamil languages. It features Prabhas in a dual role alongside Rana Daggubati, Anushka Shetty, Tamannaah Bhatia, Ramya Krishnan, Sathyaraj, and Nassar. The first of a duology of films, it follows Sivudu, an adventurous young man who helps his love Avantika rescue Devasena, the former queen of Mahishmati who is Siva is the son of Amarendra Baahubali, the famous warrior prince of Mahishmati. Siva is obsessed with ascending the mountain despite his mother's pleas, but fails every time. As a young man, he shows superhuman strength while lifting a lingam of Lord Shiva. Kattapa revealing himself as a traitor is the biggest cliffhanger in Indian cinema,Everyone wants to know why kattappa killed baahubali. upon hearing this tale, wish to meet Baahubli.
Compression Ratio: 0.15%
Accuracy : 96.15%

You can now ask a question about the text 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
!pip install langchain
!pip install langchain_community
!pip install python-docx
!pip install PyMuPDF
!pip install python-pptx
!pip install sentence_transformers
!pip install python-docx
!pip install python-docx

Collecting langchain_community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain_community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadat