In [8]:
import PyPDF2
import ipywidgets as widgets
from IPython.display import display, clear_output
import io

# Step 1: Create a file upload widget
upload_widget = widgets.FileUpload(accept='.pdf', multiple=False)
output_widget = widgets.Output()  # Widget to display output
display(upload_widget, output_widget)

# Step 2: Function to extract text from the uploaded PDF
def extract_text_from_pdf(pdf_bytes):
    reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
    text = ""
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        text += page.extract_text() + "\n"  # Adding newline for better readability
    return text

extracted_text = ""

# Step 3: Callback function to handle the upload
def on_upload_change(change):
    global extracted_text  # Declare as global to use it later for question generation

    # Clear previous output
    with output_widget:
        clear_output(wait=True)

        # Get the uploaded file content (in-memory)
        uploaded_file = next(iter(upload_widget.value))  # Correct way to get file
        pdf_file_content = uploaded_file['content']  # Get the file content in bytes
        
        # Step 4: Extract text from the PDF bytes
        extracted_text = extract_text_from_pdf(pdf_file_content)
        
        # Step 5: Display the extracted text
        print("Extracted Text:")
        print(extracted_text)  # Print the extracted text inside the output widget

# Step 6: Attach the callback to the upload widget
upload_widget.observe(on_upload_change, names='value')


FileUpload(value=(), accept='.pdf', description='Upload')

Output()

In [9]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nisht\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nisht\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
%pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   -- ------------------------------------- 1.3/24.0 MB 5.2 MB/s eta 0:00:05
   ----- ---------------------------------- 3.1/24.0 MB 6.8 MB/s eta 0:00:04
   ------- -------------------------------- 4.7/24.0 MB 7.3 MB/s eta 0:00:03
   ---------- ----------------------------- 6.6/24.0 MB 7.7 MB/s eta 0:00:03
   -------------- ------------------------- 8.9/24.0 MB 8.4 MB/s eta 0:00:02
   ------------------ --------------------- 11.0/24.0 MB 8.6 MB/s eta 0:00:02
   -------------------- ------------------- 12.6/24.0 MB 8.5 MB/s eta 0:00:02
   --------------------- ------------------ 13.1/24.0 MB 8.5 MB/s eta 0:00:02
   ------------------------- -------------- 15.2/24.0 MB 8.0 MB/s eta 0:00:02
   ----------

In [11]:
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def extract_topics(text, num_topics=5):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]

    dictionary = corpora.Dictionary([filtered_words])
    doc_term_matrix = [dictionary.doc2bow(filtered_words)]

    lda_model = LdaModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary, passes=50)
    topics = lda_model.print_topics(num_words=4)
    return topics


In [12]:
%pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [13]:
%pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [14]:
%pip install transformers

from transformers import T5Tokenizer, pipeline

# Load the fast tokenizer (if available)
tokenizer = T5Tokenizer.from_pretrained("valhalla/t5-base-qg-hl", use_fast=True)

# Initialize the question generation pipeline with a specific model
question_generator = pipeline("text2text-generation", model="valhalla/t5-base-qg-hl", num_beams=5)

# Function to generate questions from the extracted text
def generate_questions(text, num_questions=5, max_length=128):
    questions = []

    # Tokenize the input text
    tokenized_text = tokenizer(text, return_tensors="pt", truncation=True)

    # Calculate the number of tokens in the input
    num_tokens = tokenized_text['input_ids'].shape[1]

    # Adjust chunk size based on token limits
    chunk_size = 512  # Adjust if necessary based on model limits
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    for chunk in chunks:
        # Add a task prefix to guide the model
        input_text = f"generate questions: {chunk}"

        try:
            # Generate questions using the question-generation pipeline
            generated_questions = question_generator(input_text, max_length=max_length, num_return_sequences=num_questions)

            # Collect generated questions
            questions.extend([q['generated_text'] for q in generated_questions])  # Access generated_text
        except Exception as e:
            print(f"Error generating questions for chunk: {chunk}\nError: {str(e)}")

    return questions


# Generate questions
# extracted_text = "Your text here."  # Replace this with your actual extracted text
questions = generate_questions(extracted_text, num_questions=5)

# Print the questions
for i, question in enumerate(questions, 1):
    print(f"Question {i}: {question}")


Note: you may need to restart the kernel to use updated packages.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/15.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Question 1: What does Prajna AI Wizzify Your Data do?
Question 2: Prajna AI - Confidential & Proprietary HACKATHON PROBLEM STATEMENT What does Prajna AI Wizzify Your Data do?
Question 3: Prajna AI - Confidential & Proprietary HACKATHON PROBLEM STATEMENT What does Prajna AI Wizzify Your Data?
Question 4: Prajna AI - Confidential & Proprietary HACKATHON PROBLEM STATEMENT Prajna AI Wizzify Your Data
Question 5: Prajna AI - Confidential & Proprietary HACKATHON PROBLEM STATEMENT Prajna AI Wizzify Your Data What is the name of the problem?
Question 6: What is the goal of the hackathon?
Question 7: What is the goal of this hackathon?
Question 8: What is the purpose of the hackathon?
Question 9: What is the aim of the hackathon?
Question 10: What is the goal of the Hackathon?
Question 11: What does Citation and Validation do?
Question 12: What does Citation and Validation provide?
Question 13: What does Citation and Validation do for PDF documents?
Question 14: What does Citation and Validatio