In [1]:
import fitz  # PyMuPDF
import pandas as pd
import os
import re

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any separator.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string with += for every page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the file handle even if text extraction raises.
        doc.close()

# Function to remove known redundant sections
def remove_redundant_sections(text):
    """Strip the answer key and known boilerplate from extracted paper text.

    Everything from the first occurrence of "ANSWER KEY" onwards is dropped,
    then each entry of ``patterns`` is removed from the text, one entry at a
    time in list order. Plain strings are removed literally; compiled regular
    expressions are removed with ``re.sub``.

    Args:
        text: Raw text of a question paper.

    Returns:
        The text with the listed instruction/header blocks removed.
    """
    text = text.split("ANSWER KEY")[0]
    # Order matters: entries are applied sequentially and some later entries
    # appear to be written against what an earlier removal leaves behind
    # (e.g. the ones starting with ', section A & B.'). Do not reorder.
    patterns = [
        '\n•\nThis question paper contains two sections, section A & B.\n•\nSection A contains 20 multiple choice questions (SCQs) with four options (A),(B),(C),(D) out\nof which only one option is correct.\n•\nSection B contains 10 Integer Type questions, out of which candidate have to attempt only 5\nquestions.\nSection-I\n•\nThis Section contain 20 questions (Q.No. 1 to Q.No. 20)\n•\nAnswer to each question in Section A will be evaluated according to the following marking\nscheme:\nFull Marks \n:    +𝟒   for correct answer\nZero Marks \n:       0   If the question is unanswered;\nNegative Marks :    −𝟏  for incorrect answer\nCLASS 12th',
        'This question paper contains two sections',
        # NOTE(review): can no longer match, the entry above already removed its tail.
        '\n•\nThis question paper contains two sections',
        '\nJEE MAIN PAPER\nVIJETA BATCH \nCLASS 12th',
        # NOTE(review): can no longer match, the entry above is a prefix of it.
        '\nJEE MAIN PAPER\nVIJETA BATCH \nCLASS 12th\n3',
        # This entry is a regular expression, so it must go through re.sub;
        # as a literal for str.replace it could never match.
        re.compile(r'CLASS 12th.*?\n'),
        # The comma after the next entry was missing, which silently fused it
        # with '\nSection-II' into one literal that matched neither.
        'question paper contains two sections',
        '\nSection-II',
        '\n•\nThis Section contain 10 questions (Q.No. 21 to Q.No. 30) whose answer to be filled as\nnumerical value (Attempt any five)\n•\nAnswer to each question in Section B will be evaluated according to the following marking\nscheme:\nFull Marks \n:    +𝟒   for correct answer \nZero Marks \n:       0   If the question is unanswered;\nZero Marks \n:       𝟎  for incorrect answer \n4',
        '5\nPART – 2 : PHYSICS',
        '\n•\n, section A & B.\n•\nSection A contains 20 multiple choice questions (SCQs) with four options (A),(B),(C),(D) out\nof which only one option is correct.\n•\nSection B contains 10 Integer Type questions, out of which candidate have to attempt only 5\nquestions.\nSection-I\n•\nThis Section contain 20 questions (Q.No. 1 to Q.No. 20)\n•\nAnswer to each question in Section A will be evaluated according to the following marking\nscheme:\nFull Marks \n:    +𝟒   for correct answer\nZero Marks \n:       0   If the question is unanswered;\nNegative Marks :    −𝟏  for incorrect answer\n6',
        '\n•\nThis Section contain 10 questions (Q.No. 21 to Q.No. 30) whose answer to be filled as\nnumerical value (Attempt any five)\n•\nAnswer to each question in Section B will be evaluated according to the following marking\nscheme:\nFull Marks \n:    +𝟒   for correct answer \nZero Marks \n:       0   If the question is unanswered;\nZero Marks \n:       𝟎  for incorrect answer \n11',
        '\nPART – 3 : CHEMISTRY',
        '\n•\n, section A & B.\n•\nSection A contains 20 multiple choice questions (SCQs) with four options (A),(B),(C),(D) out\nof which only one option is correct.\n•\nSection B contains 10 Integer Type questions, out of which candidate have to attempt only 5\nquestions.\nSection-I\n•\nThis Section contain 20 questions (Q.No. 1 to Q.No. 20)\n•\nAnswer to each question in Section A will be evaluated according to the following marking\nscheme:\nFull Marks \n:   +𝟒   for correct answer\nZero Marks \n:       0   If the question is unanswered;\nNegative Marks :    −𝟏  for incorrect answer\n13',
        '\n•\nThis Section contain 10 questions (Q.No. 21 to Q.No. 30) whose answer to be filled as\nnumerical value (Attempt any five)\n•\nAnswer to each question in Section B will be evaluated according to the following marking\nscheme:\nFull Marks \n:    +𝟒   for correct answer \nZero Marks \n:       0   If the question is unanswered;\nZero Marks \n:       𝟎  for incorrect answer \n17',
        '\nJEE MAIN PAPER\nVIJETA BATCH \n',
    ]
    for pattern in patterns:
        if isinstance(pattern, str):
            text = text.replace(pattern, '')
        else:
            text = pattern.sub('', text)
    return text

# Function to split the text into individual questions
def split_questions(text):
    """Cut the paper text into chunks at every numbered-question marker.

    A marker is a newline, one or more digits, a dot and at least one
    whitespace character (for example "\\n12. "). The markers themselves are
    consumed by the split.

    Args:
        text: Cleaned text of a question paper.

    Returns:
        The list of chunks between markers. If the chunk before the first
        marker is empty or whitespace-only it is left out.
    """
    chunks = re.split(r'\n\d+\.\s+', text)
    # re.split always yields at least one element, so chunks[0] is safe.
    if not chunks[0].strip():
        return chunks[1:]
    return chunks

def remove_n(question):
    """Flatten a question onto one line.

    Every run of whitespace (newlines included) becomes a single space, and
    leading/trailing whitespace is trimmed.
    """
    # \s+ already covers '\n', so no separate newline replacement is needed.
    single_spaced = re.sub(r'\s+', ' ', question)
    return single_spaced.strip()

def remove_solution_part(question):
    """Return the text that precedes the first 'Sol.' marker, trimmed.

    When the marker is absent the whole question is returned, trimmed.
    """
    statement, _marker, _solution = question.partition('Sol.')
    return statement.strip()
# Module-level accumulator of question records; all_pdf_to_csv passes this
# list to pdf_to_csv, which appends one dict per extracted question.
results = []
# NOTE(review): cnt is not referenced anywhere in the visible code — confirm
# nothing else depends on it before removing.
cnt=0
# Function to process the PDF and categorize each question
def pdf_to_csv(pdf_path, results, subject, chapter):
    """Extract the questions of one PDF and append them to ``results``.

    The PDF text is extracted, cleaned with ``remove_redundant_sections`` and
    split with ``split_questions``. For every kept chunk a dict is appended
    to ``results``; nothing is returned and no file is written here.

    Args:
        pdf_path: Path of the PDF to read.
        results: List that receives one dict per question (mutated in place).
        subject: Value stored under the 'subject' key of every record.
        chapter: Value stored under the 'chapter' key of every record.
    """
    cleaned_text = remove_redundant_sections(extract_text_from_pdf(pdf_path))
    for position, chunk in enumerate(split_questions(cleaned_text)):
        # The chunk at position 0 is always skipped, as is any chunk of
        # 15 characters or fewer (measured before the solution is cut off).
        if position == 0 or len(chunk) <= 15:
            continue
        record = {
            # Both keys carry the chunk's position within this PDF.
            'question_id': position,
            'question_no': position,
            'subject': subject,
            'chapter': chapter,
            'question': remove_solution_part(chunk),
        }
        results.append(record)

In [2]:
def all_pdf_to_csv(path='../pdf'):
    """Run ``pdf_to_csv`` on every PDF found under ``path``.

    The tree is walked top-down. Folders named 'csv' are not descended into.
    For each file ending in '.pdf' the chapter is the file name without its
    extension (whitespace-trimmed) and the subject is the name of the folder
    that contains the file. Records are appended to the module-level
    ``results`` list.

    Directories and files are visited in sorted order so that the order of
    the produced records does not depend on the filesystem's listing order.

    Args:
        path: Root folder to scan. Defaults to '../pdf'.
    """
    pdf_jobs = []
    for dirpath, dirnames, filenames in os.walk(path):
        # Editing dirnames in place controls where os.walk descends next:
        # drop the 'csv' folder and fix the traversal order.
        dirnames[:] = sorted(name for name in dirnames if name != 'csv')

        for file in sorted(filenames):
            # Only PDF files are processed.
            if not file.endswith('.pdf'):
                continue
            chapter_name = os.path.splitext(file)[0].strip()
            subject_name = os.path.basename(dirpath)
            pdf_jobs.append((os.path.join(dirpath, file), subject_name, chapter_name))

    # Process after the walk has finished, as a separate pass.
    for pdf_path, subject, chapter in pdf_jobs:
        pdf_to_csv(pdf_path, results, subject, chapter)

In [3]:
# Empty the shared list first so that re-running this cell does not append a
# second copy of every question on top of a previous run.
results.clear()
all_pdf_to_csv()
# Save the DataFrame to a CSV file
df = pd.DataFrame(results)
csv_path = '../new_jee_data/csv/questions_database_bert_text.csv'
# to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)