# In [1]:
import re
import pandas as pd
import pdfplumber
import fitz
import tabula
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# In [4]:
# Step 1: Detect tables and get their coordinates using tabula
def get_table_regions(pdf_path):
    """Return the bounding box of every table tabula detects in a PDF.

    tabula is asked to scan all pages and to report each detected table as
    a JSON-style dict; only the geometry fields of each entry are kept.

    Args:
        pdf_path: Path to the PDF file handed to ``tabula.read_pdf``.

    Returns:
        A list with one ``(top, left, width, height)`` tuple per detected
        table, in the order tabula reported them. The page a table was
        found on is not recorded.
    """
    detected = tabula.read_pdf(
        pdf_path,
        pages='all',
        multiple_tables=True,
        output_format="json",
        encoding='ISO-8859-1',
    )
    # Keep only the geometry of each entry; everything else is ignored.
    return [
        (entry['top'], entry['left'], entry['width'], entry['height'])
        for entry in detected
    ]

# Step 2: Extract the plain text of every page (NOTE: table regions are not actually filtered out yet)
def extract_text_excluding_tables(pdf_path):
    """Return the plain text of every page of a PDF, joined in page order.

    NOTE: despite the name, no table filtering is applied here -- each
    page's full text is taken as-is.

    Args:
        pdf_path: Path to the PDF file opened with ``fitz.open``.

    Returns:
        A single string holding the text of all pages back to back, with
        nothing inserted between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = (
            document.load_page(index).get_text("text")
            for index in range(document.page_count)
        )
        # Pages that yield no text contribute nothing to the result.
        # The join runs inside the `with` so pages are read while the
        # document is still open.
        return "".join(chunk for chunk in page_texts if chunk)

# Step 3: Use regex to extract course names and codes
def extract_courses(text):
    """Extract (course code, course name) pairs from free text.

    A course code is ``MDS`` followed by three digits and an optional
    upper-case suffix letter (e.g. ``MDS131``, ``MDS161A``). The course name
    is the run of letters, whitespace and hyphens that follows the code.

    Args:
        text: The text to scan.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'Course Code'`` and
        ``'Course Name'``, one row per match in order of appearance.
        Whitespace inside a name is collapsed to single spaces. A code that
        is followed only by whitespace produces a row with an empty name.
        The frame is empty (but has both columns) when nothing matches.
    """
    # The negative lookahead stops the name right before the next course
    # code. Without it the greedy name group swallows the "MDS" prefix of a
    # following code ("MDS131 Foo MDS132 Bar" -> name "Foo MDS"), and that
    # next course is then never matched.
    pattern = r'(MDS\d{3}[A-Z]?)\s*((?:(?!MDS\d{3})[A-Za-z\s\-])+)'

    course_data = [
        [code, ' '.join(name.split())]  # collapse runs of whitespace
        for code, name in re.findall(pattern, text)
    ]

    return pd.DataFrame(course_data, columns=['Course Code', 'Course Name'])

# Step 5: Main function to extract courses from the PDF text (NOTE: table regions are not actually excluded)
def course_table(pdf_path):
    """Build a table of the course codes and names found in a PDF.

    The PDF's text is extracted, reduced to ASCII letters and digits
    separated by single spaces, and then scanned with ``extract_courses``.

    NOTE: table regions are not excluded. ``extract_text_excluding_tables``
    returns the text of every page, and no table geometry is consulted
    here: detecting tables would only add a full extra pass over the PDF
    whose result nothing in this pipeline uses.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        The DataFrame produced by ``extract_courses`` for the normalised
        text.
    """
    text = extract_text_excluding_tables(pdf_path)

    # Replace every character that is not an ASCII letter or digit with a
    # space, then collapse whitespace runs. This also removes hyphens and
    # any non-ASCII letters before the course regex runs.
    normalized = " ".join(re.sub(r'[^a-zA-Z0-9]', ' ', text).split())

    return extract_courses(normalized)

# Step 6: Specify the PDF path, run the extraction and save the result.
pdf_path = "./MDS2024_25.pdf"  # Use the path to your uploaded PDF
# Build the DataFrame of course codes and names from the PDF.
final_df=course_table(pdf_path)
# Write the table to an Excel workbook; the relative path resolves against
# the current working directory. With to_excel's defaults the DataFrame's
# row index is written as the first column as well.
final_df.to_excel("Course_info.xlsx")