In [3]:
import re
import json
import docx
import pdfplumber
import pandas as pd

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def extract_course_details(text):
    details = {}
    
    course_match = re.search(r'([A-Z]{2,4}\s?\d{3,4})\s*-?\s*(.*)', text)
    if course_match:
        details['Course Code'] = course_match.group(1)
        details['Course Title'] = course_match.group(2)
    
    objectives_match = re.search(r'(Objectives|Learning Outcomes)[\s\S]*?(?=Week|Topics|Schedule|\n\n)', text, re.IGNORECASE)
    if objectives_match:
        details['Objectives'] = objectives_match.group().strip()
    
    weekly_topics = re.findall(r'(Week\s*\d+|\d+\.|•|-)(.*)', text)
    details['Weekly Topics'] = [topic[1].strip() for topic in weekly_topics]
    
    study_materials_match = re.search(r'(Recommended Reading|References|Study Materials)[\s\S]*', text, re.IGNORECASE)
    if study_materials_match:
        details['Study Materials'] = study_materials_match.group().strip()
    
    return details

def save_results(details, output_path):
    with open(output_path, 'w') as f:
        json.dump(details, f, indent=4)

def process_course_outline(file_path, output_path='course_details.json'):
    if file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    elif file_path.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    else:
        raise ValueError("Unsupported file format. Use DOCX or PDF.")
    
    details = extract_course_details(text)
    save_results(details, output_path)
    return details

file_path = "C:/Users/Home/Desktop/Comp 3.2/Comp 306/COMP 306 CO.pdf"
course_data = process_course_outline(file_path)
print(json.dumps(course_data, indent=4))


{
    "Course Code": "COMP 306",
    "Course Title": ": ADVANCED DATABASE SYSTEMS",
    "Objectives": "Learning outcomes of the Course:\nBy the end of this course the learner should:\ni. Perform complex SQL queries and understand advanced indexing techniques.\nii. Manage transactions and concurrency in distributed environments.\niii. Design and implement distributed databases and understand the CAP theorem.\niv. Work with NoSQL databases and big data technologies, understanding their\napplications and limitations.\nv. Leverage cloud-based databases and understand the implications of DBaaS.\nCourse content:\n1. Advanced Data Modeling and Database Design\n\uf0b7 Extended entity relationship model\n\uf0b7 Specialization hierarchy\n\uf0b7 Disjoint, overlapping and completeness constraints\n\uf0b7 Specialization, generalization & aggregation\n\uf0b7 Object-relational mapping and its applications\n2. Advanced SQL\n\uf0b7 Joins\n\uf0b7 Union Queries\n\uf0b7 Recursive queries with Common Table

In [2]:
import fitz  # PyMuPDF for PDFs
import docx
import re
import json

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

def extract_course_details(text):
    course_details = {}
    
    # Extract Course Title & Code
    match = re.search(r"(\b[A-Z]{2,}\s*\d{3,}\b)[-:]*\s*(.*)", text)
    if match:
        course_details["Course Code"] = match.group(1)
        course_details["Course Title"] = match.group(2).strip()
    
    # Extract Objectives
    objectives = re.findall(r"Objectives?[:\n](.*?)(?:\n[A-Z]|\Z)", text, re.DOTALL)
    course_details["Objectives"] = objectives[0].strip() if objectives else "Not found"
    
    # Extract Weekly Topics
    weekly_topics = re.findall(r"(\b(?:Week\s*\d+|\d+\.|-\s*\d+|•\s*\d+)\s*-?\s*(.*?))(?:\n[A-Z]|\Z)", text, re.DOTALL)
    course_details["Weekly Topics"] = [topic[1].strip() for topic in weekly_topics]
    
    # Extract Recommended Study Materials
    study_materials = re.findall(r"Recommended.*?:\s*(.*?)(?:\n[A-Z]|\Z)", text, re.DOTALL)
    course_details["Study Materials"] = study_materials[0].strip() if study_materials else "Not found"
    
    return course_details

# Example Usage
pdf_path = "C:/Users/Home/Desktop/Comp 3.2/Comp 306/COMP 306 CO.pdf"  # Replace with actual file
text = extract_text_from_pdf(pdf_path)
course_data = extract_course_details(text)
print(json.dumps(course_data, indent=4))


{
    "Course Code": "COMP 306",
    "Course Title": "ADVANCED DATABASE SYSTEMS",
    "Objectives": "Not found",
    "Weekly Topics": [
        "Advanced Data Modeling and Database Design \n\uf0b7 Extended entity relationship model \n\uf0b7 Specialization hierarchy \n\uf0b7 Disjoint, overlapping and completeness constraints \n\uf0b7 Specialization, generalization & aggregation \n\uf0b7 Object-relational mapping and its applications \n2. Advanced SQL \n\uf0b7 Joins \n\uf0b7 Union Queries \n\uf0b7 Recursive queries with Common Table Expressions (CTEs) \n\uf0b7 Complex subqueries and correlated subqueries \n\uf0b7 SQL analytics functions \n\uf0b7 Views  \n3. Database performance tuning and query optimization \n\uf0b7 Database performance tuning concepts \n\uf0b7 Query processing \n\uf0b7 Indexes and query optimization \n\uf0b7 SQL performance tuning \n\uf0b7 DBMS performance tuning \n\uf0b7 Managing Database physical storage space & performance tuning \n \n\n4. Transaction management and 