# System Prompt

In [14]:
# Prompt template for the unit-outline extraction call. It is rendered with
# str.format(outline_text=...), so every literal brace is written doubled
# ("{{" renders as "{"); {outline_text} is the only substitution field.
UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE = """
You are an expert academic assistant tasked with parsing a university unit outline document and extracting key information into a structured JSON format.

The input will be the raw text content of a unit outline. Your goal is to identify and extract the following details and structure them precisely as specified in the JSON schema below.

**JSON Output Schema:**

```json
{{
  "unitInformation": {{
    "unitCode": "string | null",
    "unitName": "string | null",
    "creditPoints": "integer | null",
    "unitRationale": "string | null",
    "prerequisites": "string | null"
  }},
  "learningOutcomes": [
    "string"
  ],
  "assessments": [
    {{
      "taskName": "string",
      "description": "string",
      "dueWeek": "string | null",
      "weightingPercent": "integer | null",
      "learningOutcomesAssessed": "string | null"
    }}
  ],
  "weeklySchedule": [
    {{
      "week": "string",
      "contentTopic": "string",
      "requiredReading": "string | null"
    }}
  ],
  "requiredReadings": [
    "string"
  ],
  "recommendedReadings": [
    "string"
  ]
}}
```

Instructions for Extraction:
Unit Information: Locate Unit Code, Unit Name, Credit Points. Capture 'Unit Overview / Rationale' as unitRationale. Identify prerequisites.
Learning Outcomes: Extract each learning outcome statement.
Assessments: Each task as an object. Capture full task name, description, Due Week, Weighting % (number), and Learning Outcomes Assessed.
weeklySchedule: Each week as an object. Capture Week, contentTopic, and requiredReading.
Required and Recommended Readings: List full text for each.
**Important Considerations for the LLM**:
Pay close attention to headings and table structures.
If information is missing, use null for string/integer fields, or an empty list [] for array fields.
Do not change keys in the template given
Ensure the output is ONLY the JSON object, starting with {{ and ending with }}. No explanations or conversational text before or after the JSON.
Now, parse the following unit outline text:
--- UNIT_OUTLINE_TEXT_START ---
{outline_text}
--- UNIT_OUTLINE_TEXT_END ---
"""

# Extract unit outline details for the following steps - outputs raw JSON with the unit outline (UO) details

In [17]:

# --- Standard Libraries ---
import os
import re
import json
import logging
import warnings # Added for managing warnings if libraries are missing

# --- Third-Party Libraries ---
# Make sure to install these: pip install python-docx pdfplumber ollama tenacity
try:
    from docx import Document
    docx_available = True
except ImportError:
    warnings.warn("python-docx library not found. DOCX parsing will fail. `pip install python-docx`")
    docx_available = False

try:
    import pdfplumber 
    pdfplumber_available = True 
except ImportError: 
    warnings.warn("pdfplumber library not found. PDF parsing will fail. `pip install pdfplumber`") 
    pdfplumber_available = False 


try:
    import ollama
    ollama_available = True
except ImportError:
    warnings.warn("ollama library not found. Ollama functionality disabled. `pip install ollama`")
    ollama_available = False

try:
    from tenacity import retry, stop_after_attempt, wait_exponential
    tenacity_available = True
except ImportError:
    warnings.warn("tenacity library not found. Retry functionality will not work. `pip install tenacity`")
    tenacity_available = False

# --- Logger Setup ---
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# --- LLM Configuration (Global for this script) ---
# Provider selector. Only the "ollama" path is wired up in the code shown in
# this cell; "openai" and "gemini" are placeholders for future extension.
LLM_PROVIDER = "ollama"
OLLAMA_HOST = "http://localhost:11434"
# Choose a capable model; alternatives noted by the author: "llama3.2:3b", "openthinker:7b".
OLLAMA_MODEL = "mistral:latest"
# Placeholder for other providers if you extend this script
OPENAI_MODEL = "gpt-3.5-turbo"
GEMINI_MODEL = "gemini-1.5-flash-latest"

# Unit outline to parse. Set the UNIT_OUTLINE_PATH environment variable to
# point at a different file without editing the notebook; the default keeps
# the original location.
PATH_UO = os.environ.get(
    "UNIT_OUTLINE_PATH",
    "/home/sebas_dev_linux/projects/course_generator/data/UO/ICT312 Digital Forensic_Final.docx",
)

# Retry configuration (if tenacity is available)
if tenacity_available:
    LLM_MAX_RETRIES = 3
    LLM_RETRY_WAIT_MIN = 2
    LLM_RETRY_WAIT_MAX = 10

    def log_retry_attempt(retry_state):
        """Log a failed LLM attempt right before tenacity sleeps and retries it."""
        logger.warning(f"LLM call attempt {retry_state.attempt_number} failed with {retry_state.outcome.exception()}, retrying...")

    def _log_retries_exhausted(retry_state):
        """Log the final failure once all attempts are used up.

        tenacity returns this callback's result to the caller instead of
        raising RetryError, so returning None keeps the decorated call's
        "None means failure" contract.
        """
        logger.error(f"LLM call failed after {retry_state.attempt_number} attempts: {retry_state.outcome.exception()}")
        return None

    retry_decorator = retry(
        stop=stop_after_attempt(LLM_MAX_RETRIES),
        wait=wait_exponential(min=LLM_RETRY_WAIT_MIN, max=LLM_RETRY_WAIT_MAX),
        # before_sleep runs between attempts; retry_error_callback runs only
        # after the last attempt has failed.
        before_sleep=log_retry_attempt,
        retry_error_callback=_log_retries_exhausted,
    )
else:
    def retry_decorator(func):
        """No-op stand-in for tenacity.retry: the function is used as-is, without retries."""
        return func

# Global LLM clients (initialize before use)
# client_ollama stays None until parse_unit_outline_with_llm creates the
# ollama.Client; call_ollama_to_parse_outline refuses to run while it is None.
client_ollama = None
# client_openai = None # Placeholder
# gemini_model_obj = None # Placeholder


# --- Helper Functions ---
def extract_text_from_file(filepath): # Renamed for clarity
    """Return the text content of a .docx, .pdf or .txt file.

    The reader is chosen from the (case-insensitive) file extension.

    Args:
        filepath: Path of the document to read.

    Returns:
        The extracted text as a single newline-joined string, or None when the
        extension is unsupported, the required library is missing, or reading
        the file raises an exception (the error is logged in each case).
    """
    file_extension = os.path.splitext(filepath)[1].lower()

    # Reject anything we have no reader for before touching the file.
    if file_extension not in ('.docx', '.pdf', '.txt'):
        logger.error(f"Unsupported file type: {file_extension}. Cannot extract text from {filepath}")
        return None

    if file_extension == '.txt':
        try:
            with open(filepath, 'r', encoding='utf-8') as text_file:
                return text_file.read()
        except Exception as e:
            logger.error(f"Error reading TXT file {filepath}: {e}", exc_info=True)
            return None

    if file_extension == '.docx':
        if not docx_available:
            logger.error("python-docx library is not available. Cannot extract text from DOCX.")
            return None
        try:
            document = Document(filepath)
            # Body paragraphs come first; table rows are appended afterwards,
            # one line per row with cells separated by " | ".
            lines = [paragraph.text for paragraph in document.paragraphs]
            lines.extend(
                " | ".join(cell.text.strip() for cell in row.cells)
                for table in document.tables
                for row in table.rows
            )
            return '\n'.join(lines)
        except Exception as e:
            logger.error(f"Error reading docx file {filepath}: {e}", exc_info=True)
            return None

    # Only '.pdf' remains at this point.
    if not pdfplumber_available:
        logger.error("pdfplumber library is not available. Cannot extract text from PDF.")
        return None
    try:
        page_texts = []
        with pdfplumber.open(filepath) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                logger.debug(f"Extracting text from PDF page {page_number}")
                extracted = page.extract_text()
                # Pages without extractable text are skipped.
                if extracted:
                    page_texts.append(extracted)
                # You could also try extracting tables with page.extract_tables()
                # and format them if needed.
        return '\n'.join(page_texts)
    except Exception as e:
        logger.error(f"Error reading PDF file {filepath}: {e}", exc_info=True)
        return None

def parse_llm_unit_outline_json(content: str, llm_provider_name: str):
    """Extract the JSON object from an LLM response.

    Markdown code fences are stripped, then the first JSON object that starts
    at the first "{" is decoded. Text after that object (including text that
    itself contains braces) is ignored.

    Args:
        content: Raw text returned by the LLM.
        llm_provider_name: Provider label, used only in log messages.

    Returns:
        The decoded JSON object, or None when the content is empty or not a
        string, contains no "{", or cannot be decoded. Failures are logged;
        nothing is raised.
    """
    if not isinstance(content, str) or not content.strip():
        logger.warning(f"Received empty/non-string content for unit outline from {llm_provider_name}.")
        return None

    # Assigned before the try block so the except handlers can always log it.
    content_cleaned = re.sub(r"^```json\s*|\s*```$", "", content.strip(), flags=re.MULTILINE)
    try:
        json_start = content_cleaned.find("{")
        if json_start == -1:
            logger.warning(f"Could not find JSON object in unit outline response from {llm_provider_name}: {content_cleaned}")
            return None
        # raw_decode stops at the end of the first complete JSON value, so a
        # trailing remark such as "use {} for empty" no longer breaks parsing.
        data, _ = json.JSONDecoder().raw_decode(content_cleaned, json_start)
        # Optional: Add jsonschema validation here if desired
        return data
    except json.JSONDecodeError as e:
        logger.warning(f"JSON Decode Error for unit outline ({llm_provider_name}): {e}\nRaw Output:\n{content_cleaned}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error parsing unit outline JSON from {llm_provider_name}: {e}\nRaw Output:\n{content_cleaned}", exc_info=True)
        return None

@retry_decorator # Apply the retry decorator
def call_ollama_to_parse_outline(prompt_with_outline_text):
    """Send the outline prompt to Ollama and return the parsed JSON result.

    Args:
        prompt_with_outline_text: Fully rendered prompt, sent as a single
            user message.

    Returns:
        Whatever parse_llm_unit_outline_json returns for the response content
        (the decoded JSON, or None when it cannot be parsed).

    Raises:
        ConnectionError: The ollama library or the global client is missing.
        ValueError: Ollama answered with an empty or malformed response.
        Exception: Any other error from the chat call is logged and re-raised
            so that the retry decorator can act on it.
    """
    global client_ollama # Uses the globally initialized client
    client_ready = ollama_available and client_ollama is not None
    if not client_ready:
        logger.warning("Ollama client not available or not initialized for outline parsing.")
        # Raise (rather than return) so the retry decorator sees a failure.
        raise ConnectionError("Ollama client not available or not initialized.")

    logger.debug(f"Attempting Ollama call for unit outline parsing (Model: {OLLAMA_MODEL} Host: {OLLAMA_HOST})...")
    chat_messages = [{"role": "user", "content": prompt_with_outline_text}]
    try:
        response = client_ollama.chat(
            model=OLLAMA_MODEL,
            messages=chat_messages,
            options={"temperature": 0.00}, # Zero temperature for factual extraction
            format="json",
        )
        has_content = bool(
            response
            and 'message' in response
            and 'content' in response['message']
            and response['message']['content']
        )
        if not has_content:
            logger.warning(f"Ollama returned empty/invalid response for outline: {response}")
            raise ValueError("Ollama returned empty or invalid response for outline")
        return parse_llm_unit_outline_json(response['message']['content'], "ollama")
    except Exception as e:
        connection_markers = ("Connection refused", "Max retries exceeded with url")
        # Connection problems get their own message that names the host.
        if isinstance(e, (ollama.ResponseError, ollama.RequestError)) and any(marker in str(e) for marker in connection_markers):
            logger.error(f"Ollama connection error for outline parsing to {OLLAMA_HOST}: {e}.")
        # The empty-response ValueError was already logged above; skip it here.
        elif not (isinstance(e, ValueError) and "Ollama returned empty" in str(e)):
            logger.error(f"Ollama API call for outline parsing failed: {e}", exc_info=False)
        raise # Reraise to trigger tenacity retry if applicable

# --- Main Parsing Orchestration Function ---
def parse_unit_outline_with_llm(outline_filepath, system_prompt_template):
    """Parse a unit outline file into structured data via the configured LLM.

    Steps: make sure the Ollama client exists, extract the file's text, render
    the prompt, call the LLM, and write the result next to the working
    directory as "<outline basename>_parsed.json".

    Args:
        outline_filepath: Path of the outline document (.docx, .pdf or .txt).
        system_prompt_template: Template containing an {outline_text} field;
            rendered with str.format.

    Returns:
        The parsed data on success, otherwise None (client initialization
        failure, no extractable text, unsupported provider, empty LLM result,
        or an exception during the LLM call). A failure to save the JSON file
        is logged but does not prevent the data from being returned.
    """
    global LLM_PROVIDER, client_ollama # Use global provider and client

    ollama_selected = LLM_PROVIDER == "ollama" and ollama_available

    # Create and smoke-test the shared client the first time it is needed.
    if ollama_selected and client_ollama is None:
        try:
            logger.info(f"Initializing Ollama client for host: {OLLAMA_HOST}")
            client_ollama = ollama.Client(host=OLLAMA_HOST)
            client_ollama.list() # Test connection
            logger.info(f"Ollama client connected to {OLLAMA_HOST}.")
        except Exception as e:
            logger.error(f"Failed to initialize Ollama client to {OLLAMA_HOST}: {e}. Cannot parse outline.")
            return None
    # Add elif for openai, gemini if you implement them

    outline_text = extract_text_from_file(outline_filepath)
    if not outline_text:
        logger.error(f"Failed to extract text from unit outline: {outline_filepath}")
        return None

    prompt_for_llm = system_prompt_template.format(outline_text=outline_text)

    try:
        if not ollama_selected:
            logger.error(f"Unsupported or unavailable LLM provider for outline parsing: {LLM_PROVIDER}")
            return None

        parsed_data = call_ollama_to_parse_outline(prompt_for_llm)
        if not parsed_data:
            logger.error(f"Failed to parse unit outline '{outline_filepath}' using {LLM_PROVIDER} after retries.")
            return None

        logger.info(f"Successfully parsed unit outline '{outline_filepath}' using {LLM_PROVIDER} model {OLLAMA_MODEL if LLM_PROVIDER == 'ollama' else 'N/A'}.")

        # The file name is relative, so the JSON lands in the current working directory.
        outline_stem, _ = os.path.splitext(os.path.basename(outline_filepath))
        output_json_filename = outline_stem + "_parsed.json"
        try:
            with open(output_json_filename, 'w') as json_file:
                json.dump(parsed_data, json_file, indent=2)
            logger.info(f"Parsed outline saved to: {output_json_filename}")
        except Exception as e:
            # Saving is best-effort: the parsed data is still returned.
            logger.error(f"Could not save parsed JSON to {output_json_filename}: {e}")
        return parsed_data

    except Exception as e:
        logger.error(f"Error during unit outline parsing with {LLM_PROVIDER}: {e}", exc_info=True)
        return None

# --- Example Usage (Jupyter cell execution) ---
# Runs when the script is executed directly, and also when this cell is run in
# Jupyter (where __name__ is '__main__' as well). The names assigned here
# (test_outline_filename, parsed_data, parsed_data_str) stay at module level
# so later cells can keep using them.
if __name__ == '__main__':

    # Ensure the prompt template is defined (it should be if Cell 1 was run)
    if 'UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE' not in globals():
        print("ERROR: UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE not defined. Please run the first cell.")
    else:
        # The outline must already exist on disk; nothing is generated here.
        test_outline_filename = PATH_UO # ✅🔴 Make sure path is correct

        if os.path.exists(test_outline_filename):
            print(f"\nAttempting to parse: {test_outline_filename}")
            print(f"Using LLM Provider: {LLM_PROVIDER}")
            if LLM_PROVIDER == "ollama":
                print(f"Ollama Host: {OLLAMA_HOST}, Model: {OLLAMA_MODEL}")

            # Call the main parsing function
            parsed_data = parse_unit_outline_with_llm(test_outline_filename, UNIT_OUTLINE_SYSTEM_PROMPT_TEMPLATE)

            if parsed_data:
                print("\n--- Successfully Parsed Outline Data (first few lines) ---")
                # Print a snippet to avoid flooding output if very large
                parsed_data_str = json.dumps(parsed_data, indent=2)
                print(parsed_data_str[:1000] + ("..." if len(parsed_data_str) > 1000 else ""))
                print("----------------------------------------------------------")
                print(f"Full parsed data saved to '{os.path.splitext(os.path.basename(test_outline_filename))[0] + '_parsed.json'}'")
            else:
                print(f"\nFailed to parse {test_outline_filename}.")
        else:
            # No file is created automatically, so the message only reports the missing path.
            print(f"\nTest outline file '{test_outline_filename}' not found. Please provide the file.")

2025-06-09 23:56:34,881 - __main__ - INFO - Initializing Ollama client for host: http://localhost:11434
2025-06-09 23:56:34,896 - httpx - INFO - HTTP Request: GET http://localhost:11434/api/tags "HTTP/1.1 200 OK"
2025-06-09 23:56:34,898 - __main__ - INFO - Ollama client connected to http://localhost:11434.



Attempting to parse: /home/sebas_dev_linux/projects/course_generator/data/UO/ICT312 Digital Forensic_Final.docx
Using LLM Provider: ollama
Ollama Host: http://localhost:11434, Model: mistral:latest


2025-06-09 23:57:22,074 - httpx - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-06-09 23:57:22,075 - __main__ - INFO - Successfully parsed unit outline '/home/sebas_dev_linux/projects/course_generator/data/UO/ICT312 Digital Forensic_Final.docx' using ollama model mistral:latest.
2025-06-09 23:57:22,075 - __main__ - INFO - Parsed outline saved to: ICT312 Digital Forensic_Final_parsed.json



--- Successfully Parsed Outline Data (first few lines) ---
{
  "unitInformation": {
    "unitCode": "ICT312",
    "unitName": "Digital Forensic",
    "creditPoints": 6,
    "unitRationale": "Course topics include principles of forensic analysis, forensics and the law, forensics on various types of infrastructure, and management of forensic methodologies, along with a variety of real-life case studies. Students will apply forensic methods in controlled environments and gain an understanding of the technical process of uncovering hidden data and other metadata that may reveal user behaviour.",
    "prerequisites": null
  },
  "learningOutcomes": [
    "On successful completion of this unit, students will be able to: Evaluate theories of digital forensics",
    "Understand the structure of forensic evidence",
    "Implement forensically sound digital security practices in the industry",
    "Demonstrate competence in applying industry-standard forensic analysis techniques",
    "Manage, 