In [None]:
!pip install jsonschema==4.19.0

In [None]:
# JSON Schema (draft 2020-12) for the summary document this notebook writes
# and validates: a document name, a flat list of labelled elements that refer
# to each other through parent/children labels, and a self-reported status.
#
# Every element must carry `label` and `prefix` plus AT LEAST ONE of
# title / summary / keywords / comment. That rule is expressed with `anyOf`;
# `oneOf` would mean "exactly one" and reject any element that has, say,
# both a title and a summary.
#
# NOTE(review): the `pattern` on label / parent / children accepts strings
# that contain spaces, so the "Cannot contain spaces" wording in the
# descriptions is not enforced by the schema.
# NOTE(review): if a copy of this schema is published at the `$id` URL, keep
# the two in sync.
#
# Backslashes are escaped twice on purpose: once for this (non-raw) Python
# string and once for JSON, so `\\\\label` below reaches a schema reader as
# a single-backslash `\label`.
my_schema="""
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://raw.githubusercontent.com/rarensu/docsummary/v1.0/tex/schema.json",
  "title": "Richard's custom document summary schema (v1.0)",
  "description": "JSON schema for organizing summaries of a long latex document with a hierarchical structure.",
  "type": "object",
  "properties": {
    "document_name": {
      "description": "The name of the document being summarized.",
      "type": "string"
    },
    "contents": {
      "description": "An itemized list of summaries from the document, representing a hierarchical structure.",
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "prefix": {
            "type": "string",
            "description": "The type of document element.",
            "enum": ["Document","Chapter", "Section", "Subsection", "Subsubsection", "Paragraph", "Figure", "Table", "Equation", "Footnote", "Quote", "Other", "None"]
          },
          "label": {
            "type": "string",
            "description": "A unique label for the document element (used with \\\\label{}).  Cannot contain spaces.",
            "pattern": "^[^\\\\n]+$"
          },
          "parent": {
            "type": "string",
            "description": "The label of the parent document element. Cannot contain spaces.",
            "pattern": "^[^\\\\n]+$"
          },
          "children": {
            "type": "array",
            "description": "An array of labels for child document elements.",
            "items": {
              "type": "string",
              "pattern": "^[^\\\\n]+$"
            }
          },
          "title": {
            "type": "string",
            "description": "The title of the document element."
          },
          "summary": {
            "type": "string",
            "description": "A brief summary of the document element."
          },
          "keywords": {
            "type": "array",
            "description": "Keywords relevant to the document element.",
            "items": {
              "type": "string"
            }
          },
          "comment": {
            "type": "string",
            "description": "Any additional comments or notes about the document element."
          }
        },
        "required": [
          "label",
          "prefix"
        ],
        "anyOf": [
          {"required": ["title"]},
          {"required": ["summary"]},
          {"required": ["keywords"]},
          {"required": ["comment"]}
        ]
      }
    },
    "response_status": {
      "description": "The agent's self-reported success or failure in summarizing the document.",
      "type": "string",
      "enum": ["success", "failure", "partial_success"]
    }
  },
  "required": [
    "document_name",
    "contents",
    "response_status"
  ]
}
"""
import json
import jsonschema

# Parse the schema text once (this also fails fast if the JSON is malformed)
# and save a pretty-printed copy next to the notebook.
schema_dict = json.loads(my_schema)
with open("schema.json", "w") as f:
  f.write(json.dumps(schema_dict, indent=4))

In [None]:
import os
import re

def rename_files(directory):
    """Rewrite entry names in ``directory`` using two fixed substitutions.

    For every name returned by ``os.listdir(directory)``:

    * each literal ``appendix`` is replaced by ``2``;
    * each ``section<digits>`` (matched case-insensitively) is replaced by
      its own digits, left-padded with zeros to at least two characters, so
      ``section1_section12.tex`` becomes ``01_12.tex``.

    Entries whose name is unchanged are left alone and not reported. An
    entry is also left alone, with a "Skipped" message, when the new name
    already exists in the directory, so no existing file is overwritten.

    Args:
        directory: Path of the directory whose entries are renamed in place.

    Raises:
        OSError: If the directory cannot be listed or a rename fails.
    """
    # os.listdir returns a snapshot, so freshly renamed entries are not revisited.
    for filename in os.listdir(directory):
        new_filename = filename.replace("appendix", "2")  # Replace 'appendix' with '2'

        # Pad each section number individually; a fixed replacement string
        # would stamp the first match's number onto every occurrence.
        new_filename = re.sub(
            r"section(\d+)",
            lambda match: match.group(1).zfill(2),
            new_filename,
            flags=re.IGNORECASE,
        )

        if new_filename == filename:
            continue  # nothing to do; do not report a rename that never happened

        old_path = os.path.join(directory, filename)
        new_path = os.path.join(directory, new_filename)
        if os.path.exists(new_path):
            # os.rename may silently replace an existing file on POSIX systems.
            print(f"Skipped '{filename}': '{new_filename}' already exists")
            continue

        os.rename(old_path, new_path)
        print(f"Renamed '{filename}' to '{new_filename}'")

# Usage: rewrite the file names in `directory_path` before the .tex files are parsed.
# NOTE(review): "/content" looks like the Colab working directory — confirm. The
# parsing cell globs "*.tex" relative to the current directory, so this path has
# to be the directory the notebook runs in for the rename to affect those files.
directory_path = "/content"  # Replace with the actual path
rename_files(directory_path)

In [None]:
class entry:
  """One labelled element of the document outline.

  Attributes:
    label: Label text identifying the element; used as its key in the outline.
    parent: Label of the parent element, or None for the root element.
    children: Labels of the child elements, in the order they were added.
    prefix: Kind of element, e.g. "Document" or "Section".
  """

  def __init__(self, label="", parent="", children=None, prefix=""):
    """Create an element; with no arguments every field starts empty.

    Args:
      label: Initial label (default "").
      parent: Initial parent label (default "").
      children: Optional iterable of child labels; it is copied.
      prefix: Initial element kind (default "").
    """
    self.label=label
    self.parent=parent
    # Always build a fresh list so instances never share a children list
    # with each other or with the caller.
    self.children=list(children) if children is not None else []
    self.prefix=prefix

  def __repr__(self):
    """Return a readable one-line description for interactive inspection."""
    return (f"entry(label={self.label!r}, parent={self.parent!r}, "
            f"children={self.children!r}, prefix={self.prefix!r})")

In [None]:
# Rank of every element kind. process_entry treats only ranks above zero as
# nesting levels (chapter=1 ... subsubsubsection=5); rank 0 is the document
# root and the negative ranks are kinds that never open a nesting level.
index_of = dict(
    other=-7,
    quote=-6,
    paragraph=-5,
    footnote=-4,
    equation=-3,
    table=-2,
    figure=-1,
    document=0,
    chapter=1,
    section=2,
    subsection=3,
    subsubsection=4,
    subsubsubsection=5,
)
# Live view of the kind names, in the order listed above.
prefix_list = index_of.keys()
# Rank -> lower-case kind name (the inverse of index_of).
prefix_l_of = {rank: kind for kind, rank in index_of.items()}
# Rank -> capitalised kind name, the spelling stored in entry.prefix.
prefix_t_of = {rank: kind.capitalize() for kind, rank in index_of.items()}
# Kind -> LaTeX text that introduces it. "?????????" is a placeholder for
# kinds that have no command assigned in this table.
command_of = {
    "other": "?????????",
    "quote": r"\begin{quote}",
    "paragraph": "?????????",
    "footnote": r"\footnote",
    "equation": r"\begin{equation}",
    "table": r"\begin{table}",
    "figure": r"\begin{figure}",
    "document": "?????????",
    "chapter": r"\chapter",
    "section": r"\section",
    "subsection": r"\subsection",
    "subsubsection": r"\subsubsection",
    "subsubsubsection": r"\subsubsubsection",
}
# LaTeX text -> kind. The placeholder maps back to "other" only.
command_prefix = {
    "?????????": "other",
    r"\begin{quote}": "quote",
    r"\footnote": "footnote",
    r"\begin{equation}": "equation",
    r"\begin{table}": "table",
    r"\begin{figure}": "figure",
    r"\chapter": "chapter",
    r"\section": "section",
    r"\subsection": "subsection",
    r"\subsubsection": "subsubsection",
    r"\subsubsubsection": "subsubsubsection",
}
# Commands looked for in each source line: every real command of
# command_prefix, in the same order, without the placeholder.
command_list = [command for command in command_prefix if command != "?????????"]


In [None]:
import re
def process_entry(current_prefix, current_label):
    """Add a newly labelled element to the outline held in the module globals.

    The element becomes a child of the innermost element that is still open
    on ``stack``. Kinds with a rank above zero in ``index_of`` first close
    every open element of the same or a deeper rank and then stay open
    themselves; all other kinds are attached without changing ``stack``.

    Updates the globals ``data`` (label -> entry), ``stack`` (labels of the
    open elements, root first) and ``labels`` (every label in arrival order).

    Args:
        current_prefix: Lower-case kind name; must be a key of ``index_of``.
        current_label: Label text identifying the element.

    Raises:
        KeyError: If ``current_prefix`` is not a key of ``index_of``.
    """
    global data
    global stack
    global labels
    current_index=index_of[current_prefix]
    if current_index>0:
        # Compare against the RANK of each open element rather than the stack
        # depth. Depth equals rank only when no level is skipped; in a
        # document that has sections but no chapters, a depth test would make
        # every section a child of the previous one. The root is never popped.
        while len(stack)>1:
            open_prefix=data[stack[-1]].prefix.lower()
            if index_of.get(open_prefix, 0)<current_index:
                break
            stack.pop()
    current_parent=stack[-1]
    if current_index>0:
        stack.append(current_label)
    labels.append(current_label)
    current_entry=entry()
    current_entry.label=current_label
    current_entry.parent=current_parent
    current_entry.prefix=prefix_t_of[current_index]
    data[current_label]=current_entry
    data[current_parent].children.append(current_label)
def process_line(line):
    r"""Scan one line of LaTeX source and register the label it carries, if any.

    Every command of ``command_list`` found on the line updates the global
    ``last_prefix``; when several are present, the one listed last in
    ``command_list`` wins. The kind is remembered across calls, so a label on
    a later line is still attributed to it. If the line contains a complete
    ``\label{...}``, the first one is stored in the global ``last_label`` and
    passed to ``process_entry`` together with the current kind.

    A command whose name ends in a letter only counts when it is not
    followed by another letter, so ``\footnotesize`` is not taken for
    ``\footnote`` and ``\sectionmark`` is not taken for ``\section``.
    A ``\label{`` with no closing brace on the same line is ignored.
    A label seen before any known command is registered as kind "other".

    Args:
        line: One line of LaTeX source text.
    """
    global last_label
    global last_prefix
    for command in command_list:
        pattern = re.escape(command)
        if command[-1:].isalpha():
            # LaTeX command names are runs of letters; a trailing letter
            # means this is a different, longer command.
            pattern += r"(?![A-Za-z])"
        if re.search(pattern, line):
            last_prefix = command_prefix[command]
    label_match = re.search(r"\\label\{(.*?)\}", line)
    if label_match is not None:
        last_label = label_match.group(1)
        # Fall back to "other" rather than failing on a missing kind.
        process_entry(last_prefix if last_prefix is not None else "other", last_label)

In [None]:
import glob
# Seed the outline with a synthetic root element so that every labelled
# element found in the sources has a parent to attach to.
first_label = "document:0"
first_entry = entry()
first_entry.prefix = "Document"
first_entry.label = first_label
first_entry.parent = None  # the root is the only element without a parent
first_entry.children = []

# Parser state shared with process_line / process_entry through globals.
data = {first_label: first_entry}   # label -> entry
stack = [first_label]               # labels of the currently open elements
labels = [first_label]              # every label, in the order encountered
last_prefix = None
last_label = None

# Feed every .tex file of the current directory, in sorted name order,
# through the line parser.
texfiles = sorted(glob.glob("*.tex"))
for filename in texfiles:
    print(filename)
    with open(filename) as file:
        for line in file:
            process_line(line)


In [None]:
# Show the outline: one line per element with its kind, label, parent label
# and list of child labels.
for key in data:
  node = data[key]
  print(node.prefix, node.label, node.parent, node.children)

In [None]:
import json

def dump_data_to_json(data, filename="output.json",
                      document_name="Your Document Name",
                      response_status="success"):
    """Write the outline in ``data`` to a JSON file.

    The file holds ``document_name``, ``response_status`` and a ``contents``
    list with one item per element. Each item has ``prefix``, ``label`` and
    a placeholder ``comment``; ``parent`` is included only when it is not
    None and ``children`` only when the list is not empty.

    Args:
        data: Mapping of label -> object with ``prefix``, ``label``,
            ``parent`` and ``children`` attributes.
        filename: Path of the JSON file to write (overwritten if present).
        document_name: Value stored under "document_name".
        response_status: Value stored under "response_status".
    """
    json_data = {
        "document_name": document_name,
        "contents": [],
        "response_status": response_status,
    }

    for current_entry in data.values():
        item_data = {
            "prefix": current_entry.prefix,
            "label": current_entry.label,
            # Placeholder until a real title/summary is filled in.
            "comment": "Not summarized yet."
        }

        # The root element has no parent; omit the key instead of writing null.
        if current_entry.parent is not None:
            item_data["parent"] = current_entry.parent

        # Leaf elements have no children; omit the empty list.
        if current_entry.children:
            item_data["children"] = list(current_entry.children)

        json_data["contents"].append(item_data)

    with open(filename, 'w') as f:
        json.dump(json_data, f, indent=4)  # indent for readability

# Usage: write the outline held in `data` to the default file "output.json".
dump_data_to_json(data)  # Needs `data` from the parsing cell; `my_schema` is not used by this call

In [None]:
import json

def load_data_from_json(filename="output.json"):
    """Rebuild the label -> entry mapping from a JSON file.

    Reads the "contents" list of the file and creates one ``entry`` per
    item, copying "prefix" and "label". A missing "parent" becomes None and
    a missing or empty "children" becomes an empty list. Other item keys
    are not loaded.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        dict mapping each item's label to its ``entry``, in file order.
    """
    with open(filename, 'r') as source:
        document = json.load(source)

    restored = {}
    for record in document["contents"]:
        node = entry()
        node.prefix = record["prefix"]
        node.label = record["label"]
        # Optional keys: fall back to "no parent" / "no children".
        node.parent = record.get("parent")
        node.children = record.get("children") or []
        restored[node.label] = node

    return restored

# Usage: read the default file "output.json" back into a separate mapping;
# the in-memory `data` built by the parser is not touched.
loaded_data = load_data_from_json()

In [None]:
import json
import jsonschema

# Parse the schema text held in `my_schema` into a dict for the validation
# below. This rebinds `schema_dict`, which the schema cell also defines.
schema_dict = json.loads(my_schema)

def validate_json_data(json_data, schema):
    """Check ``json_data`` against ``schema`` and report the outcome.

    Prints a one-line verdict (followed by the validation error when there
    is one) instead of raising.

    Args:
        json_data: Parsed JSON instance to validate.
        schema: Parsed JSON Schema to validate against.

    Returns:
        True when the instance is valid, False when validation fails.
    """
    try:
        jsonschema.validate(instance=json_data, schema=schema)
    except jsonschema.exceptions.ValidationError as error:
        print("JSON data is invalid:", error)
        return False  # Indicate validation failure
    else:
        print("JSON data is valid against the schema.")
        return True  # Indicate successful validation

# Example usage: load the file written by dump_data_to_json ...
with open("output.json", 'r') as f:
    json_data = json.load(f)

# ... and validate it against the parsed schema. Note that `schema_dict` is the
# dictionary; `my_schema` itself is the raw JSON text and must not be passed here.
is_valid = validate_json_data(json_data, schema_dict)