<a href="https://colab.research.google.com/github/ruchirlives/Python/blob/main/Onenote_Word_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install --quiet pandas
%pip install --quiet python-docx beautifulsoup4

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m174.1/244.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

## Converter classes

In [13]:
import requests

from docx import Document
from io import BytesIO
from bs4 import BeautifulSoup
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import os

# Utility classes

class OneNoteHandler:
    """Small wrapper around the Microsoft Graph OneNote REST endpoints.

    Every request is authorised with the bearer token given at construction.
    The notebook identified by ``onenote_url`` is resolved once, in
    ``__init__``, and its Graph URL is kept in ``self.request_url``.
    """

    def __init__(self, token, onenote_url):
        """Store the token and resolve the notebook's Graph URL.

        Args:
            token: OAuth bearer token sent in the ``Authorization`` header.
            onenote_url: Web URL of the notebook, passed to
                ``GetNotebookFromWebUrl``.
        """
        self.token = token
        self.base_url = "https://graph.microsoft.com/v1.0/me/onenote/notebooks"
        # None when the Graph reply carries no 'self' link.
        self.request_url = self.get_notebook_request_url(onenote_url)

    def _headers(self):
        """Return the authorization header shared by every request."""
        return {'Authorization': f'Bearer {self.token}'}

    def _get_collection(self, url):
        """Return all items of a Graph collection, following ``@odata.nextLink``.

        Graph pages its list responses; reading only the first ``value``
        array silently drops everything after the first page.
        """
        items = []
        while url:
            payload = requests.get(url, headers=self._headers()).json()
            items.extend(payload.get('value', []))
            url = payload.get('@odata.nextLink')
        return items

    def get_notebook_request_url(self, webUrl):
        """POST ``webUrl`` to ``GetNotebookFromWebUrl`` and return the reply's ``self`` link.

        Returns:
            The value of the ``self`` field, or ``None`` if the reply has none.
        """
        endpoint = f"{self.base_url}/GetNotebookFromWebUrl"
        body = {'webUrl': webUrl}
        response = requests.post(endpoint, headers=self._headers(), json=body).json()
        return response.get('self')

    def get_notebook_by_name(self, name):
        """Return the notebook dict whose ``displayName`` equals ``name``, else ``None``."""
        for notebook in self._get_collection(self.base_url):
            if notebook.get('displayName') == name:
                return notebook
        print(f"Notebook '{name}' not found.")
        return None

    def list_notebook_sections(self, request_url=None):
        """Print and return the sections of a notebook.

        Args:
            request_url: Graph URL of the notebook. Defaults to the notebook
                resolved in ``__init__``.

        Returns:
            List of section dicts.

        Raises:
            ValueError: If no notebook URL is available, for example because
                the lookup in ``__init__`` returned nothing.
        """
        notebook_url = request_url or self.request_url
        if not notebook_url:
            raise ValueError("No notebook URL available; the notebook lookup returned no 'self' link.")

        sections = self._get_collection(f"{notebook_url}/sections")
        print("\nSections:")
        for section in sections:
            print(section['displayName'])

        return sections

    def get_pages(self, section_url):
        """Return the list of page dicts under ``section_url``."""
        return self._get_collection(f"{section_url}/pages")

    def get_page_content(self, contentUrl):
        """GET ``contentUrl`` and return the raw response body as bytes."""
        response = requests.get(contentUrl, headers=self._headers())
        return response.content

    def get_page(self, page_name, request_url=None):
        """Search every section of a notebook for a page titled ``page_name``.

        Args:
            page_name: Exact page title to look for.
            request_url: Graph URL of the notebook to search. Defaults to the
                notebook resolved in ``__init__``.

        Returns:
            The first matching page dict, or ``None`` if no page matches.
        """
        for section in self.list_notebook_sections(request_url):
            for page in self.get_pages(section['self']):
                if page.get('title') == page_name:
                    print(f"Found page: {page['title']}")
                    return page
        return None

    def get_page_text(self, page):
        """Return the page's content with all markup stripped, or ``None`` for a falsy ``page``."""
        if page:
            page_content = self.get_page_content(page['contentUrl'])
            soup = BeautifulSoup(page_content, 'html.parser')
            return soup.get_text()
        return None

    def get_page_html(self, page):
        """Return the page's raw content bytes, or ``None`` for a falsy ``page``."""
        if page:
            return self.get_page_content(page['contentUrl'])
        return None

    def write_page_text(self, page, text):
        """PATCH the page content with a single ``replace`` command targeting ``body``.

        Returns:
            The ``requests`` response, or ``None`` for a falsy ``page``.
        """
        if page:
            body = {
                "target": "body",
                "action": "replace",
                "content": text
            }
            # The request body is a JSON array of change commands.
            return requests.patch(page['contentUrl'], headers=self._headers(), json=[body])
        return None

class GraphAPIClient:
    """Bearer-token HTTP helper exposing the GET and PUT calls used against Graph."""

    def __init__(self, token):
        # Sent verbatim in the Authorization header of every call.
        self.token = token

    def get(self, url):
        """Send a GET request to ``url`` and return the ``requests`` response."""
        auth = dict(Authorization=f'Bearer {self.token}')
        return requests.get(url, headers=auth)

    def put(self, url, data):
        """Send ``data`` to ``url`` with PUT and return the ``requests`` response."""
        auth = dict(Authorization=f'Bearer {self.token}')
        reply = requests.put(url, headers=auth, data=data)
        return reply

class SharePointHandler:
    """Locate, download and upload a Word document in a SharePoint site's default drive.

    ``site_id`` and ``item_id`` are resolved in ``__init__``; either may be
    ``None`` when the site or the document could not be found.
    """

    def __init__(self, graph_client, hostname, site_name, doc_name):
        """Store the lookup parameters and resolve the site and document IDs.

        Args:
            graph_client: Object offering ``get(url)`` and ``put(url, data)``
                that return ``requests``-style responses.
            hostname: SharePoint host name used in the site lookup URL.
            site_name: Site name used in the site lookup URL.
            doc_name: File name searched for in the drive's root folder.
        """
        self.graph_client = graph_client
        self.hostname = hostname
        self.site_name = site_name
        self.doc_name = doc_name
        self.site_id, self.item_id = self._get_ids()

    def _get_ids(self):
        """Return ``(site_id, doc_id)``; ``(None, None)`` if the site reply has no ``id``."""
        site_url = f"https://graph.microsoft.com/v1.0/sites/{self.hostname}:/sites/{self.site_name}"
        site = self.graph_client.get(site_url).json()
        if 'id' not in site:
            return None, None
        site_id = site['id']

        # Only the children of the drive's root folder are listed.
        drive_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root/children"
        listing = self.graph_client.get(drive_url).json()
        return site_id, self._get_doc_id(listing)

    def _get_doc_id(self, response):
        """Return the ``id`` of the listed item named ``self.doc_name``, else ``None``."""
        for item in response.get('value', []):
            if item.get('name') == self.doc_name:
                return item['id']
        return None

    def get_word_document(self):
        """Download the document and return it as a python-docx ``Document``.

        Returns:
            The parsed document, or ``None`` when the IDs are unknown or the
            download did not answer with HTTP 200.
        """
        if not self.site_id or not self.item_id:
            # Avoid requesting a URL that contains the literal text "None".
            print(f"Failed to download document. '{self.doc_name}' was not found on site '{self.site_name}'.")
            return None

        get_word_url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive/items/{self.item_id}/content"
        response = self.graph_client.get(get_word_url)
        if response.status_code == 200:
            return Document(BytesIO(response.content))
        print(f"Failed to download document. Status code: {response.status_code}")
        return None

    def upload_document(self, document, title):
        """Upload the local file ``title`` to ``FINAL/<title>`` in the site's drive.

        Args:
            document: Not used; the bytes are read from the file at ``title``,
                so the caller must have saved the document there first.
            title: Local path of the file, also used as the remote file name.
        """
        upload_url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive/root:/FINAL/{title}:/content"
        with open(title, 'rb') as file:
            response = self.graph_client.put(upload_url, file)
        # Graph answers 201 Created for a new file and 200 OK for a replaced one.
        if response.status_code in (200, 201):
            print("File uploaded successfully.")
        else:
            print(f"Failed to upload file. Status code: {response.status_code}")

# Word document processing classes
class WordDocumentProcessor:
    """Replace heading paragraphs of a Word document with OneNote page content.

    A heading is replaced when its text contains the title of a page found in
    the first section of the notebook served by ``onenote_handler``.
    """

    # Tags that get_next_tag stops on; everything else is skipped.
    _BLOCK_TAGS = ('p', 'ul', 'ol', 'h1', 'h2', 'h3')

    def __init__(self, document, onenote_handler):
        """Keep the python-docx document and the OneNote handler to read pages from."""
        self.document = document
        self.onenote_handler = onenote_handler

    def delete_paragraph(self, paragraph):
        """Remove ``paragraph`` from its parent and clear the proxy's element references."""
        p = paragraph._element
        if p is None:
            return
        parent = p.getparent()
        if parent is not None:
            parent.remove(p)
        # Clear the proxy itself so later use fails fast instead of touching a detached node.
        paragraph._p = paragraph._element = None

    def replace_word_placeholder(self):
        """Merge every matching OneNote page into the document and return the document.

        Only pages of the first notebook section are considered. The document
        is returned unchanged when the notebook has no sections.
        """
        sections = self.onenote_handler.list_notebook_sections()
        if not sections:
            return self.document
        pages = self.onenote_handler.get_pages(sections[0]['self'])

        for page in pages:
            page_title = page.get('title')
            if not page_title:
                # '' is a substring of every string and would match all headings.
                continue
            page_content = None
            # Snapshot the list: merging inserts and removes paragraphs.
            for paragraph in list(self.document.paragraphs):
                if not paragraph.style.name.startswith('Heading'):
                    continue
                if page_title in paragraph.text:
                    print("Matched...", page_title)
                    if page_content is None:
                        # Downloaded on first match only, once per page.
                        page_content = self.onenote_handler.get_page_content(page['contentUrl'])
                    self.document = self.merge_onenote(paragraph, page_content)
        return self.document

    def merge_onenote(self, target_paragraph, page_content):
        """Insert the page's HTML content before ``target_paragraph``, then delete it.

        An empty anchor paragraph is created first; all translated content is
        inserted before that anchor, which stays in the document.
        """
        soup = BeautifulSoup(page_content, 'html.parser')
        root = soup.html
        if root is None:
            # Content without an <html> element: walk the whole parse tree.
            root = soup
        anchor = target_paragraph.insert_paragraph_before()
        self.process_html_to_word(anchor, root)
        self.delete_paragraph(target_paragraph)
        return self.document

    def process_html_to_word(self, paragraph, tag):
        """Translate every block tag that follows ``tag`` in document order."""
        while tag is not None:
            tag = self.get_next_tag(tag)
            if tag is None:
                break
            self.translate_tag(paragraph, tag)
            if tag.name in ('ul', 'ol'):
                # translate_tag already consumed every <li> below this list.
                # Jump to the subtree's last node so nested lists and
                # paragraphs inside items are not emitted a second time.
                for descendant in tag.descendants:
                    tag = descendant

    def get_next_tag(self, tag):
        """
        Helper function to get the next valid tag in the paragraph.

        Walks ``next_element`` links starting after ``tag`` and returns the
        first node whose name is one of p, ul, ol, h1, h2, h3. Text nodes,
        divs and all other tags are skipped. Returns ``None`` at the end.
        """
        # Iterative on purpose: one recursive call per skipped node overflows
        # the interpreter's recursion limit on long pages.
        node = tag.next_element
        while node is not None:
            if getattr(node, 'name', None) in self._BLOCK_TAGS:
                return node
            node = node.next_element

        print("No more elements")
        return None

    def translate_tag(self, paragraph, tag):
        """Write one HTML node into the document relative to ``paragraph``.

        Block tags (p, h1-h3, li) become new paragraphs inserted before
        ``paragraph``; inline tags and plain strings become runs appended to
        ``paragraph``. Lists are flattened: every ``<li>`` below the list is
        emitted, and an item's text includes the text of any nested items.
        """
        if isinstance(tag, str):
            paragraph.add_run(tag)
        elif tag.name == 'p':
            new_paragraph = paragraph.insert_paragraph_before()
            new_paragraph.add_run(tag.get_text())
        elif tag.name in ['strong', 'b']:
            run = paragraph.add_run(tag.get_text())
            run.bold = True
        elif tag.name == 'i':
            run = paragraph.add_run(tag.get_text())
            run.italic = True
        elif tag.name in ['h1', 'h2', 'h3']:
            new_paragraph = paragraph.insert_paragraph_before()
            # 'h2' -> 'Heading 2'
            new_paragraph.style = f'Heading {tag.name[-1]}'
            new_paragraph.add_run(tag.get_text())
        elif tag.name == 'br':
            paragraph.add_run().add_break()
        elif tag.name in ['ul', 'ol']:
            for li in tag.find_all('li'):
                self.translate_tag(paragraph, li)
        elif tag.name == 'li':
            paragraph.insert_paragraph_before(tag.get_text(), style='List Bullet')
        elif tag.name == 'a':
            self.add_hyperlink(paragraph, tag.get('href', ''), tag.get_text())

    def add_hyperlink(self, paragraph, url, text):
        """Append a blue, underlined external hyperlink showing ``text`` to ``paragraph``."""
        r_id = paragraph.part.relate_to(url, "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink", is_external=True)
        hyperlink = OxmlElement('w:hyperlink')
        hyperlink.set(qn('r:id'), r_id)

        # Run properties for the conventional link look.
        rPr = OxmlElement('w:rPr')
        color = OxmlElement('w:color')
        color.set(qn('w:val'), '0000FF')
        underline = OxmlElement('w:u')
        underline.set(qn('w:val'), 'single')
        rPr.append(color)
        rPr.append(underline)

        # Let python-docx build the text run, then move that run into the
        # hyperlink element. A w:r must not be nested inside another w:r.
        run = paragraph.add_run(text)
        run._r.insert(0, rPr)
        hyperlink.append(run._r)
        paragraph._element.append(hyperlink)

# Main conversion class
class SharePointDocumentConverter:
    """Download a Word document from SharePoint, merge OneNote pages into it and upload the draft."""

    def __init__(self, graph_client, hostname, site_name, doc_name, onenote_url):
        """Build the OneNote and SharePoint handlers.

        Args:
            graph_client: Client exposing ``token``, ``get`` and ``put``.
            hostname: SharePoint host name.
            site_name: SharePoint site name.
            doc_name: Name of the Word document in the site's drive root.
            onenote_url: Web URL of the OneNote notebook to merge from.
        """
        self.graph_client = graph_client
        self.hostname = hostname
        self.site_name = site_name
        self.doc_name = doc_name
        # Take the token from the injected client rather than from a
        # module-level ``token`` variable that may not exist.
        self.onenote_handler = OneNoteHandler(self.graph_client.token, onenote_url)
        self.sharepoint_handler = SharePointHandler(self.graph_client, self.hostname, self.site_name, self.doc_name)

    def _draft_title(self, document):
        """Return the draft file name: first paragraph text (or the doc name stem) plus ``_Draft.docx``."""
        paragraphs = document.paragraphs
        base = paragraphs[0].text if paragraphs else ""
        if not base.strip():
            # Empty document or blank first paragraph: fall back to the source name.
            base = os.path.splitext(self.doc_name)[0]
        return base + "_Draft.docx"

    def convert_document(self):
        """Run the download, merge, local save and upload steps.

        Does nothing when the source document cannot be downloaded. The draft
        is saved in the current working directory before it is uploaded.
        """
        word_doc = self.sharepoint_handler.get_word_document()
        if word_doc is None:
            return

        processor = WordDocumentProcessor(word_doc, self.onenote_handler)
        updated_doc = processor.replace_word_placeholder()
        title = self._draft_title(updated_doc)
        updated_doc.save(title)
        self.sharepoint_handler.upload_document(updated_doc, title)
