# Bookmark Utilities
[*v0.1*]

Prototyping notebook for bookmark scraping utilities.

# Setup

Cells in this section handle notebook setup, like importing packages and functions/vars from scripts.

## Imports

Import `stdlib` packages (e.g. `pathlib.Path`) and package dependencies.

### stdlib

In [None]:
from pathlib import Path
import json
from typing import Any, Optional, Union
from dataclasses import dataclass, field
import random

In [None]:
from core.config import app_settings, logging_settings
from utils.logger import get_logger


from core.db import Base, get_engine, get_session, create_base_metadata
from utils.validators import valid_dir

from main import get_files, read_bookmarks

### Dependencies

Packages installed with `pip` (or some equivalent tool)

In [None]:
import bs4
from bs4 import BeautifulSoup, Tag, ResultSet

from pydantic import BaseModel, validator, ValidationError, Field

## Global Vars

Variables for use throughout the notebook

In [None]:
## Notebook output switches
#  nb_verbose: when True, cells also display raw file contents and parsed tags.
#  nb_log: intended toggle for per-item log output (currently only referenced
#          from commented-out code).
nb_verbose: bool = False
nb_log: bool = True

In [None]:
## Location of the bookmarks export to read
#  The full path is assembled as "<dir>/<file>" using a forward slash.
bookmarks_dir: str = "bookmarks"
bookmarks_file: str = "bookmarks_6_13_23.html"
bookmarks_file_path: str = "/".join((bookmarks_dir, bookmarks_file))

## Functions

Notebook-level functions. These differ from functions imported from scripts in that they are either prototypes, or functions meant only for the notebook.

### Notebook Functions

### Prototype

## Classes

Notebook-level classes. These differ from classes/models imported from scripts in that they are either prototypes, or classes meant only for the notebook.

In [None]:
## Model backend switch: build the notebook's classes either as stdlib
#  dataclasses or as Pydantic models.
#  Exactly one of the two flags must be True.

USE_PYDANTIC: bool = True
USE_DATACLASS: bool = False

In [None]:
## Validate the backend flags: exactly one of them has to be enabled
if USE_DATACLASS and USE_PYDANTIC:
    raise ValueError(
        "USE_DATACLASS and USE_PYDANTIC cannot both be 'True' you must set only 1 to 'True'."
    )

if not (USE_DATACLASS or USE_PYDANTIC):
    raise ValueError("USE_DATACLASS or USE_PYDANTIC must be 'True'.")

In [None]:
if USE_DATACLASS:

    ## Dataclass describing a file or directory on disk.
    #  'type' is checked against allowed_types when an instance is created;
    #  because the default 'type' is None, a valid value must always be passed.
    @dataclass
    class FileObj:
        path_obj: Path = None
        absolute_path: str = None
        type: str = None
        name: str = None
        # TODO: resolve_path, posix_path and uri fields are not implemented yet
        is_symlink: bool = False
        owner: str = None
        parents: list[str] = None
        parent: str = None

        @property
        def path(self) -> str:
            """Return `path_obj` converted to a string."""
            return str(self.path_obj)

        # TODO: an `ext` property was sketched here but never finished

        @property
        def allowed_types(self) -> list[str]:
            """Return the values accepted for `type`."""
            return ["file", "dir"]

        def __post_init__(self):
            """Raise ValueError when `type` is not one of `allowed_types`."""
            if self.type in self.allowed_types:
                return

            raise ValueError(f"File object 'type' must be one of {self.allowed_types}")

In [None]:
if USE_PYDANTIC:
    ## Values accepted for FileObj.type. Defined outside the model because the
    #  'type' validator runs as a classmethod, where the `allowed_types`
    #  property (usable on instances only) cannot be read.
    _FILE_OBJ_ALLOWED_TYPES: tuple[str, ...] = ("file", "dir")

    class FileObj(BaseModel):
        """Pydantic model describing a file or directory on disk.

        Every field is optional and defaults to None (False for `is_symlink`).
        A `type` value passed to the model must be one of `allowed_types`;
        the check is not applied to the default, since Pydantic does not run
        validators on defaults unless asked to.
        """

        path_obj: Optional[Path] = Field(default=None)
        absolute_path: Optional[str] = Field(default=None)
        type: Optional[str] = Field(default=None)
        name: Optional[str] = Field(default=None)
        # TODO: resolve_path, posix_path and uri fields are not implemented yet
        is_symlink: bool = Field(default=False)
        owner: Optional[str] = Field(default=None)
        parents: Optional[list[str]] = Field(default=None)
        parent: Optional[str] = Field(default=None)

        @property
        def path(self) -> str:
            """Return `path_obj` converted to a string."""
            return str(self.path_obj)

        # TODO: an `ext` property was sketched here but never finished

        @property
        def allowed_types(self) -> list[str]:
            """Return the values accepted for `type`."""
            return list(_FILE_OBJ_ALLOWED_TYPES)

        @validator("type")
        def valid_type(cls, v) -> str:
            """Reject any `type` that is not an allowed value.

            Raises:
                ValueError: reported by Pydantic as a validation error on the
                    `type` field. ValidationError must not be raised directly
                    from a validator.
            """
            if v not in _FILE_OBJ_ALLOWED_TYPES:
                raise ValueError(
                    f"'type' must be one of {list(_FILE_OBJ_ALLOWED_TYPES)}"
                )

            return v

In [None]:
if USE_DATACLASS:

    ## Dataclass for a bookmark folder heading found in the bookmarks HTML.
    #  All fields default to None.
    @dataclass
    class BookmarkFolder:
        name: str = None
        add_date: str = None
        last_modified: str = None
        ## The bs4.Tag the folder was built from
        bs4_tag: Tag = None

In [None]:
if USE_PYDANTIC:

    ## Pydantic model for a bookmark folder heading found in the bookmarks HTML.
    #  All fields default to None.
    class BookmarkFolder(BaseModel):
        class Config:
            ## Needed so the bs4.Tag field is accepted as a field type
            arbitrary_types_allowed = True

        name: str = None
        add_date: str = None
        last_modified: str = None
        ## The bs4.Tag the folder was built from
        bs4_tag: Tag = None

In [None]:
if USE_DATACLASS:

    ## Dataclass for a single bookmark link found in the bookmarks HTML.
    #  All fields default to None.
    @dataclass
    class Bookmark:
        href: str = None
        add_date: str = None
        icon: str = None
        description: str = None
        url: str = None
        name: str = None
        folder: str = None
        ## The bs4.Tag the bookmark was built from
        bs4_tag: Tag = None

In [None]:
if USE_PYDANTIC:

    ## Pydantic model for a single bookmark link found in the bookmarks HTML.
    #  All fields default to None.
    class Bookmark(BaseModel):
        class Config:
            ## Needed so the bs4.Tag field is accepted as a field type
            arbitrary_types_allowed = True

        href: str = None
        add_date: str = None
        icon: str = None
        description: str = None
        url: str = None
        name: str = None
        folder: str = None
        ## The bs4.Tag the bookmark was built from
        bs4_tag: Tag = None

# Operations

Functions & data operations.

### Create SQLAlchemy table metadata

In [None]:
## Build the database objects used by the notebook.
#  NOTE(review): the helpers live in core.db and are not visible here.
#  Presumably this opens the SQLite file with SQL echo enabled, creates the
#  tables registered on Base, and returns a session factory — confirm
#  against core.db.
engine = get_engine(connection="db/nb_demo.sqlite", echo=True)
create_base_metadata(base_obj=Base, engine=engine)
SessionLocal = get_session(engine=engine)

In [None]:
## List the files available in the bookmarks directory
display(f"Getting files from {bookmarks_dir}")

## get_files is imported from main. The result is indexed with ["files"] in
#  a later cell, so it is presumably a dict holding a list of paths — confirm.
bookmark_files = get_files(bookmarks_dir)

display(f"Bookmark files: {bookmark_files}")

In [None]:
## Grab a sample bookmark from the list of files
bookmark_files_len = len(bookmark_files["files"])
sel: int = random.randint(0, bookmark_files_len - 1)

display(f"Pulling sample from bookmark files list. Random index: [{sel}]")

_sample: Path = bookmark_files["files"][sel]

display(f"Sample: {_sample}")

In [None]:
## Read contents of sample bookmark
html_contents = read_bookmarks(file=_sample)

if nb_verbose:
    display(f"Bookmark file contents:")
    display(html_contents)

In [None]:
## Create BeautifulSoup from html_contents
soup = BeautifulSoup(html_contents, "lxml")

## Get all <dt> tags
_dt: ResultSet = soup.find_all("dt")

if nb_verbose:
    display(f"All <dt> tags:")
    display(_dt)

In [None]:
## Initialize empty lists to store bookmark folders and links
html_folders: list[BookmarkFolder] = []
html_bookmarks: list[Bookmark] = []

In [None]:
## Loop over results from <dt> tag search
for line in _dt:
    # if nb_log:
    #     display(f"line: {line}")

    item: Tag = line.find_next()
    # if nb_verbose:
    #     display(f"Item: {item}")

    if item.name == "h3":
        # for k in item.attrs.keys():
        #     display(f"Attribute key [{k}] ({type(item.attrs[k])}): {item.attrs[k]}")

        folder_name = item.text

        folder_dict = {
            "name": folder_name,
            "add_date": item.attrs["add_date"],
            "last_modified": item.attrs["last_modified"],
            "bs4_tag": item,
        }

        if USE_PYDANTIC:
            _folder = BookmarkFolder.parse_obj(folder_dict)

        if USE_DATACLASS:
            _folder = BookmarkFolder(**folder_dict)

        html_folders.append(_folder)

        continue

    else:
        # log.debug(
        #     f"URL: {item.get('href')}, Website Name: {item.text}, Add Date: {item.get('add_date')}, Folder name: {folder_name}"
        # )
        # html_bookmarks.append(item)

        # display(f"Bookmark dict: {item.__dict__}")

        # display(f"Bookmark attrs: {item.attrs}")

        bookmark_dict = {
            "href": None,
            "add_date": None,
            "icon": None,
            "description": None,
            "url": None,
            "name": None,
            "folder": None,
            "bs4_tag": item,
        }

        for k in item.attrs.keys():
            attrs_dict = item.attrs

            if not k in item.attrs.keys():
                bookmark_dict[k] = None

            else:
                bookmark_dict[k] = item.attrs[k]

        bookmark_dict["url"] = item.get("href")
        bookmark_dict["name"] = item.text
        bookmark_dict["folder"] = folder_name

        # display(f"Bookmark dict: {bookmark_dict}")

        if USE_PYDANTIC:
            bookmark: Bookmark = Bookmark.parse_obj(bookmark_dict)

        if USE_DATACLASS:
            bookmark: Bookmark = Bookmark(**bookmark_dict)

        # display(f"Bookmark class: {bookmark}")

        html_bookmarks.append(bookmark)

In [None]:
## Report how many folders and bookmarks the <dt> loop collected
display(f"Bookmark folders: {len(html_folders)}")
display(f"Bookmarks: {len(html_bookmarks)}")

In [None]:
## Grab a random bookmark folder
folder_index = random.randint(0, len(html_folders) - 1)
display(f"Sampling bookmark folder at index [{folder_index}]")

_sample_bookmark_folder: BookmarkFolder = html_folders[folder_index]
display(f"Sample bookmark folder:")
display(_sample_bookmark_folder)

In [None]:
## Grab a random bookmark
bookmark_index = random.randint(0, len(html_bookmarks) - 1)
display(f"Sampling bokmark at index [{bookmark_index}]")

_sample_bookmark: Bookmark = html_bookmarks[bookmark_index]
display(f"Sample bookmark:")
display(_sample_bookmark)

## Examples