# Chrome Bookmark Organizer

This notebook analyzes and reorganizes Chrome bookmarks while:
- Maintaining top-level folder structure
- Identifying duplicate bookmarks
- Flagging very long URLs
- Handling bookmarks with missing titles

## Usage
1. Export Chrome bookmarks to HTML (Chrome -> Bookmarks -> Bookmark Manager -> ⋮ -> Export bookmarks)
2. Place the exported file in this directory
3. Run all cells in this notebook
4. Review the analysis before applying any changes


In [13]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from collections import defaultdict
from urllib.parse import urlparse
import matplotlib.pyplot as plt
# Seaborn import removed since it's not installed
from IPython.display import display, HTML
from datetime import datetime

# Constants
BOOKMARK_FILE = 'bookmarks_20250825.html'
LONG_URL_THRESHOLD = 100  # URLs longer than this will be flagged


In [None]:
def parse_bookmarks(file_path):
    """
    Parse Chrome bookmarks HTML file into a structured format.
    Returns a tuple of (all_bookmarks, folder_structure)
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Bookmark file not found: {file_path}")
        
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
        
    # Fix potential HTML issues
    content = content.replace('&', '&amp;')
    soup = BeautifulSoup(content, 'html.parser')
    
    bookmarks = []
    folder_structure = defaultdict(list)
    current_folder = ["root"]
    
    def process_node(node, depth=0):
        if not node:
            return
            
        if isinstance(node, str):
            return
            
        # Handle DL tags
        if node.name == 'dl':
            for item in node.find_all('dt', recursive=False):
                process_node(item, depth)
            return
            
        # Handle DT tags
        if node.name == 'dt':
            # Check for folder (H3 tag)
            h3 = node.find('h3', recursive=False)
            if h3:
                folder_name = h3.text.strip()
                if h3.get('personal_toolbar_folder') == 'true':
                    folder_name = "Bookmarks Bar"
                current_folder.append(folder_name)
                
                # Process folder contents
                dl = node.find('dl')
                if dl:
                    process_node(dl, depth + 1)
                current_folder.pop()
                return
                
            # Check for bookmark (A tag)
            a = node.find('a', recursive=False)
            if a:
                url = a.get('href', '')
                if url.startswith('javascript:'):
                    return
                    
                title = a.text.strip() or url
                add_date = a.get('add_date', '')
                if add_date:
                    try:
                        add_date = datetime.fromtimestamp(int(add_date)).isoformat()
                    except:
                        add_date = ''
                
                bookmark = {
                    'title': title,
                    'url': url,
                    'folder_path': '/'.join(current_folder),
                    'depth': depth,
                    'add_date': add_date,
                    'icon': a.get('icon', '')
                }
                bookmarks.append(bookmark)
                folder_structure['/'.join(current_folder)].append(bookmark)
                return
                
            # Process any nested DL
            dl = node.find('dl')
            if dl:
                process_node(dl, depth)
    
    # Start processing from the root DL tag
    root_dl = soup.find('dl')
    if root_dl:
        process_node(root_dl)
    
    if not bookmarks:
        print("Warning: No bookmarks were found. This might indicate a parsing issue.")
        print("First 100 characters of file:", content[:100])
        
    df = pd.DataFrame(bookmarks)
    if len(df) == 0:
        print("DataFrame Info:")
        print(df.info())
        print("\nDataFrame Columns:", list(df.columns))
        
    return df, dict(folder_structure)


In [None]:
# Test the parser
print("Testing bookmark parser...")
try:
    df, folder_structure = parse_bookmarks(BOOKMARK_FILE)
    print(f"\nFound {len(df)} bookmarks in {len(folder_structure)} folders")
    print("\nSample of parsed bookmarks:")
    if len(df) > 0:
        print(df[['title', 'url', 'folder_path']].head())
    else:
        print("No bookmarks found!")
except Exception as e:
    print(f"Error parsing bookmarks: {str(e)}")


In [20]:
def analyze_bookmarks(df, folder_structure):
    """
    Analyze bookmarks for issues and patterns.
    Returns a dictionary of analysis results.
    """
    analysis = {
        'total_bookmarks': len(df),
        'total_folders': len(folder_structure),
        'empty_titles': df[df['title'] == df['url']].shape[0],
        'long_urls': df[df['url'].str.len() > LONG_URL_THRESHOLD].shape[0],
        'avg_depth': df['depth'].mean(),
        'duplicates': {}
    }
    
    # Find duplicate URLs
    duplicates = df[df.duplicated(['url'], keep=False)].sort_values('url')
    analysis['duplicate_urls'] = len(duplicates['url'].unique())
    analysis['duplicate_bookmarks'] = len(duplicates)
    
    # Group duplicates for detailed review
    if not duplicates.empty:
        analysis['duplicates'] = duplicates.groupby('url').apply(
            lambda x: x[['title', 'folder_path', 'add_date']].to_dict('records')
        ).to_dict()
    
    # Analyze folder distribution
    folder_counts = df.groupby('folder_path').size()
    analysis['folder_distribution'] = folder_counts.to_dict()
    analysis['largest_folders'] = folder_counts.nlargest(5).to_dict()
    
    return analysis

def display_analysis(analysis, df):
    """
    Create a formatted display of the bookmark analysis.
    """
    html_output = [
        "<h2>Bookmark Analysis Summary</h2>",
        f"<p>Total Bookmarks: {analysis['total_bookmarks']}</p>",
        f"<p>Total Folders: {analysis['total_folders']}</p>",
        f"<p>Bookmarks without titles: {analysis['empty_titles']}</p>",
        f"<p>URLs longer than {LONG_URL_THRESHOLD} characters: {analysis['long_urls']}</p>",
        f"<p>Average folder depth: {analysis['avg_depth']:.1f}</p>",
        
        "<h3>Largest Folders</h3>",
        "<ul>"
    ]
    
    for folder, count in analysis['largest_folders'].items():
        html_output.append(f"<li>{folder}: {count} bookmarks</li>")
    
    html_output.append("</ul>")
    
    if analysis['duplicate_urls'] > 0:
        html_output.extend([
            "<h3>Duplicate URLs</h3>",
            f"<p>Found {analysis['duplicate_urls']} URLs duplicated across {analysis['duplicate_bookmarks']} bookmarks</p>",
            "<table border='1'>",
            "<tr><th>URL</th><th>Occurrences</th><th>Locations</th></tr>"
        ])
        
        for url, instances in analysis['duplicates'].items():
            locations = [f"{b['folder_path']} ({b['title']})" for b in instances]
            html_output.append(
                f"<tr><td>{url[:50]}{'...' if len(url) > 50 else ''}</td>"
                f"<td>{len(instances)}</td>"
                f"<td>{' | '.join(locations)}</td></tr>"
            )
        
        html_output.append("</table>")
    
    # Display long URLs
    if analysis['long_urls'] > 0:
        html_output.extend([
            "<h3>Long URLs</h3>",
            "<table border='1'>",
            "<tr><th>Title</th><th>URL Length</th><th>URL</th></tr>"
        ])
        
        long_urls = df[df['url'].str.len() > LONG_URL_THRESHOLD]
        for _, row in long_urls.iterrows():
            html_output.append(
                f"<tr><td>{row['title']}</td>"
                f"<td>{len(row['url'])}</td>"
                f"<td>{row['url'][:50]}...</td></tr>"
            )
        
        html_output.append("</table>")
    
    return HTML(''.join(html_output))


In [22]:
# Load and analyze bookmarks
try:
    # Print the contents of the bookmark file for debugging
    print("Attempting to read bookmark file...")
    with open(BOOKMARK_FILE, 'r', encoding='utf-8') as f:
        print(f"First 500 characters of file:\n{f.read(500)}\n...")
    
    df, folder_structure = parse_bookmarks(BOOKMARK_FILE)
    
    # Debug dataframe contents
    print("\nDataFrame Info:")
    print(df.info())
    print("\nDataFrame Columns:", df.columns.tolist())
    print("\nFirst few rows:")
    print(df.head())
    
    # Verify required columns exist before analysis
    required_cols = ['title', 'url', 'folder_path']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {', '.join(missing_cols)}")
        
    analysis = analyze_bookmarks(df, folder_structure)
    display(display_analysis(analysis, df))
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("\nPlease export your Chrome bookmarks and place the file in this directory.")
except ValueError as e:
    print(f"Error: {e}")
    print("\nThe bookmark file structure is unexpected. Please check the parsing logic.")
    print("\nExpected columns: title, url, folder_path")
    print("\nPlease ensure the parse_bookmarks() function is correctly extracting these fields.")
except Exception as e:
    print(f"Unexpected error: {e}")
    print("\nPlease ensure you exported bookmarks correctly from Chrome:")
    print("Chrome -> Bookmarks -> Bookmark Manager -> ⋮ -> Export bookmarks")


Attempting to read bookmark file...
First 500 characters of file:
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
     It will be read and overwritten.
     DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
    <DT><H3 ADD_DATE="1706913652" LAST_MODIFIED="1755640208" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks Bar</H3>
    <DL><p>
        <DT><H3 ADD_DATE="1603242345" LAST_MODIFIED="1749235301">ADSK</H3>
        <DL><p>
            <DT><A HREF=
...

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
None

DataFrame Columns: []

First few rows:
Empty DataFrame
Columns: []
Index: []
Error: Missing required columns: title, url, folder_path

The bookmark file structure is unexpected. Please check the parsing logic.

Expected columns: title, url, folder_path

Please ensure the parse_bookmarks() function is correctly extracting these f

In [None]:
def visualize_folder_structure(df, folder_structure):
    """
    Create visualizations of the bookmark folder structure
    """
    # Set up the plotting style
    plt.style.use('seaborn')
    
    # Create a figure with multiple subplots
    fig = plt.figure(figsize=(15, 10))
    
    # 1. Folder size distribution
    plt.subplot(2, 2, 1)
    folder_sizes = df.groupby('folder_path').size()
    sns.histplot(folder_sizes, bins=20)
    plt.title('Folder Size Distribution')
    plt.xlabel('Number of Bookmarks')
    plt.ylabel('Number of Folders')
    
    # 2. Bookmark depth distribution
    plt.subplot(2, 2, 2)
    sns.histplot(df['depth'], bins=range(df['depth'].max() + 2))
    plt.title('Bookmark Depth Distribution')
    plt.xlabel('Folder Depth')
    plt.ylabel('Number of Bookmarks')
    
    # 3. URL length distribution
    plt.subplot(2, 2, 3)
    url_lengths = df['url'].str.len()
    sns.histplot(url_lengths, bins=30)
    plt.axvline(x=LONG_URL_THRESHOLD, color='r', linestyle='--', label=f'Long URL threshold ({LONG_URL_THRESHOLD})')
    plt.title('URL Length Distribution')
    plt.xlabel('URL Length (characters)')
    plt.ylabel('Number of Bookmarks')
    plt.legend()
    
    # 4. Top-level folder distribution
    plt.subplot(2, 2, 4)
    top_level_folders = df['folder_path'].apply(lambda x: x.split('/')[1] if len(x.split('/')) > 1 else 'root')
    top_folder_counts = top_level_folders.value_counts()
    plt.pie(top_folder_counts.values, labels=top_folder_counts.index, autopct='%1.1f%%')
    plt.title('Top-level Folder Distribution')
    
    plt.tight_layout()
    return fig

# Create and display visualizations if bookmarks are loaded
try:
    fig = visualize_folder_structure(df, folder_structure)
    plt.show()
except NameError:
    print("Please load bookmarks first")


In [None]:
def suggest_reorganization(df, folder_structure):
    """
    Suggest reorganization of bookmarks while preserving top-level structure.
    Returns a dictionary of suggested changes.
    """
    suggestions = {
        'duplicates_to_remove': [],
        'moves': [],
        'long_urls_to_review': []
    }
    
    # Handle duplicates
    duplicates = df[df.duplicated(['url'], keep=False)].sort_values(['url', 'add_date'])
    for url in duplicates['url'].unique():
        dupes = duplicates[duplicates['url'] == url]
        # Keep the newest bookmark in its current location
        keep_idx = dupes['add_date'].idxmax()
        remove_idx = dupes.index.difference([keep_idx])
        
        for idx in remove_idx:
            suggestions['duplicates_to_remove'].append({
                'url': dupes.loc[idx, 'url'],
                'from_folder': dupes.loc[idx, 'folder_path'],
                'keep_in': dupes.loc[keep_idx, 'folder_path']
            })
    
    # Suggest moves for bookmarks in deep folders
    deep_bookmarks = df[df['depth'] > 3]  # Suggest reorganizing bookmarks deeper than 3 levels
    for _, bookmark in deep_bookmarks.iterrows():
        folder_parts = bookmark['folder_path'].split('/')
        if len(folder_parts) > 2:  # Has at least a top-level folder
            top_level = folder_parts[1]
            suggested_folder = f"root/{top_level}"
            
            suggestions['moves'].append({
                'url': bookmark['url'],
                'title': bookmark['title'],
                'from_folder': bookmark['folder_path'],
                'to_folder': suggested_folder,
                'reason': 'Reduce folder depth'
            })
    
    # Flag very long URLs for review
    long_urls = df[df['url'].str.len() > LONG_URL_THRESHOLD]
    for _, bookmark in long_urls.iterrows():
        suggestions['long_urls_to_review'].append({
            'url': bookmark['url'],
            'title': bookmark['title'],
            'folder': bookmark['folder_path'],
            'length': len(bookmark['url'])
        })
    
    return suggestions

def display_suggestions(suggestions):
    """
    Display reorganization suggestions in a formatted way
    """
    html_output = ["<h2>Suggested Reorganization</h2>"]
    
    if suggestions['duplicates_to_remove']:
        html_output.extend([
            "<h3>Duplicate Bookmarks to Remove</h3>",
            "<table border='1'>",
            "<tr><th>URL</th><th>Remove from</th><th>Keep in</th></tr>"
        ])
        
        for dup in suggestions['duplicates_to_remove']:
            html_output.append(
                f"<tr><td>{dup['url'][:50]}...</td>"
                f"<td>{dup['from_folder']}</td>"
                f"<td>{dup['keep_in']}</td></tr>"
            )
        html_output.append("</table>")
    
    if suggestions['moves']:
        html_output.extend([
            "<h3>Suggested Moves</h3>",
            "<table border='1'>",
            "<tr><th>Title</th><th>From Folder</th><th>To Folder</th><th>Reason</th></tr>"
        ])
        
        for move in suggestions['moves']:
            html_output.append(
                f"<tr><td>{move['title']}</td>"
                f"<td>{move['from_folder']}</td>"
                f"<td>{move['to_folder']}</td>"
                f"<td>{move['reason']}</td></tr>"
            )
        html_output.append("</table>")
    
    if suggestions['long_urls_to_review']:
        html_output.extend([
            "<h3>Long URLs to Review</h3>",
            "<table border='1'>",
            "<tr><th>Title</th><th>Folder</th><th>URL Length</th><th>URL</th></tr>"
        ])
        
        for url in suggestions['long_urls_to_review']:
            html_output.append(
                f"<tr><td>{url['title']}</td>"
                f"<td>{url['folder']}</td>"
                f"<td>{url['length']}</td>"
                f"<td>{url['url'][:50]}...</td></tr>"
            )
        html_output.append("</table>")
    
    return HTML(''.join(html_output))

# Generate and display reorganization suggestions
try:
    suggestions = suggest_reorganization(df, folder_structure)
    display(display_suggestions(suggestions))
except NameError:
    print("Please load bookmarks first")
