# Chamoru.Info Dictionary Scraper

**About This Project**

This script scrapes and processes Chamorro dictionary data from chamoru.info as part of a broader effort to collect and analyze online Chamorro language resources. The goal is to support language revitalization by enabling further linguistic analysis and the development of digital learning tools.

**Name:** Schyuler Lujan <br>
**Date Completed:** 24-November-2020 <br>
**Date Updated:** 13-May-2025

# Import Libraries

In [122]:
# Import libraries for webscraping
import requests
from bs4 import BeautifulSoup
import time

# Import libraries for exporting data
import json
import csv
import pandas as pd
import os

# Create URLs

In this section we will build the list of page URLs that hold the data we want to scrape. The dictionary at Chamoru.info has a predictable URL structure (a letter heading plus a page number), so every page address can be generated rather than discovered by crawling.

In [10]:
def create_url(letters=None):
    """
    Creates a list of all the urls for every page of the online dictionary on the chamoru.info website

    Parameters
    ----------
    letters : dict, optional
        Maps each letter heading, exactly as it must appear in the URL
        (already percent-encoded where needed), to a two-item
        ``[first_page, last_page]`` range, both ends inclusive. When omitted,
        the built-in table of headings and page counts below is used. Pass a
        smaller table (e.g. ``{'A': [1, 7]}``) to scrape only part of the site.

    Returns
    -------
    list of str
        One URL per page, ordered by heading (in the table's order) and then
        by ascending page number.
    """
    if letters is None:
        # '%C3%85' and '%C3%91' are the UTF-8 percent-encodings of 'Å' and 'Ñ'
        letters = {'A': [1, 7], '%C3%85': [1, 22], 'B': [1, 19], 'CH': [1, 10], 'D': [1, 15], 'E': [1, 11], 'F': [1, 13],
                   'G': [1, 13], 'H': [1, 10], 'I': [1, 7], 'K': [1, 24], 'L': [1, 11], 'M': [1, 22], 'N': [1, 10],
                   '%C3%91': [1, 2], 'NG': [1, 1], 'O': [1, 4], 'P': [1, 20], 'R': [1, 5], 'S': [1, 17], 'T': [1, 19],
                   'U': [1, 3], 'Y': [1, 3]}

    # Fixed pieces of every page address: <address1><letter><address2><page>
    address1 = 'http://www.chamoru.info/dictionary/display.php?action=search&by='
    address2 = '&nr_page='

    web_addresses = []

    for key, page_range in letters.items():
        head = address1 + key + address2
        first_page, last_page = page_range

        # last_page is inclusive, hence the +1 on the range bound
        web_addresses.extend(head + str(page) for page in range(first_page, last_page + 1))

    return web_addresses

In [12]:
# Build the full list of dictionary page URLs (one string per page);
# create_url() only assembles strings and makes no network requests.
urls = create_url()

## Export URLs to CSV

In [120]:
# Convert Python list to dataframe (a single unnamed column, one URL per row)
urls_df = pd.DataFrame(urls)

# Define folder name and filename.
# base_path is the absolute path of the directory that contains the
# "Chamorro-Dictionary-Scraper" folder. Replace the placeholder before running,
# and end it with a path separator (e.g. "/home/me/projects/") so it also works
# anywhere it is combined by plain string concatenation.
base_path = "PUT_YOUR_ABSOLUTE_PATH_HERE"
folder_path = os.path.join(base_path, "Chamorro-Dictionary-Scraper", "exports", "csv")
file_name = "chamoru_info_urls.csv"
file_path = os.path.join(folder_path, file_name)

# Verify the folder exists; if it doesn't, create it (to_csv will not create it)
os.makedirs(folder_path, exist_ok=True)

# Export to CSV
urls_df.to_csv(file_path, index=False)

# Get Dictionary Contents

In this section we will navigate to each of the URLs created in the previous section and scrape the dictionary entries from the website.

In [36]:
def get_dictionary_content(addresses):
    """
    Gets the contents of the websites with the urls generated from create_url and parses the contents

    Parameters
    ----------
    addresses : iterable of str
        Page URLs to download, visited in the order given.

    Returns
    -------
    dict
        Maps the text of each ``<dd>`` element to the text of the ``<dt>``
        element that follows it on the same page. Keys are unique, so if the
        same term text occurs more than once the last definition seen wins.

    Raises
    ------
    requests.HTTPError
        If any page answers with an error status (via ``raise_for_status``).
        Other ``requests`` exceptions (e.g. a timeout after 10 seconds)
        propagate unchanged; nothing collected so far is returned in that case.
    """
    # initialize the dictionary for storing all terms and definitions
    dictionary_data = {}

    for url in addresses:

        # Go to the url and parse the html
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Let requests guess the charset from the body so accented letters decode correctly
        response.encoding = response.apparent_encoding

        soup = BeautifulSoup(response.text, "html.parser")

        # Start every page with a clean slate so a term left over from the
        # previous page can never be paired with a definition on this one.
        current_term = None

        # find_all walks the whole document in order and simply yields nothing
        # for a page without a <body>, where soup.body.descendants would raise.
        # NOTE(review): <dd> is treated as the term and <dt> as the definition,
        # the reverse of standard HTML usage -- presumably this matches the
        # site's markup; confirm against a live page.
        for tag in soup.find_all(["dd", "dt"]):
            text = tag.get_text(strip=True)
            if tag.name == "dd":
                current_term = text
            elif current_term and text:
                # Only store complete pairs (non-empty term and definition)
                dictionary_data[current_term] = text

    return dictionary_data

In [38]:
# Get dictionary contents: downloads every page in `urls` (one HTTP request
# per URL) and collects the scraped term -> definition pairs into a dict.
dictionary_contents = get_dictionary_content(urls)

# Export Dictionary Contents

## Export to JSON

In [80]:
# Take a plain-dict copy of dictionary_contents for export
dictionary_dict = dict(dictionary_contents)

# Define folder name and filename.
# os.path.join inserts the separator itself, so the result is correct whether
# or not base_path ends with one.
folder_path = os.path.join(base_path, "Chamorro-Dictionary-Scraper", "exports", "json")
file_name = "chamoru_info_dictionary.json"
file_path = os.path.join(folder_path, file_name)

# Verify the folder exists; if it doesn't, create it
os.makedirs(folder_path, exist_ok=True)

# Write dictionary_contents to JSON.
# ensure_ascii=False keeps accented Chamorro letters readable instead of \u escapes.
with open(file_path, mode="w", encoding="utf-8") as file:
    json.dump(dictionary_dict, file, ensure_ascii=False, indent=2)

## Export to CSV

In [107]:
# Convert dictionary_contents to dataframe: one row per entry, with the
# dictionary key in "term" and its value in "definition"
dictionary_df = pd.DataFrame(list(dictionary_contents.items()), columns=["term", "definition"])

# Define folder and filename.
# os.path.join inserts the separator itself, so the result is correct whether
# or not base_path ends with one.
folder_path = os.path.join(base_path, "Chamorro-Dictionary-Scraper", "exports", "csv")
file_name = "chamoru_info_dictionary.csv"
file_path = os.path.join(folder_path, file_name)

# Verify the folder exists; if it doesn't, create it (to_csv will not create it)
os.makedirs(folder_path, exist_ok=True)

# Export dataframe to CSV
dictionary_df.to_csv(file_path, index=False)

# Online Resources

How to create a Dictionary in Python || Web Scraping

https://www.youtube.com/watch?v=atDgcb-ImMo