In [2]:
'''Web Scraping Tool for Chamorro English Dictionary      Schyuler Lujan      November 24, 2020'''

'Web Scraping Tool for Chamorro English Dictionary      Schyuler Lujan      November 24, 2020'

In [1]:
import os

In [30]:
# Import relevant libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [5]:
# Load GitHub token from the environment
token = os.getenv("GITHUB_PAT")

# Construct the repo URL securely
repo_url = f"https://<schyuler-lujan>:${token}@github.com/<schyuler-lujan>/<Chamorro-Dictionary-Scraper>.git"

# Git commands to add, commit, and push
!git config user.name "Schyuler Lujan"
!git config user.email "schyuler.lujan@gmail.com"

!git add .
!git commit -m "Reorganize into new file tree"
!git push {https://www.github.com/schyuler/Chamorro-Dictionary-Scraper}

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	deleted:    ../../../Chamoru - English Dictionary Web Scraping Tool.ipynb
	deleted:    ../../../chamoru_english_dictionary_char.csv

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	../../../exports/
	../../../input/

no changes added to commit (use "git add" and/or "git commit -a")


fatal: protocol '{https' is not supported


In [2]:
def create_url():
    """Build the URL of every results page of the chamoru.info online dictionary.

    Each starting letter is mapped to an inclusive [first_page, last_page]
    range, and one URL is generated per page.

    Returns:
        list[str]: page URLs grouped by letter in the order the letters are
        listed below, with page numbers ascending within each letter.
    """
    # Inclusive page range per starting letter.
    # '%C3%85' and '%C3%91' are the percent-encoded UTF-8 forms of Å and Ñ.
    page_ranges = {
        'A': [1, 7], '%C3%85': [1, 22], 'B': [1, 19], 'CH': [1, 10],
        'D': [1, 15], 'E': [1, 11], 'F': [1, 13], 'G': [1, 13],
        'H': [1, 10], 'I': [1, 7], 'K': [1, 24], 'L': [1, 11],
        'M': [1, 22], 'N': [1, 10], '%C3%91': [1, 2], 'NG': [1, 1],
        'O': [1, 4], 'P': [1, 20], 'R': [1, 5], 'S': [1, 17],
        'T': [1, 19], 'U': [1, 3], 'Y': [1, 3],
    }

    # Fixed pieces of the search URL; the letter and page number slot in between/after
    search_prefix = 'http://www.chamoru.info/dictionary/display.php?action=search&by='
    page_param = '&nr_page='

    return [
        search_prefix + letter + page_param + str(page)
        for letter, (first_page, last_page) in page_ranges.items()
        for page in range(first_page, last_page + 1)  # +1: last page is inclusive
    ]

In [3]:
def read_urls(addresses, timeout=30):
    """Download each dictionary page and collect the text of its <dd> and <dt> elements.

    Args:
        addresses: iterable of page URLs, e.g. the list returned by create_url.
        timeout: seconds to wait on the server for each request before giving up.

    Returns:
        tuple[list[str], list[str]]: (terms, definitions). `terms` holds the
        text of every <dd> element and `definitions` the text of every <dt>
        element, accumulated across pages in the order the addresses are given.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or an HTTP error status.
    """
    terms = []
    definitions = []

    for address in addresses:
        # Without a timeout, requests waits indefinitely on a stalled server
        response = requests.get(address, timeout=timeout)
        # Fail loudly instead of silently parsing an error page into zero entries
        response.raise_for_status()

        soup = BeautifulSoup(response.content, features="html.parser")

        # NOTE(review): HTML convention is <dt> = term and <dd> = definition;
        # this scraper reads them the other way round, which is kept as-is --
        # confirm against the site's markup.
        # .text strips the tags and keeps only the visible text.
        terms.extend(tag.text for tag in soup.find_all("dd"))
        definitions.extend(tag.text for tag in soup.find_all("dt"))

    return terms, definitions

In [4]:
def create_df(lists):
    """Build a two-column DataFrame from the (terms, definitions) pair produced by read_urls.

    Args:
        lists: sequence whose first item is the list of terms and whose second
            item is the list of definitions.

    Returns:
        pandas.DataFrame: columns 'Term' and 'Definition', one row per paired
        entry. If the two lists differ in length, a warning is issued and only
        the leading entries that have a counterpart are kept.
    """
    import warnings  # local import: only needed for the length-mismatch warning

    terms, definitions = lists[0], lists[1]

    # Previously a shorter definitions list raised a bare IndexError and a
    # longer one was truncated silently; surface the mismatch either way.
    if len(terms) != len(definitions):
        warnings.warn(
            "create_df: %d terms but %d definitions; keeping the first %d pairs"
            % (len(terms), len(definitions), min(len(terms), len(definitions)))
        )

    row_count = min(len(terms), len(definitions))

    return pd.DataFrame({
        'Term': list(terms[:row_count]),
        'Definition': list(definitions[:row_count]),
    })

In [5]:
# Build the list of dictionary page URLs to scrape
urls = create_url()

In [6]:
# Fetch and parse every page; read_urls returns a (terms, definitions) pair.
# One HTTP request is made per URL, so this cell needs network access.
word_lists = read_urls(urls)

In [35]:
# Combine the scraped terms and definitions into a DataFrame with 'Term' and 'Definition' columns
dataframe = create_df(word_lists)

In [39]:
# Keep only the rows whose Term does not mention 'adsbygoogle'; those rows carry no dictionary data
dataframe_filtered = dataframe.loc[~dataframe['Term'].str.contains('adsbygoogle')]

In [37]:
# Preview the first rows only instead of rendering the whole frame
dataframe_filtered.head(10)

Unnamed: 0,Term,Definition
0,a'abang,Type of plant--eugenia reinwardtiana (Beach Ch...
1,a'adahi,Protector; guard; caretaker; body guard.
2,a'adda',Mimicker; imitator; mocker.
3,a'addak,noun. Knocker; person who knocks; pounder.
4,a'aga,Type of fish-family labridae (Wrasses).
5,a'akonseha,Adviser; counselor.
6,a'amte,Physician; doctor; nurse; surgeon; healer.
7,a'aposta,Gambler--one who bets or wagers.
8,a'arekla,Fixer--one who fixes; repairman; arranger
9,a'atte,Magician; wizard; sorcerer; trickster; clairvo...


In [40]:
# Export the filtered dataframe to a UTF-8 CSV file, written relative to the current working directory.
# NOTE(review): to_csv writes the index by default, so the file gets an unnamed
# leading column holding the row labels left over after filtering -- pass
# index=False if that column is not wanted.
dataframe_filtered.to_csv('chamoru_english_dictionary.csv', encoding="utf8")

Online Resources:

How to create a Dictionary in Python || Web Scraping

https://www.youtube.com/watch?v=atDgcb-ImMo