# Scrape epub frequency dictionary

In [2]:
#from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
#driver = webdriver.Firefox()
#driver.get("https://www.oxfordlearnersdictionaries.com/wordlists/oxford3000-5000")

In [3]:
# Open book with calibre editor -> chapters as *.xlink.html -> export file
# Read xhtml data from file.
with open('data/alphabetical.xlink.xhtml', 'r') as chapter_alphabetical:
    content = chapter_alphabetical.read()
    soup = BeautifulSoup(content, 'html')
print(soup)

<?xml version='1.0' encoding='utf-8'?>
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>A Frequency Dictionary of Spanish: Core Vocabulary for Learners</title>
<link href="css/stylesheet.css" rel="stylesheet" type="text/css"/>
<meta content="urn:uuid:ef0bf3dd-6627-4ef2-a354-0a8c4f5aefb7" name="Adept.expected.resource"/>
</head>
<body>
<p class="pagenum"><span id="page258">p.258</span></p>
<p class="chtitle"><a href="toc.xlink.xhtml#alpha"><span class="hyperlink">Alphabetical index</span></a></p>
<div class="box">
<p class="noindent"><span class="bold">Word family head</span>, <span class="italic">pos</span>, gloss<span class="italic">,</span> <span class="trade5">rank</span></p>
</div>
<p class="hh1">A</p>
<p class="ulist"><span class="bold">a</span> <span class="lightoblique">prep</span> to, at <span class="trade5">8</span></p>
<p class="ulist"><span class="bold">abajo</span> <span class="lightoblique">adv</span> down, below, downward <span class="trad

In [4]:
words_span = soup.find_all("p", attrs={"class": "ulist"})
# Word family head, pos, gloss, rank
print(words_span[0])

<p class="ulist"><span class="bold">a</span> <span class="lightoblique">prep</span> to, at <span class="trade5">8</span></p>


In [5]:
# Each entry is structured as:
# Word family head, pos, gloss, rank
entry = words_span[0]
entry_stripped_strings = list(entry.stripped_strings)
word = entry_stripped_strings[0]
type = entry_stripped_strings[1]
english = entry_stripped_strings[2]
frequency_rank = entry_stripped_strings[3] # lower is more frequent

In [6]:
# Create empty a frame with columns we wish to extract
columns = ["word", "type", "english", "frequency_rank"]
df = pd.DataFrame(columns=columns)

# Extract fields from wordlist
words = soup.find_all("p", attrs={"class": "ulist"})

for entry in words:
    #if word.has_attr("class"):
    #    # Skip where class="hidden"
    #    continue
    try:
    # Each entry is structured as:
    # Word family head, pos, gloss, rank
        entry_stripped_strings = list(entry.stripped_strings)
        word = entry_stripped_strings[0]
        type = entry_stripped_strings[1]
        english = entry_stripped_strings[2]
        frequency_rank = entry_stripped_strings[3] # lower is more frequent
    except Exception as e:
        print(f"Exception {e} for {word}")
    # Add data to dataframe
    data = {
        "word": word,
        "type": type,
        "english": english,
        "frequency_rank": frequency_rank
    }
    df_new = pd.DataFrame.from_records([data])
    df = pd.concat([df, df_new])

In [7]:
df.head()

Unnamed: 0,word,type,english,frequency_rank
0,a,prep,"to, at",8
0,abajo,adv,"down, below, downward",788
0,abandonado,adj,abandoned,2896
0,abandonar,v,"to abandon, leave (a place)",680
0,abandono,nm,"abandonment, desertion",3463


In [8]:
# Save data to pkl file
df.to_pickle("./data/spanish.pkl")
#df = pd.read_pickle("./data/spanish.pkl")

In [9]:
df = pd.read_pickle("./data/spanish.pkl")
df['frequency_rank'] = pd.to_numeric(df['frequency_rank'], errors='coerce')
df.frequency_rank.quantile([0.25,0.5,0.75])
df.frequency_rank.max()

5010.0

In [3]:
#read data by cefr

# Open book with calibre editor -> chapters as *.xlink.html -> export file
# Read xhtml data from file.
with open('data/frequency.xlink.xhtml', 'r') as chapter_alphabetical:
    content = chapter_alphabetical.read()
    soup = BeautifulSoup(content, 'html')
list_of_lists = soup.find_all("div", attrs={"class": ["listb", "list"]}) # freq_rank, word, type, translation

In [4]:
# fix for item: 269 se2_ti instead
# fix for item: 1327 juventud 2 entrances for explanation; fixed in .xhtml
nlistd40 = [] # freq_rank, word, type, translation
blist1 = [] # explanation
for list_of_words in list_of_lists:
    nlistd40 += list_of_words.find_all("p", attrs={"class": ["nlist4d0", "se2_ti"]})
    blist1 += list_of_words.find_all("p", attrs={"class": "blist1"})

In [7]:
# data: <p class="nlist4d0">      <span class="bold">1    el, la</span> <span class="lightoblique">art</span> the (<span class="symbol"><span class="flag-symbol-font">+</span></span>m, f)</p>
# explanation: <p class="blist1">•    el diccionario tenía también frases útiles – the dictionary also had useful phrases</p>
columns = ["word", "type", "english", "frequency_rank", "spanish"]
df = pd.DataFrame(columns=columns)

import re
for data, explanation in zip(nlistd40, blist1):
    first_span = data.find("span").text
    frequency_rank = re.search(r"\d*(?:\s+\d+)*", first_span)[0]
    word = re.search(r"\d+\s+(\w+)", first_span)[1]

    #second_span = data.find_next('span').find_next('span').text
    stripped_strings = list(data.stripped_strings)
    type = stripped_strings[1]
    #english = stripped_strings[2]

    exp_test = explanation.text
    stripped_strings_xp = list(explanation.stripped_strings)
    splitted = exp_test.split('–')
    spanish, english = splitted[0].strip(), splitted[1].strip()
    spanish = spanish.encode('ascii', 'ignore').decode()
    english = english.encode('ascii', 'ignore').decode()

    data = {
        "word": word,
        "type": type,
        "english": english,
        "frequency_rank": frequency_rank,
        "spanish": spanish
    }
    df_new = pd.DataFrame.from_records([data])
    df = pd.concat([df, df_new])


# Spanish done

In [8]:
# Save data to pkl file
df.to_pickle("./data/spanish2.pkl")
#df = pd.read_pickle("./data/spanish.pkl")