Skip to content

Commit

Permalink
Upgrading parser.
Browse files Browse the repository at this point in the history
  • Loading branch information
sidneycadot committed Sep 19, 2015
1 parent 24c8452 commit 58fc6da
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 112 deletions.
50 changes: 25 additions & 25 deletions charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@
# These are the characters that actually occur:

occuring_characters = {
'%N' : ASCII | frozenset("\xa0\xad°´·ºÁÃ×àáäåèéíîóöøúüĀńőŜσωआटभयर्ṭ’•…∈≤≥⌈⌉fffifl"),
'%C' : ASCII | frozenset("¢£§«°±²´·º»½ÁÇ×ÜßàáäåçèéëíîïñòóôõöøùúüýāăćčęěħıłńőřśşšťžΧβγμπρστωϱавдеилмнопрстучшыьяաבוכלᵣᵤḠ\u200b—‘’“”…′ℕ↑⇒∈∏∑∞∩∫≅≈≠≤≥⊂⊆⊗⌈⌉\u3000八發\uf020fifl\ufeff𝒩𝓁"),
'%D' : ASCII | frozenset("\x7f§«°±´¸»ÁÇÉÖ×ÚÜßàáäåçèéêëíîïñóôõöøùúüýăąćČčěłńőŒřŚŞşŠšũūżžǎ́Λλμπϕ\u2002\u2009\u200e‐—’“”…∞∪≡fffi"),
'%H' : ASCII | frozenset("\x81£§©«®°±´µ·»ÁÂÃÅÆÉÕÖ×ÚÜßàáâäåçèéêëíîïñòóôõöøúûüýĀāăćČčěğĭıłńņňőœřśşŠšţūŽžΓΔΛΣΨαβγδζθπστφωϕНРСагдезийклнопрстхчыяאבגדוכלקרשתṭ\u200e—’“”…∏∑√∣≡⌊⌋fffifl"),
'%F' : ASCII | frozenset("°²´·ºÁÇ×ÜàáäçèéêíñóôöøúüćńőřşžΓβλ‐‘’”…∞≍≤≥⌈⌉\u3000\ufeff;"),
'%e' : ASCII | frozenset("¢¨¯°´·×ßáäçèéíôöüīńβλρω\u200b—‘’“”•…∆⊗│:"),
'%p' : ASCII | frozenset("Äéóöø‘’"),
'%t' : ASCII | frozenset("\x8a®°¹¼×áçèéíñóöúüŠπ…\u2028√≠≤≥\u3000\uf08a\uf0a3\uf0ae\uf0b3\uf0b9"),
'%o' : ASCII | frozenset("\x8d£«¯´·»Áßáäçèéêíîïðòö÷üπ“”…€←∪≠⊤⌊⌿⍳⍴⍸○"),
'%Y' : ASCII | frozenset("ßáéñöøńőΧ’…⊂\u3000"),
'%A' : ASCII | frozenset("ÁÅÆÇÉØÜßàáâäçèéëíñóôöøúüČńņőşš"),
'%E' : ASCII | frozenset("´ÁÉßàáãäçèéíñóôöøüýčěłńőš’"),
'N' : ASCII | frozenset("\xa0\xad°´·ºÁÃ×àáäåèéíîóöøúüĀńőŜσωआटभयर्ṭ’•…∈≤≥⌈⌉fffifl"),
'C' : ASCII | frozenset("¢£§«°±²´·º»½ÁÇ×ÜßàáäåçèéëíîïñòóôõöøùúüýāăćčęěħıłńőřśşšťžΧβγμπρστωϱавдеилмнопрстучшыьяաבוכלᵣᵤḠ\u200b—‘’“”…′ℕ↑⇒∈∏∑∞∩∫≅≈≠≤≥⊂⊆⊗⌈⌉\u3000八發\uf020fifl\ufeff𝒩𝓁"),
'D' : ASCII | frozenset("\x7f§«°±´¸»ÁÇÉÖ×ÚÜßàáäåçèéêëíîïñóôõöøùúüýăąćČčěłńőŒřŚŞşŠšũūżžǎ́Λλμπϕ\u2002\u2009\u200e‐—’“”…∞∪≡fffi"),
'H' : ASCII | frozenset("\x81£§©«®°±´µ·»ÁÂÃÅÆÉÕÖ×ÚÜßàáâäåçèéêëíîïñòóôõöøúûüýĀāăćČčěğĭıłńņňőœřśşŠšţūŽžΓΔΛΣΨαβγδζθπστφωϕНРСагдезийклнопрстхчыяאבגדוכלקרשתṭ\u200e—’“”…∏∑√∣≡⌊⌋fffifl"),
'F' : ASCII | frozenset("°²´·ºÁÇ×ÜàáäçèéêíñóôöøúüćńőřşžΓβλ‐‘’”…∞≍≤≥⌈⌉\u3000\ufeff;"),
'e' : ASCII | frozenset("¢¨¯°´·×ßáäçèéíôöüīńβλρω\u200b—‘’“”•…∆⊗│:"),
'p' : ASCII | frozenset("Äéóöø‘’"),
't' : ASCII | frozenset("\x8a®°¹¼×áçèéíñóöúüŠπ…\u2028√≠≤≥\u3000\uf08a\uf0a3\uf0ae\uf0b3\uf0b9"),
'o' : ASCII | frozenset("\x8d£«¯´·»Áßáäçèéêíîïðòö÷üπ“”…€←∪≠⊤⌊⌿⍳⍴⍸○"),
'Y' : ASCII | frozenset("ßáéñöøńőΧ’…⊂\u3000"),
'A' : ASCII | frozenset("ÁÅÆÇÉØÜßàáâäçèéëíñóôöøúüČńņőşš"),
'E' : ASCII | frozenset("´ÁÉßàáãäçèéíñóôöøüýčěłńőš’"),
}

# These are the characters that are deemed acceptable:
Expand All @@ -31,18 +31,18 @@
# - fullwidth semicolon character ';' (0xff1b)

acceptable_characters = {
'%N' : ASCII | frozenset("\xa0\xad°´·ºÁÃ×àáäåèéíîóöøúüĀńőŜσωआटभयर्ṭ’•…∈≤≥⌈⌉"),
'%C' : ASCII | frozenset("¢£§«°±²´·º»½ÁÇ×ÜßàáäåçèéëíîïñòóôõöøùúüýāăćčęěħıłńőřśşšťžΧβγμπρστωϱавдеилмнопрстучшыьяաבוכלᵣᵤḠ\u200b—‘’“”…′ℕ↑⇒∈∏∑∞∩∫≅≈≠≤≥⊂⊆⊗⌈⌉\u3000八發\uf020fifl\ufeff𝒩𝓁"),
'%D' : ASCII | frozenset("\x7f§«°±´¸»ÁÇÉÖ×ÚÜßàáäåçèéêëíîïñóôõöøùúüýăąćČčěłńőŒřŚŞşŠšũūżžǎ́Λλμπϕ\u2002\u2009\u200e‐—’“”…∞∪≡"),
'%H' : ASCII | frozenset("\x81£§©«®°±´µ·»ÁÂÃÅÆÉÕÖ×ÚÜßàáâäåçèéêëíîïñòóôõöøúûüýĀāăćČčěğĭıłńņňőœřśşŠšţūŽžΓΔΛΣΨαβγδζθπστφωϕНРСагдезийклнопрстхчыяאבגדוכלקרשתṭ\u200e—’“”…∏∑√∣≡⌊⌋"),
'%F' : ASCII | frozenset("°²´·ºÁÇ×ÜàáäçèéêíñóôöøúüćńőřşžΓβλ‐‘’”…∞≍≤≥⌈⌉\u3000\ufeff"),
'%e' : ASCII | frozenset("¢¨¯°´·×ßáäçèéíôöüīńβλρω\u200b—‘’“”•…∆⊗│"),
'%p' : ASCII | frozenset("Äéóöø‘’"),
'%t' : ASCII | frozenset("\x8a®°¹¼×áçèéíñóöúüŠπ…\u2028√≠≤≥\u3000\uf08a\uf0a3\uf0ae\uf0b3\uf0b9"),
'%o' : ASCII | frozenset("\x8d£«¯´·»Áßáäçèéêíîïðòö÷üπ“”…€←∪≠⊤⌊⌿⍳⍴⍸○"),
'%Y' : ASCII | frozenset("ßáéñöøńőΧ’…⊂\u3000"),
'%A' : ASCII | frozenset("ÁÅÆÇÉØÜßàáâäçèéëíñóôöøúüČńņőşš"),
'%E' : ASCII | frozenset("´ÁÉßàáãäçèéíñóôöøüýčěłńőš’"),
'N' : ASCII | frozenset("\xa0\xad°´·ºÁÃ×àáäåèéíîóöøúüĀńőŜσωआटभयर्ṭ’•…∈≤≥⌈⌉"),
'C' : ASCII | frozenset("¢£§«°±²´·º»½ÁÇ×ÜßàáäåçèéëíîïñòóôõöøùúüýāăćčęěħıłńőřśşšťžΧβγμπρστωϱавдеилмнопрстучшыьяաבוכלᵣᵤḠ\u200b—‘’“”…′ℕ↑⇒∈∏∑∞∩∫≅≈≠≤≥⊂⊆⊗⌈⌉\u3000八發\uf020fifl\ufeff𝒩𝓁"),
'D' : ASCII | frozenset("\x7f§«°±´¸»ÁÇÉÖ×ÚÜßàáäåçèéêëíîïñóôõöøùúüýăąćČčěłńőŒřŚŞşŠšũūżžǎ́Λλμπϕ\u2002\u2009\u200e‐—’“”…∞∪≡"),
'H' : ASCII | frozenset("\x81£§©«®°±´µ·»ÁÂÃÅÆÉÕÖ×ÚÜßàáâäåçèéêëíîïñòóôõöøúûüýĀāăćČčěğĭıłńņňőœřśşŠšţūŽžΓΔΛΣΨαβγδζθπστφωϕНРСагдезийклнопрстхчыяאבגדוכלקרשתṭ\u200e—’“”…∏∑√∣≡⌊⌋"),
'F' : ASCII | frozenset("°²´·ºÁÇ×ÜàáäçèéêíñóôöøúüćńőřşžΓβλ‐‘’”…∞≍≤≥⌈⌉\u3000\ufeff"),
'e' : ASCII | frozenset("¢¨¯°´·×ßáäçèéíôöüīńβλρω\u200b—‘’“”•…∆⊗│"),
'p' : ASCII | frozenset("Äéóöø‘’"),
't' : ASCII | frozenset("\x8a®°¹¼×áçèéíñóöúüŠπ…\u2028√≠≤≥\u3000\uf08a\uf0a3\uf0ae\uf0b3\uf0b9"),
'o' : ASCII | frozenset("\x8d£«¯´·»Áßáäçèéêíîïðòö÷üπ“”…€←∪≠⊤⌊⌿⍳⍴⍸○"),
'Y' : ASCII | frozenset("ßáéñöøńőΧ’…⊂\u3000"),
'A' : ASCII | frozenset("ÁÅÆÇÉØÜßàáâäçèéëíñóôöøúüČńņőşš"),
'E' : ASCII | frozenset("´ÁÉßàáãäçèéíñóôöøüýčěłńőš’"),
}

def main():
Expand All @@ -55,4 +55,4 @@ def main():
print("key {} has unwanted characters: {}".format(key, ", ".join("{!r}".format(c) for c in sorted(unwanted_characters))))

if __name__ == "__main__":
main()
main()
137 changes: 50 additions & 87 deletions parse_oeis_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,21 @@

# Code to analyze OEIS entries.

import sys
import os
import sqlite3
import collections
import sys
import re
import time
import logging
import pickle
import re
import collections
import logging
import sqlite3

from OeisEntry import OeisEntry
from charmap import acceptable_characters
from timer import start_timer

logger = logging.getLogger(__name__)

#def filter_main_content(text, oeis_id):

# We are interested in the lines that start with a '%' followed by
# a directive identification character, followed by a single space,
# followed by an OEIS id ('Axxxxxx'), followed by directive data.
#
# The directive data will be either an empty string or a string
# starting with a space character.


# content = re.findall(directive_line_pattern, text, re.MULTILINE)

# if len(content) == 0:
# raise OeisEntryEmptyError("no valid content lines found")

# content = "\n".join(directive + directive_data for (directive, directive_data) in content)

# return content



# As described here: https://oeis.org/eishelp1.html

# The directives %S, %T, and %U were originally intended as the absolute values of the sequence entries,
Expand Down Expand Up @@ -141,17 +120,27 @@

def parse_main_content(main_content):

# Select only lines that have the proper directive format:
# Select only lines that have the proper directive format.

directive_line_pattern = "(%.) A{06d}(.*)$".format(oeis_id)
directive_line_pattern = "%(.) A{06d}(.*)$".format(oeis_id)

lines = re.findall(directive_line_pattern, main_content, re.MULTILINE)

lines = [directive + directive_data for (directive, directive_data) in lines]
stripped_lines = []
for (directive, directive_value) in lines:
if directive_value.startswith(" "):
directive_value = directive_value[1:]
stripped_directive_value = directive_value.strip()
if len(stripped_directive_value) != len(directive_value):
logger.warning("[A{:06}] Value of %{} directive ({!r}) has superfluous whitespace.".format(oeis_id, directive, directive_value)
stripped_lines.append((directive, stripped_directive_value))

lines = stripped_lines
del stripped_lines

# ========== check order of directives

directive_order = "".join(line[1] for line in lines)
directive_order = "".join(directive for (directive, directive_value) in lines)

assert expected_directive_order.match(directive_order)

Expand All @@ -169,61 +158,56 @@ def parse_main_content(main_content):
line_O = None
lines_A = []

for line in lines:

directive = line[:2]
assert directive in expected_directives
for (directive, directive_value) in lines:

if directive in acceptable_characters:
unacceptable_characters = set(line) - acceptable_characters[directive]
unacceptable_characters = set(directive_value) - acceptable_characters[directive]
if unacceptable_characters:
logger.warning("[A{:06}] Unacceptable characters in directive {!r}: {}.".format(oeis_id, line, ", ".join(["{!r}".format(c) for c in sorted(unacceptable_characters)])))
logger.warning("[A{:06}] Unacceptable characters in value of %{} directive ({!r}): {}.".format(oeis_id, directive, directive_value, ", ".join(["{!r}".format(c) for c in sorted(unacceptable_characters)])))

if directive == "%I":
assert line_I is None # only one %I directive is allowed
line_I = line
line_I = directive_value
if directive == "%S":
assert line_S is None # only one %S directive is allowed
line_S = line
line_S = directive_value
elif directive == "%T":
assert line_T is None # only one %T directive is allowed
line_T = line
line_T = directive_value
elif directive == "%U":
assert line_U is None # only one %U directive is allowed
line_U = line
line_U = directive_value
if directive == "%N":
assert line_N is None # only one %N directive is allowed
line_N = line
line_N = directive_value
elif directive == "%C":
lines_C.append(line) # multiple %C directives are allowed
lines_C.append(directive_value) # multiple %C directives are allowed
elif directive == "%D":
lines_D.append(line) # multiple %D directives are allowed
lines_D.append(directive_value) # multiple %D directives are allowed
elif directive == "%H":
lines_H.append(line) # multiple %H directives are allowed
lines_H.append(directive_value) # multiple %H directives are allowed
elif directive == "%K":
assert line_K is None # only one %K directive is allowed
line_K = line
line_K = directive_value
elif directive == "%O":
assert line_O is None # only one %O directive is allowed
line_O = line
line_O = directive_value
elif directive == "%A":
lines_A.append(line) # multiple %A directives are allowed
lines_A.append(directive_value) # multiple %A directives are allowed

# ========== process I directive

assert (line_I is not None)

if line_I == "%I":
identification = line_I
if identification == "":
identification = None
else:
assert line_I.startswith("%I ")
identification = line_I[3:]

for identification_pattern in identification_patterns:
if identification_pattern.match(identification) is not None:
break
else:
logger.warning("[A{:06}] Ill-formatted %I directive: '{}'.".format(oeis_id, line_I))
logger.warning("[A{:06}] Unusual %I directive value: '{}'.".format(oeis_id, identification))

# ========== process S/T/U directives

Expand All @@ -236,17 +220,12 @@ def parse_main_content(main_content):

# Synthesize numbers.

if line_S == "%S":
logger.warning("[A{:06}] Unusual line: '{}' (without space).".format(oeis_id, line_S))
line_S = "%S "
if line_S == "":
logger.warning("[A{:06}] Unusual %S directive without value.".format(oeis_id))

assert (line_S is None) or line_S.startswith("%S ")
assert (line_T is None) or line_T.startswith("%T ")
assert (line_U is None) or line_U.startswith("%U ")

S = "" if line_S is None else line_S[3:]
T = "" if line_T is None else line_T[3:]
U = "" if line_U is None else line_U[3:]
S = "" if line_S is None else line_S
T = "" if line_T is None else line_T
U = "" if line_U is None else line_U

STU = S + T + U

Expand All @@ -257,29 +236,14 @@ def parse_main_content(main_content):
# ========== process N directive

assert (line_N is not None)
assert line_N.startswith("%N ")
assert line_N.startswith(" ")

name = line_N[3:]
name = line_N[1:]

# ========== process C directive

for line_C in lines_C:

assert line_C.startswith("%C ")
comment = line_C[3:]

# ========== process D directive

for line_D in lines_D:
assert line_D.startswith("%D ")
detailed_reference = line_D[3:]

# ========== process H directive

for line_H in lines_H:
assert line_H.startswith("%H ")
link = line_H[3:]

# ========== process A directive

if len(lines_A) == 0:
Expand All @@ -291,17 +255,16 @@ def parse_main_content(main_content):
logger.warning("[A{:06}] Missing %O directive.".format(oeis_id))
offset = () # empty tuple
else:
assert line_O.startswith("%O ")
offset = line_O[3:]
offset = line_O

offset = tuple(int(o) for o in offset.split(","))
if len(offset) != 2:
logger.warning("[A{:06}] Ill-formatted %O directive: {!r}.".format(oeis_id, line_O))
logger.warning("[A{:06}] Unusual %O directive value only has a single number: {!r}.".format(oeis_id, line_O))

# ========== process K directive

assert (line_K is not None) and line_K.startswith("%K ")
keywords = line_K[3:]
assert line_K is not None
keywords = line_K

keywords = keywords.split(",")

Expand All @@ -311,16 +274,16 @@ def parse_main_content(main_content):

for unexpected_keyword in sorted(unexpected_keywords):
if unexpected_keyword == "":
logger.warning("[A{:06}] Unexpected empty keyword in %K directive: {!r}.".format(oeis_id, line_K))
logger.warning("[A{:06}] Unexpected empty keyword in %K directive value: {!r}.".format(oeis_id, line_K))
else:
logger.warning("[A{:06}] Unexpected keyword '{}' in %K directive: {!r}.".format(oeis_id, unexpected_keyword, line_K))
logger.warning("[A{:06}] Unexpected keyword '{}' in %K directive value: {!r}.".format(oeis_id, unexpected_keyword, line_K))

# Check for duplicate keywords.

keyword_counter = collections.Counter(keywords)
for (keyword, count) in keyword_counter.items():
if count > 1:
logger.warning("[A{:06}] Keyword '{}' occurs {} times in %K directive: {!r}.".format(oeis_id, keyword, count, line_K))
logger.warning("[A{:06}] Keyword '{}' occurs {} times in %K directive value: {!r}.".format(oeis_id, keyword, count, line_K))

# Canonify keywords: remove empty keywords and duplicates.
# We do not sort.
Expand Down

0 comments on commit 58fc6da

Please sign in to comment.