# Show a list of words whose syllable count is computed to be zero

## Purpose

I am using a syllable counter, `countSyllables` (defined below), and some tokens come up with a syllable count of zero. The purpose of this notebook is to find out why this happens and to find a way of handling those tokens.

In [None]:
from pandas                import DataFrame, read_csv
from os                    import walk
from os.path               import join
from spacy                 import explain, load

# Load the spaCy pipeline named "en_core_web_sm" once; the resulting `nlp`
# callable is what the later cells apply to each excerpt.
nlp = load("en_core_web_sm")

# Load data

## Data Dictionary

|Train|Test|Description|
|--------------|--------------|----------------------------------------------------|
|id|id|unique ID for excerpt|
|url_legal|url_legal| URL of source (Omitted from some records in the test set)|
|license|license |license of source material (Omitted from some records in the test set)|
|excerpt|excerpt|text for predicting readability|
|target|-|readability|
|standard_error|-|Measure of spread of scores among multiple raters for each excerpt|

In [None]:
# Walk the Kaggle input tree and read files by filename prefix:
# 'train*' -> train_data, 'test*' -> test_data.
# Both stay None when no matching file exists; if several files share a
# prefix, the last one visited wins.
train_data = test_data = None

for dirname, _, filenames in walk('/kaggle/input'):
    for filename in filenames:
        path_name = join(dirname, filename)
        if filename.startswith('train'):
            train_data = read_csv(path_name)
        elif filename.startswith('test'):   # prefixes are mutually exclusive
            test_data = read_csv(path_name)

In [None]:
# countSyllables
#
# Code by Tersosauros, snarfed from https://stackoverflow.com/questions/405161/detecting-syllables-in-a-word
def countSyllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-run heuristic.

    Every maximal run of consecutive vowels (``a e i o u y``) counts as one
    syllable, so diphthongs are counted once. One syllable is then removed
    when the word ends in "es" (and is longer than 2 characters) or,
    failing that, ends in "e" (and is longer than 1 character), on the
    assumption that the trailing vowel is silent.

    Only lower-case vowels are recognised, so callers should lower-case the
    word first.

    Parameters
    ----------
    word : str
        The token text to analyse.

    Returns
    -------
    int
        The estimated syllable count. This is 0 for strings without any
        recognised vowel (e.g. digits or symbols) and for words whose only
        vowel run is cancelled by the silent-e rule (e.g. "the", "yes").
        It is never negative.
    """
    vowel_letters = tuple("aeiouy")
    syllables = 0
    previous_was_vowel = False

    for letter in word:
        is_vowel = letter in vowel_letters
        # A new vowel run starts only when the preceding character was not a vowel.
        if is_vowel and not previous_was_vowel:
            syllables += 1
        previous_was_vowel = is_vowel

    # Treat a trailing "es" (or, otherwise, a trailing "e") as silent.
    ends_in_es = len(word) > 2 and word[-2:] == "es"
    ends_in_e = len(word) > 1 and word[-1:] == "e"
    if ends_in_es or ends_in_e:
        syllables -= 1

    return syllables



# Build list of failures and save it

In [None]:
# Collect one record for every non-punctuation token whose lower-cased text
# gets a syllable count below 1.
Failures = []
for index, row in train_data.iterrows():
    doc = nlp(row['excerpt'])
    Failures.extend(
        (token.pos_, token.tag_, str(token), token.lemma_,
         explain(token.pos_), explain(token.tag_))
        for token in doc
        if token.pos_ != 'PUNCT' and countSyllables(str(token).lower()) < 1
    )

# Sort by part of speech, tag and lemma, keep the first row for each lemma,
# and write the result to CSV without the index column.
df = (
    DataFrame(Failures,
              columns=['Pos', 'Tag', 'Token', 'Lemma', 'ExplainPos', 'ExplainTag'])
    .sort_values(['Pos', 'Tag', 'Lemma'])
    .drop_duplicates(subset=['Lemma'])
)
df.to_csv('syllable-count-failures.csv', index=False)