In [1]:
import pandas as pd
import os.path

# root_path is the parent directory of the current working directory.
root_path = os.path.dirname(os.getcwd())

# Read the whole input file; the context manager closes the handle as soon as
# the read finishes instead of leaving it open until garbage collection.
with open(os.path.join(root_path, "DATA/list.txt"), "r") as list_file:
    text_raw = list_file.read()

# Prefix the six-newline separator so that the very first section is delimited
# exactly like every later one.
text = '\n\n\n\n\n\n' + text_raw

In [2]:
# Candidate sections are the chunks between runs of six newlines.
sorta_sections = text.split('\n\n\n\n\n\n')[1:]  # first item is empty
# The candidate title is whatever precedes the first run of three newlines
# (all titles should be followed by three \n's).
sorta_titles = [sorta_section.strip().split('\n\n\n')[0] for sorta_section in sorta_sections]

# Keep a candidate only when the part before its first '(', ',' or 'and' is
# upper case. The pattern is a raw string so that '\(' reaches the regex engine
# without relying on Python's handling of an invalid string escape.
import re
titles = [sorta_title for sorta_title in sorta_titles if re.split(r'\(|,|and', sorta_title)[0].isupper()]

In [3]:
# Offset in `text` of each confirmed title, matched together with its
# surrounding separators (six newlines before, three after).
# str.find returns -1 when a title cannot be located.
section_starts = [
    text.find('\n\n\n\n\n\n' + title + '\n\n\n')
    for title in titles
]

In [4]:
# Every section spans from its own start offset up to the start of the next one.
sections = [
    text[start:stop].strip()
    for start, stop in zip(section_starts, section_starts[1:])
]
# The last section has no successor, so it runs to the end of the text.
sections.append(text[section_starts[-1]:].strip())

In [5]:
# (prefixes, field) pairs tried in order; the first entry whose prefix matches
# the start of a subsection decides which field that subsection is stored under.
_SUBSECTION_FIELDS = (
    (('[',), 'phonetic'),
    (('Season',), 'season'),
    # Must come before the bare 'Flavor' prefix, which would otherwise claim it.
    (('Flavor Affinities',), 'flavor_affinities'),
    (('Flavor',), 'flavor'),
    (('Volume',), 'volume'),
    (('Techniques',), 'techniques'),
    (('Nutritional profile',), 'nutritional_profile'),
    (('Calories',), 'calories'),
    (('Protein',), 'protein'),
    (('Tip',), 'tip'),
    (('Vegan Tip',), 'vegan_tip'),
    (('Vegan substitute',), 'vegan_substitutes'),
    (('Vegan Brand',), 'vegan_brands'),
    (('Possible Substitute', 'Possible substitute'), 'possible_substitutes'),  # both appear
    # NOTE: a prefix test, so a subsection that itself begins with "Brandy" still matches.
    (('Brand',), 'brands'),
    (('Botanical relatives',), 'botanical_relatives'),
    (('What they are',), 'what_they_are'),
)


def extract_subsections_data(section):
    """Split one section of the source text into labelled fields.

    Parameters
    ----------
    section : str
        Text of a single section: a title, three newlines, then subsections
        separated by four newlines. Surrounding whitespace is ignored.

    Returns
    -------
    pandas.Series
        Object-dtype Series indexed by field name. 'name' is always present;
        the remaining fields ('season', 'flavor', 'volume', 'dishes',
        'pairs_with', ...) appear only when the section contains them, in the
        order they were found.
    """
    fields = {}

    # Normalise up front: the title is removed below by slicing off
    # len(title) characters, which is only correct when the section starts
    # with the title itself rather than with whitespace.
    section = section.strip()

    # sidebars can screw things up, so drop everything from the marker onwards
    sidebar_start = section.find('(See sidebar.)')
    if sidebar_start != -1:
        section = section[:sidebar_start].strip()

    title = section.split('\n\n\n')[0].strip()
    fields['name'] = title
    section = section[len(title):].strip()

    # Labelled subsections: recognised purely by how each one starts.
    for subsection in section.split('\n\n\n\n'):
        subsection = subsection.strip()
        for prefixes, field in _SUBSECTION_FIELDS:
            if subsection.startswith(prefixes):
                fields[field] = subsection
                break

    section_remainder = section

    # assuming 'Dishes' is last: everything from its first occurrence is the dishes text
    dishes_start = section_remainder.find('Dishes')
    if dishes_start != -1:
        fields['dishes'] = section[dishes_start:].strip()
        section_remainder = section_remainder[:dishes_start].strip()  # removed to reveal pairs_with

    # removed to reveal pairs_with
    affinities_start = section_remainder.find('Flavor Affinities')
    if affinities_start != -1:
        section_remainder = section_remainder[:affinities_start].strip()

    # pairs_with is the first remaining chunk that contains a blank-line-separated
    # list and does not open with a quotation mark.
    for subsection in section_remainder.split('\n\n\n'):
        subsection = subsection.strip()
        if '\n\n' in subsection and subsection[0] not in ('"', '“'):
            fields['pairs_with'] = subsection
            break

    # Built in one step with an explicit dtype rather than growing an empty,
    # dtype-less Series one key at a time.
    return pd.Series(fields, dtype=object)

In [6]:
# Parse every section into a Series of labelled fields, then blank out the NaN
# cells left where a section lacks a field.
subsections_data = (
    pd.Series(sections)
    .apply(extract_subsections_data)
    .replace(float('nan'), '')
)

In [7]:
# subsections_data.to_csv(os.path.join(root_path, 'DATA/subsections_data_raw.csv'), index=False, header=None)

In [8]:
# subsections_data.to_csv(os.path.join(root_path, 'DATA/subsection_data_header.csv'), index=False)

In [9]:
# very quiet
# very quiet--quiet
# quieter
# quiet
# quiet/moderate
# quiet--moderate
# moderate
# moderate/loud
# moderate--loud
# moderate--extremely_loud
# loud
# louder
# very loud
# extremely loud
# quiet        

# Delimiters that separate words in the 'flavor' and 'volume' descriptions.
# Raw string: the pattern is passed to the regex engine exactly as written.
_DESCRIPTOR_SPLIT = re.compile(r'\(|\)|,|\s|\/|\.|–|-|;')

# Output column -> flavor words that switch it on, in output-column order.
_TASTE_KEYWORDS = (
    ('sweet', ('sweet',)),
    ('sour', ('sour',)),
    ('salty', ('salty',)),
    ('bitter', ('bitter',)),
    ('savory', ('savory', 'umami')),
    ('spicy', ('spicy', 'hot')),
)

# Volume words that upgrade a taste flag from 'y' to 'Y'.
_LOUD_WORDS = ('loud', 'louder')


def _descriptor_tokens(description):
    """Return the lower-cased, non-empty words of a description string."""
    return [
        token.strip().lower()
        for token in _DESCRIPTOR_SPLIT.split(description)
        if token.strip() != ''
    ]


def extract_flavor_data(row):
    """Add basic-taste flags to a row based on its flavor and volume text.

    Parameters
    ----------
    row : pandas.Series
        Must provide string values under 'flavor' and 'volume'.

    Returns
    -------
    pandas.Series
        A copy of `row` with one extra entry per taste mentioned in
        row['flavor'] ('sweet', 'sour', 'salty', 'bitter', 'savory', 'spicy').
        The value is 'Y' when row['volume'] contains the word 'loud' or
        'louder', otherwise 'y'. Tastes that are not mentioned get no entry.
        The input row is left unmodified.
    """
    # Work on a copy so that new keys are not written into the caller's row.
    data = row.copy()
    flavor_tokens = _descriptor_tokens(row['flavor'])
    volume_tokens = _descriptor_tokens(row['volume'])

    # The volume is the same for every taste of the row, so decide the flag once.
    is_loud = any(word in volume_tokens for word in _LOUD_WORDS)
    flag = 'Y' if is_loud else 'y'

    for column, keywords in _TASTE_KEYWORDS:
        if any(keyword in flavor_tokens for keyword in keywords):
            data[column] = flag

    return data

# Derive the taste columns row by row, then blank out the NaN cells left for
# rows that lack a given taste.
ingredients_data = (
    subsections_data
    .apply(extract_flavor_data, axis=1)
    .replace(float('nan'), '')
)

In [11]:
# Write the table to disk without the row index.
ingredients_csv_path = os.path.join(root_path, 'DATA/ingredients_data_unedited.csv')
ingredients_data.to_csv(ingredients_csv_path, index=False)