# Parsing Category Tags
This script will parse all the "tag" elements from the scraped epicurious data.

The following tag categories are addressed: 
    * meal (dinner, appetizer, brunch, etc.)
    * technique (stew, bake, marinate, roast, etc.)
    * occasion (easter, anniversary, tail-gating, etc.)
    * type (sauce, cake, ice cream, etc.)
    * special consideration (vegan, soy-free, kid-friendly, etc.)
    * source (gourmet, bon appetit, the chew, etc.)
    * cuisine (asian, america, french, etc.)
    * equipment (coffee grinder, slow cooker, etc.)
    
Each recipe could have multiple values for reach tag category.

The parsing process will create individual columns for each tag category value, prefixed by tag (meal_breakfast, meal_lunch, type_sauce, etc.).  Each recipe is then evaluated to see if they have any of those tag values and assigning a value of 1 if so or a value of 0 if not.

At the end of teach tag group, the columns are appended to the end of the original scraped data frame.  This can then be exported to a (rather large) .csv that will have many, many columns.
    

In [1]:
import pandas as pd  
import numpy as np
import matplotlib
import matplotlib.pyplot as plt  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
import warnings; warnings.simplefilter('ignore')
from datetime import date
from datetime import datetime

# Parsing Functions

In [2]:
def parse_unique_values_from_tag_list(tag_prefix, tag_list):
    '''
    tag_prefix = prefix for tab (meal_, cuisine_, etc.)
    tag_list = list of unique values for tag
    
    Function loops through tag list and returns a unique list of values prefixed
    '''
    unique_values = [];
    for list_item in tag_list:
        if type(list_item) == str:
            split_vals = list_item.replace("[", "").replace("]", "").split(",")
            for s in split_vals:
                final_val = tag_prefix + s.replace("\'", "").strip();
                if final_val not in unique_values:
                    unique_values.append(final_val)
    return unique_values;

In [3]:
def parse_string_to_list(prefix, strings):
    '''
    prefix = tag prefix
    strings = sting of tag values for row
    
    Function parses string of tag values and returns values as a list
    '''
    result = [];
    if type(strings) == str:
        split_vals = strings.replace("[", "").replace("]", "").split(",");
        for s in split_vals:
            result.append(prefix + s.replace("\'", "").strip());
    return result; 

In [4]:
def parse_tag_list(final_list, unique_tag_list, row_tag_values):
    '''
    final_list = finale list 1/0 tag values
    unique_tag_list = unique list of tag values
    row_tag_values = list of tag values for a row
    
    Function loops through the unique tag list and creates 1 or 0 
    value for each list item based on the row tag values and appends
    to final_list
    '''
    try:
        row_values = [];
        for tag in unique_tag_list:
            if tag in row_tag_values:
    #             print(f'Has {tag}');
                row_values.append(1);
            else:
                row_values.append(0);
    #     return row_values;
        final_list.append(row_values)
    except Exception:
        print(Exception)

# Load Data

In [5]:
# read in the scraped data Sarah created
df= pd.read_csv ('df_original_scrape.csv')


# Parse Tag: Meals

In [6]:
# create list of unique meals
meal_tags = parse_unique_values_from_tag_list("meal_", df['meal'].unique().tolist())
meal_tags

['meal_dinner',
 'meal_appetizer',
 'meal_lunch',
 'meal_side',
 'meal_brunch',
 'meal_dessert',
 'meal_breakfast',
 'meal_drink',
 'meal_buffet',
 'meal_one-pot-meal',
 'meal_hors-doeuvre',
 'meal_leftovers']

In [7]:
# create a dataframe of just meal data
df_meal = df[['meal']]
df_meal.head()

Unnamed: 0,meal
0,['dinner']
1,['appetizer']
2,['lunch']
3,['side']
4,


In [8]:
# create a new column that converts the string meal data into a list
df_meal['meal_list'] = df_meal.apply(lambda x: parse_string_to_list('meal_', x['meal']), axis=1)
df_meal.head()

Unnamed: 0,meal,meal_list
0,['dinner'],[meal_dinner]
1,['appetizer'],[meal_appetizer]
2,['lunch'],[meal_lunch]
3,['side'],[meal_side]
4,,[]


In [9]:
# create a list to plug all the parsed meal data into
data_meals = []

In [10]:
# loop through all items in the dataframe, parse the meal list for each row and add to the data_meals list
for index, row in df_meal.iterrows():
    parse_tag_list(data_meals, meal_tags, row['meal_list'])

print(f'data_meals length: {len(data_meals)}')

print(f'df_meal length: {len(df_meal)}')

data_meals length: 37085
df_meal length: 37085


In [11]:
df_meals = pd.DataFrame(data_meals, columns=meal_tags)
df_meals.head()

Unnamed: 0,meal_dinner,meal_appetizer,meal_lunch,meal_side,meal_brunch,meal_dessert,meal_breakfast,meal_drink,meal_buffet,meal_one-pot-meal,meal_hors-doeuvre,meal_leftovers
0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# see how each column totals look
df_meals.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0,1
meal_dinner,31767,5318
meal_appetizer,34127,2958
meal_lunch,34523,2562
meal_side,31101,5984
meal_brunch,35235,1850
meal_dessert,29476,7609
meal_breakfast,35418,1667
meal_drink,34870,2215
meal_buffet,36855,230
meal_one-pot-meal,37032,53


In [13]:
df.columns

Index(['author', 'ingredients_string', 'make_again', 'month', 'rating',
       'reviews', 'title', 'year', 'cuisine', 'technique', 'occasion', 'meal',
       'ingredients_list', 'special-consideration', 'type', 'source',
       'equipment', 'tag', 'location', 'cal', 'carb', 'fat', 'protein',
       'sat_fat', 'sodium', 'polyunsat_fat', 'fiber', 'monounsat_fat',
       'cholesterol', 'servings', 'ingredients_count', 'carb_split',
       'fat_split', 'protein_split', 'sat_fat_split', 'sodium_split',
       'polyunsat_fat_split', 'fiber_split', 'monounsat_fat_split',
       'cholesterol_split', 'make_again_split', 'servings_split', 'month_num',
       'year_num', 'age_recipe'],
      dtype='object')

In [14]:
# append the parsed meals data to the original dataframe
df = df.merge(df_meals, left_index=True, right_index=True)

# Parse Tag: Technique

In [15]:
# create list of unique techniques
technique_tags = parse_unique_values_from_tag_list("technique_", df['technique'].unique().tolist())
# technique_tags

In [16]:
# create a dataframe of just technique data
df_technique = df[['technique']]
# df_technique.head()

In [17]:
# create a new column that converts the string meal data into a list
df_technique['technique_list'] = df_technique.apply(lambda x: parse_string_to_list('technique_', x['technique']), axis=1)
# df_technique.head()

In [18]:
# create a list to plug all the parsed meal data into
data_techniques = []

# loop through all items in the dataframe, parse the meal list for each row and add to the data_meals list
for index, row in df_technique.iterrows():
    parse_tag_list(data_techniques, technique_tags, row['technique_list'])

print(f'data_techniques length: {len(data_techniques)}')

print(f'df_technique length: {len(df_technique)}')

data_techniques length: 37085
df_technique length: 37085


In [19]:
df_techniques = pd.DataFrame(data_techniques, columns=technique_tags)
# df_techniques.head()

In [20]:
# see how each column totals look
df_techniques.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0,1
technique_braise,36591,494
technique_stew,36815,270
technique_roast,34980,2105
technique_saute,33800,3285
technique_chill,35188,1897
technique_marinate,36218,867
technique_poach,36857,228
technique_bake,28514,8571
technique_grill-barbecue,35359,1726
technique_no-cook,35246,1839


In [21]:
# append the parsed meals data to the original dataframe
df = df.merge(df_techniques, left_index=True, right_index=True)

In [22]:
# df.columns

# Parse Tag: Occasion

In [23]:
# create list of unique occaision
occasion_tags = parse_unique_values_from_tag_list("occasion_", df['occasion'].unique().tolist())
# occasion_tags

In [24]:
# create a dataframe of just occaision data
df_occasion = df[['occasion']]
# df_technique.head()

# create a new column that converts the string data into a list
df_occasion['occasion_list'] = df_occasion.apply(lambda x: parse_string_to_list('occasion_', x['occasion']), axis=1)
# df_occasion.head()

In [25]:
# create a list to plug all the parsed meal data into
data_occasions = []

# loop through all items in the dataframe, parse the meal list for each row and add to the data_meals list
for index, row in df_occasion.iterrows():
    parse_tag_list(data_occasions, occasion_tags, row['occasion_list'])

print(f'data_occasions length: {len(data_occasions)}')

print(f'df_occasion length: {len(df_occasion)}')

data_occasions length: 37085
df_occasion length: 37085


In [26]:
df_occasions = pd.DataFrame(data_occasions, columns=occasion_tags)
# df_occasions.head()

In [27]:
# see how each column totals look
df_occasions.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0,1
occasion_rosh-hashanah-yom-kippur,36929,156
occasion_fall,31781,5304
occasion_passover,36772,313
occasion_summer,29926,7159
occasion_sukkot,37050,35
occasion_super-bowl,36443,642
occasion_winter,31707,5378
occasion_shower,36383,702
occasion_spring,34132,2953
occasion_tailgating,36817,268


In [28]:
# append the parsed occasions data to the original dataframe
df = df.merge(df_occasions, left_index=True, right_index=True)

# Parse Tag: Type

In [29]:
# create list of unique type
type_tags = parse_unique_values_from_tag_list("type_", df['type'].unique().tolist())
type_tags

['type_salad',
 'type_soup-stew',
 'type_sauce',
 'type_egg-nog',
 'type_bread',
 'type_frozen-dessert',
 'type_salad-dressing',
 'type_condiment-spread',
 'type_candy',
 'type_pastry',
 'type_cookie',
 'type_edible-gift',
 'type_cake',
 'type_casserole-gratin',
 'type_alcoholic',
 'type_cocktail',
 'type_hot-drink',
 'type_non-alcoholic',
 'type_sandwich',
 'type_punch',
 'type_digestif',
 'type_stuffing-dressing',
 'type_smoothie',
 'type_margarita',
 'type_iced-tea',
 'type_martini',
 'type_spritzer',
 'type_aperitif',
 'type_sangria',
 'type_ice-cream',
 'type_pie',
 'type_pot-pie',
 'type_pizza',
 'type_tart',
 'type_meatball',
 'type_cobbler-crumble',
 'type_macaroni-and-cheese',
 'type_chili',
 'type_quiche',
 'type_cranberry-sauce',
 'type_biscuit',
 'type_hamburger',
 'type_lasagna',
 'type_dip',
 'type_stock',
 'type_pancake',
 'type_flat-bread',
 'type_hummus',
 'type_crepe',
 'type_salsa',
 'type_brownie',
 'type_fritter',
 'type_waffle',
 'type_cheesecake',
 'type_meatloaf

In [30]:
# create a dataframe of just type data
df_type = df[['type']]
# df_type.head()

# create a new column that converts the string data into a list
df_type['type_list'] = df_type.apply(lambda x: parse_string_to_list('type_', x['type']), axis=1)
# df_type.head()

In [31]:
# create a list to plug all the parsed meal data into
data_types = []

# loop through all items in the dataframe, parse the type list for each row and add to the data_types list
for index, row in df_type.iterrows():
    parse_tag_list(data_types, type_tags, row['type_list'])

print(f'data_types length: {len(data_types)}')

print(f'df_type length: {len(df_type)}')

data_types length: 37085
df_type length: 37085


In [32]:
df_types = pd.DataFrame(data_types, columns=type_tags)
# df_types.head()

In [33]:
# see how each column totals look
df_types.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0,1
type_salad,34459,2626
type_soup-stew,35092,1993
type_sauce,35717,1368
type_egg-nog,37069,16
type_bread,35929,1156
type_frozen-dessert,36363,722
type_salad-dressing,36839,246
type_condiment-spread,35760,1325
type_candy,36856,229
type_pastry,36862,223


In [34]:
# append the parsed type data to the original dataframe
df = df.merge(df_types, left_index=True, right_index=True)

# Parse Tag: Special Consideration

In [35]:
# create list of unique type
consideration_tags = parse_unique_values_from_tag_list("consideration_", df['special-consideration'].unique().tolist())
consideration_tags

['consideration_kosher',
 'consideration_paleo',
 'consideration_dairy-free',
 'consideration_wheat-gluten-free',
 'consideration_peanut-free',
 'consideration_tree-nut-free',
 'consideration_soy-free',
 'consideration_vegetarian',
 'consideration_vegan',
 'consideration_pescatarian',
 'consideration_no-sugar-added',
 'consideration_quick-and-easy',
 'consideration_low-no-sugar',
 'consideration_sugar-conscious',
 'consideration_diabetes-friendly',
 'consideration_fat-free',
 'consideration_kidney-friendly',
 'consideration_low-fat',
 'consideration_healthy',
 'consideration_kid-friendly',
 'consideration_low-cal',
 'consideration_low-carb',
 'consideration_raw',
 'consideration_low-sodium',
 'consideration_low-sugar',
 'consideration_high-fiber',
 'consideration_low-cholesterol',
 'consideration_kosher-for-passover',
 'consideration_organic',
 'consideration_high-protein']

In [36]:
# create a dataframe of just type data
df_consideration = df[['special-consideration']]
# df_consideration.head()

# create a new column that converts the string data into a list
df_consideration['consideration_list'] = df_consideration.apply(lambda x: parse_string_to_list('consideration_', x['special-consideration']), axis=1)
# df_consideration.head()

In [37]:
# create a list to plug all the parsed data into
data_considerations = []

# loop through all items in the dataframe, parse the type list for each row and add to the data_types list
for index, row in df_consideration.iterrows():
    parse_tag_list(data_considerations, consideration_tags, row['consideration_list'])

print(f'data_considerations length: {len(data_considerations)}')

print(f'df_consideration length: {len(df_consideration)}')

data_considerations length: 37085
df_consideration length: 37085


In [38]:
df_considerations = pd.DataFrame(data_considerations, columns=consideration_tags)
# df_types.head()

In [39]:
# see how each column totals look
df_considerations.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0,1
consideration_kosher,24233,12852
consideration_paleo,35645,1440
consideration_dairy-free,30897,6188
consideration_wheat-gluten-free,27250,9835
consideration_peanut-free,20101,16984
consideration_tree-nut-free,22695,14390
consideration_soy-free,20543,16542
consideration_vegetarian,23135,13950
consideration_vegan,33478,3607
consideration_pescatarian,24479,12606


In [40]:
# append the parsed type data to the original dataframe
df = df.merge(df_considerations, left_index=True, right_index=True)

# Parse Tag: Source

In [41]:
# create list of unique type
source_tags = parse_unique_values_from_tag_list("source_", df['source'].unique().tolist())
source_tags

['source_gourmet',
 'source_bon-appetit',
 'source_self',
 'source_house-and-garden',
 'source_parade',
 'source_cookie',
 'source_harpercollins',
 'source_weelicious',
 'source_the-chew']

In [42]:
# create a dataframe of just type data
df_source = df[['source']]
# df_consideration.head()

# create a new column that converts the string data into a list
df_source['source_list'] = df_source.apply(lambda x: parse_string_to_list('source', x['source']), axis=1)
# df_consideration.head()

In [43]:
# create a list to plug all the parsed data into
data_sources = []

# loop through all items in the dataframe, parse the type list for each row and add to the data_types list
for index, row in df_source.iterrows():
    parse_tag_list(data_sources, source_tags, row['source_list'])

print(f'data_sources length: {len(data_sources)}')

print(f'df_source length: {len(df_source)}')

data_sources length: 37085
df_source length: 37085


In [44]:
df_sources = pd.DataFrame(data_sources, columns=source_tags)
# df_types.head()

# see how each column totals look
df_sources.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0
source_gourmet,37085
source_bon-appetit,37085
source_self,37085
source_house-and-garden,37085
source_parade,37085
source_cookie,37085
source_harpercollins,37085
source_weelicious,37085
source_the-chew,37085


In [45]:
# append the parsed type data to the original dataframe
df = df.merge(df_sources, left_index=True, right_index=True)

# Parse Tag: Cuisine

In [46]:
# create list of unique type
cuisine_tags = parse_unique_values_from_tag_list("cuisine_", df['cuisine'].unique().tolist())
cuisine_tags

['cuisine_italian',
 'cuisine_jewish',
 'cuisine_tex-mex',
 'cuisine_asian',
 'cuisine_american',
 'cuisine_spanish-portuguese',
 'cuisine_chinese',
 'cuisine_english',
 'cuisine_central-south-american',
 'cuisine_french',
 'cuisine_middle-eastern',
 'cuisine_mediterranean',
 'cuisine_sushi',
 'cuisine_cajun-creole',
 'cuisine_southwestern',
 'cuisine_scandinavian',
 'cuisine_eastern-european-russian',
 'cuisine_southern',
 'cuisine_european',
 'cuisine_central-american-caribbean',
 'cuisine_vietnamese',
 'cuisine_indian',
 'cuisine_african',
 'cuisine_mexican',
 'cuisine_german',
 'cuisine_korean',
 'cuisine_moroccan',
 'cuisine_greek',
 'cuisine_basque',
 'cuisine_thai',
 'cuisine_latin-american',
 'cuisine_irish',
 'cuisine_turkish',
 'cuisine_cuban',
 'cuisine_californian',
 'cuisine_british',
 'cuisine_indonesian',
 'cuisine_japanese',
 'cuisine_nuevo-latino',
 'cuisine_italian-american',
 'cuisine_southern-italian',
 'cuisine_new-england',
 'cuisine_southeast-asian',
 'cuisine_mi

In [47]:
# create a dataframe of just cuisine data
df_cuisine = df[['cuisine']]
# df_consideration.head()

# create a new column that converts the string data into a list
df_cuisine['cuisine_list'] = df_cuisine.apply(lambda x: parse_string_to_list('cuisine_', x['cuisine']), axis=1)
# df_consideration.head()

In [48]:
# create a list to plug all the parsed data into
data_cuisines = []

# loop through all items in the dataframe, parse the type list for each row and add to the data_types list
for index, row in df_cuisine.iterrows():
    parse_tag_list(data_cuisines, cuisine_tags, row['cuisine_list'])

print(f'data_cuisines length: {len(data_cuisines)}')

print(f'df_cuisine length: {len(df_cuisine)}')

data_cuisines length: 37085
df_cuisine length: 37085


In [49]:
df_cuisines = pd.DataFrame(data_cuisines, columns=cuisine_tags)
# df_types.head()

# see how each column totals look
df_cuisines.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0,1
cuisine_italian,34703,2382
cuisine_jewish,36633,452
cuisine_tex-mex,36944,141
cuisine_asian,35617,1468
cuisine_american,31692,5393
cuisine_spanish-portuguese,36699,386
cuisine_chinese,36769,316
cuisine_english,36850,235
cuisine_central-south-american,36923,162
cuisine_french,35695,1390


In [50]:
# append the parsed type data to the original dataframe
df = df.merge(df_cuisines, left_index=True, right_index=True)

# Parse Tag: Equipment

In [51]:
# create list of unique type
equipment_tags = parse_unique_values_from_tag_list("equipment_", df['equipment'].unique().tolist())
equipment_tags

['equipment_ice-cream-machine',
 'equipment_wok',
 'equipment_food-processor',
 'equipment_juicer',
 'equipment_mixer',
 'equipment_blender',
 'equipment_candy-thermometer',
 'equipment_double-boiler',
 'equipment_grill',
 'equipment_microwave',
 'equipment_ramekin',
 'equipment_pasta-maker',
 'equipment_pressure-cooker',
 'equipment_smoker',
 'equipment_bread-machine',
 'equipment_slow-cooker',
 'equipment_coffee-grinder',
 'equipment_mandoline',
 'equipment_mortar-and-pestle',
 'equipment_sheet-pan',
 'equipment_instant-pot',
 'equipment_cast-iron',
 'equipment_air-fryer']

In [52]:
# create a dataframe of just equipment data
df_equipment = df[['equipment']]
# df_consideration.head()

# create a new column that converts the string data into a list
df_equipment['equipment_list'] = df_equipment.apply(lambda x: parse_string_to_list('equipment_', x['equipment']), axis=1)
# df_consideration.head()

In [53]:
# create a list to plug all the parsed data into
data_equipments = []

# loop through all items in the dataframe, parse the type list for each row and add to the data_types list
for index, row in df_equipment.iterrows():
    parse_tag_list(data_equipments, equipment_tags, row['equipment_list'])

print(f'data_equipments length: {len(data_equipments)}')

print(f'df_equipment length: {len(df_equipment)}')

data_equipments length: 37085
df_equipment length: 37085


In [54]:
df_equipments = pd.DataFrame(data_equipments, columns=equipment_tags)
# df_types.head()

# see how each column totals look
df_equipments.apply(pd.Series.value_counts).transpose()

Unnamed: 0,0,1
equipment_ice-cream-machine,36803,282
equipment_wok,36945,140
equipment_food-processor,35355,1730
equipment_juicer,37065,20
equipment_mixer,35747,1338
equipment_blender,35990,1095
equipment_candy-thermometer,36974,111
equipment_double-boiler,37018,67
equipment_grill,36151,934
equipment_microwave,37004,81


In [55]:
# append the parsed type data to the original dataframe
df = df.merge(df_equipments, left_index=True, right_index=True)

# Export revised data frame with new columns

In [56]:
#export full dataframe
# df_.to_csv('df_scrape_parsed_tags.csv', index=False)