## 1. Setup

In [25]:
import os
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

#### Import Cuisine Data

In [65]:
cuisine = pd.read_json('./data/train.json')
display(cuisine)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli..."


In [13]:
cuisine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           39774 non-null  int64 
 1   cuisine      39774 non-null  object
 2   ingredients  39774 non-null  object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


#### Import Recipe Data

In [62]:
all_recipes = pd.read_json('./data/recipes_raw_nosource_allrecipes.json', orient='index')
epicurious = pd.read_json('./data/recipes_raw_nosource_epicurious.json', orient='index')
food_network = pd.read_json('./data/recipes_raw_nosource_foodnetwork.json', orient='index')
recipes = pd.concat([all_recipes, epicurious, food_network], axis=0)

In [63]:
recipes = recipes.reset_index()
recipes = recipes.drop(columns=['index', 'picture_link'])
display(recipes)

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...
...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...


In [64]:
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124647 entries, 0 to 124646
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   title         124595 non-null  object
 1   ingredients   124647 non-null  object
 2   instructions  124473 non-null  object
dtypes: object(3)
memory usage: 2.9+ MB


## 2. Pre-Processing

### A. Cleaning and Tokenization

#### Add stopwords

In [301]:
from nltk.corpus import stopwords
stopword_list = stopwords.words("english")
addl_stop_words = ['advertisement', 'advertisments', 'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'ounce', 'ounces', 'salt', 'pepper', 'pound', 'pounds']
stopword_list.extend(addl_stop_words)

#### String cleaning function

In [298]:
def clean_string(list, lemmatize = True, stemming = False):
    
    str = ' '.join(list)
    clean_text = " "

    for text in str:
        lower = text.lower() #lowercase the words
        token = word_tokenize(lower) #tokenize words
        rem_stop = [i for i in token if i not in stopword_list] #remove stopwords
        
        clean_words = []
        for word in rem_stop: 
            if len(word) > 2 and re.sub(r'\d+','', word) and re.sub(' +', ' ', word):
                #remove punctuation, digits,  and retain word length greater than 2
                
                if lemmatize:
                    lm = WordNetLemmatizer()  #lemmatize
                    lemm = lm.lemmatize(word)
                    
                    clean_words.append(lemm)

                if stemming:
                    stemmer = PorterStemmer # stem
                    stemm = stemmer.stem(word)

                    clean_words.append(stemm)

        clean_text = " ".join(clean_words)
        
    return clean_text

#### Clean Ingredients

In [299]:
recipes['clean_ingredients'] = clean_string(recipes['ingredients'])

TypeError: sequence item 0: expected str instance, list found

In [300]:
recipes['clean_ingredients'] = recipes['ingredients'].apply(lambda x: clean_string(x))

KeyboardInterrupt: 

In [35]:
display(recipes)

Unnamed: 0,title,ingredients,instructions,clean_ingredients
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ...",advertisement
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",advertisement
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....,advertisement
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....,advertisement
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...,advertisement
...,...,...,...,...
124642,Summer Corn Salad,"[4 ears fresh corn, 2 heads Belgian endive, 2 ...",Watch how to make this recipe.\nPreheat a gril...,1/4 teaspoon cayenne pepper taste
124643,Zucchini Stuffed Tomatoes,"[4 large plum tomatoes, Salt and sugar, 1 1/2 ...",Preheat the broiler. Cut the tomatoes in 1/2 c...,1/2 pound grated gruyere
124644,Pepper Pasta Quick Cook,"[3 tablespoons olive oil, 2 tablespoons unsalt...",Heat the oil and butter in a large skillet ove...,coarse salt
124645,Chocolate Cake with Armagnac Ice Cream,"[8 ounces butter, 8 ounces bittersweet chocola...",Preheat oven to 350 degrees. On the top half o...,flour pan


In [352]:
def clean_string(list, lemmatize = True, stemming = False):
    str = ' '.join(list)
    clean_text = ''
    
    lower = str.lower().split()
    #rem_stop = [i for i in lower if i not in stopword_list]
    
    clean_words = []
    for word in lower:
        if len(word) > 2:
            dig = re.sub(r'\d+','', word)
            #space = re.sub(' +', '', dig)
            text = re.sub(r'[^\w\s]', '', dig)
            
            
            if lemmatize:
                lm = WordNetLemmatizer()  
                lemm = lm.lemmatize(text)
                clean_words.append(lemm)
                
                if stemming:
                    stemmer = PorterStemmer # stem
                    stemm = stemmer.stem(text)
                    clean_words.append(stemm)
         
    rem_stop = [i for i in clean_words if i not in stopword_list]  
    
    clean_text = ' '.join(rem_stop)
    space = re.sub(' +', ' ', clean_text)
    
    return space

In [353]:
dataset = recipes.head(n = 20)

In [354]:
dataset['clean_ingredients'] = dataset['ingredients'].apply(lambda x: clean_string(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['clean_ingredients'] = dataset['ingredients'].apply(lambda x: clean_string(x))


In [305]:
dataset

Unnamed: 0,title,ingredients,instructions,clean_ingredients
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ...",skinless boneless chicken breast half butter ...
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",ounce can condensed cream mushroom soup ounce...
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....,packed brown sugar ketchup lean ground beef...
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....,butter softened white sugar packed brown sugar...
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...,whole wheat rotini pasta fresh broccoli floret...
5,Banana Banana Bread,"[2 cups all-purpose flour ADVERTISEMENT, 1 tea...",Preheat oven to 350 degrees F (175 degrees C)....,allpurpose flour baking soda butter brown s...
6,Chef John's Fisherman's Pie,"[For potato crust: ADVERTISEMENT, 3 russet pot...",Bring a large saucepan of salted water and to ...,potato crust russet potato peeled cut chunk bu...
7,Mom's Zucchini Bread,"[3 cups all-purpose flour ADVERTISEMENT, 1 tea...",Grease and flour two 8 x 4 inch pans. Preheat ...,allpurpose flour baking soda baking powder gro...
8,The Best Rolled Sugar Cookies,"[1 1/2 cups butter, softened ADVERTISEMENT, 2 ...","In a large bowl, cream together butter and sug...",butter softened white sugar egg vanilla extra...
9,Singapore Chili Crabs,"[Sauce: ADVERTISEMENT, 1/2 cup ketchup ADVERTI...","Whisk ketchup, chicken broth, egg, soy sauce, ...",sauce ketchup chicken broth large egg soy sa...


In [355]:
print(dataset['ingredients'][15])

print('\n')

print(dataset['clean_ingredients'][15])

print('\n')

['1/2 cup Parmesan cheese ADVERTISEMENT', '1/4 cup butter, softened ADVERTISEMENT', '3 tablespoons mayonnaise ADVERTISEMENT', '2 tablespoons fresh lemon juice ADVERTISEMENT', '1/4 teaspoon dried basil ADVERTISEMENT', '1/4 teaspoon ground black pepper ADVERTISEMENT', '1/8 teaspoon onion powder ADVERTISEMENT', '1/8 teaspoon celery salt ADVERTISEMENT', '2 pounds tilapia fillets ADVERTISEMENT', 'ADVERTISEMENT']


 parmesan cheese butter softened mayonnaise fresh lemon juice dried basil ground black onion powder celery tilapia fillet




In [360]:
dataset.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset.dropna(inplace=True)
