# Text Mining Student Specialist  for UW libraries
* Prathibha Ramachandran



In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np

In [None]:
# from google.colab import drive
# drive.mount("/content/drive", force_remount=True)

1) Web scraping is performed using a python library called beautiful soup 
* collected few url food links
* extracted the title of the recipe and the ingredients list from each page
* created a  list of 100 recipes





In [2]:
class Recipe():
  def __init__(self, url):
    self.url = url
    
    page = BeautifulSoup(requests.get(self.url).content, 'html.parser')

    self.name = page.find("h1").text
    
    self.ingredients = []
    items = page.find_all("li", {"class":"wprm-recipe-ingredient"})
    for item in items:
      self.ingredients.append(item.text.strip().lower())

  @staticmethod
  def is_recipe_page(url):
    page = BeautifulSoup(requests.get(url).content, 'html.parser')
    items = page.find_all("li", {"class":"wprm-recipe-ingredient"})
    
    return(len(items) > 0)



In [3]:
seed_urls =  [
             "https://rainbowplantlife.com/category/recipes/vegan-breakfast/",
             "https://rainbowplantlife.com/category/recipes/lunch/",
             "https://rainbowplantlife.com/category/recipes/dinner/",
             "https://rainbowplantlife.com/category/recipes/dessert/",
             "https://rainbowplantlife.com/category/recipes/instant-pot-recipes/",
              "https://rainbowplantlife.com/category/recipes/side-dishes/"
]


In [4]:
seen_urls = set()
recipe_list = []

for seed_url in seed_urls:
  print("Collected recipes: %d" % len(recipe_list))
  if len(recipe_list) >= 100:
    break
  
  page = requests.get(seed_url)
  # BeautifulSoup enables to find the elements/tags in a webpage 
  html = BeautifulSoup(page.text, "html.parser")

  for link in html.find_all('a'):
      url = link.get('href')
      if url in seen_urls: 
        continue
      if url.startswith("https://rainbowplantlife.com/"):
        if Recipe.is_recipe_page(url):
          recipe_list.append(Recipe(url))
        else:
          if url not in seen_urls:
            seed_urls.append(url)

      seen_urls.add(url) 
      

Collected recipes: 0
Collected recipes: 19
Collected recipes: 38
Collected recipes: 51
Collected recipes: 69
Collected recipes: 82
Collected recipes: 98
Collected recipes: 101


In [5]:
# saving raw data
with open('rawData.csv', 'w') as outfile:
    # using csv.writer method from CSV package
    write = csv.writer(outfile)
    write.writerow(['url', 'name', 'ingredient'])
    for recipe in recipe_list:
      # saving url, name, ingredient to csv 
      for ingredient in recipe.ingredients: 
        write.writerow([recipe.url, recipe.name, ingredient])


2) Removing some stopwords for data cleaning

* Note : tried to use nltk library, didn't give good results as it just contains English words appearing coomnly. 
But the food websites have there own commonly occuring words used in the "food domain" which needs a differ stop words set. So I created a custom stop words set which takes care of this
* Took care to remove digits 

In [6]:
#from nltk.corpus import stopwords
#nltk_stop_words = set(stopwords.words("english"))

ingredients_stop_words = set([
                  'teaspoon', 'block', 'strongly', 'brewed', 'dusting', '~', '//', 'fresh', 'paste',
                  'large', 'baked', 'fresh', 'medium', 'as needed', 'crushed with a press finely', 'divided',
                  'flaky coarse', 'drained', 'liquid from a chickpeas', 'oil choice', 'freshly grated', 'fat',
                  'grated crushed', 'unsweetened', 'heaping','diced', 'medium', 'specially selected', 'extra virgin',
                  'juiced (about', 'golden', 'sliced in quarters', 'pint', 'cold','large', 'cracked', 
                  '(unsweetened, plain)', 'finely (use jalapeño a milder version pepper)', 'simply nature extra virgin',
                  'rinsed drained', '“lite”', 'smooth, creamy', 'your', 'several cracks freshly cracked',
                  'tablespoon', 'certified', 'torn', 'half', 'see below', 'fine', 'ground', 'finely',
                  'cup', 'taste', '-', 'ish', 'chop' , 'to', 'for', 'batch', 'simply nature' , 'raw', 
                  'ml', 'zest', 'plus', 'finish', '/', '-', '- -', '+ +', 'pure extract', 'extract',
                  'pinch', 'handful', 'minced', 'or', 'your favorite', 'pure', 'drained rinsed',
                   'optional', 'an', 'the', 'of', '+', 'inch', 'cm', '½' , '¼','⅓','freshly squeezed', ','
])

stop_words = ingredients_stop_words #| nltk_stop_words

In [7]:
def to_clean(ingredient):
  # taking care of numbers
  if any(c.isdigit() for c in ingredient):
    return True
  for stop_word in stop_words:
    if stop_word in ingredient:
      return True
  return False

In [8]:
def clean_ingredient(ingredient):
  words = ingredient.split()
  cleaned = [w for w in words if not to_clean(w)]
  return ' '.join(cleaned)

In [9]:
import csv
cleaned_ingredient_list = []
with open('cleanedData.csv', 'w') as outfile:
    # using csv.writer method from CSV package
    write = csv.writer(outfile)
    write.writerow(['url', 'name', 'ingredient'])
    for recipe in recipe_list:
      # url, name, ingredient
      for ingredient in recipe.ingredients: 
        write.writerow([recipe.url, recipe.name, clean_ingredient(ingredient)])
        cleaned_ingredient_list.append(clean_ingredient(ingredient))

3) Counting the top ingredients and proportion
* For this I created a dictinary with all the ingredietns and then sorted the top ingredients.
* Found the proportion using the count value of the ingredients/ total len of the csv file


In [10]:
total_len = len(cleaned_ingredient_list) # total len of csv
print(total_len) 

2173


In [11]:
ingredient_count = {i:cleaned_ingredient_list.count(i) for i in cleaned_ingredient_list}
ingredient_dict = dict(sorted(ingredient_count.items(), key=lambda item: item[1], reverse = True))
print(ingredient_dict)

{'': 116, 'black pepper': 68, 'kosher salt': 46, 'sea salt': 45, 'garlic': 31, 'cinnamon': 30, 'nutritional yeast': 27, 'maple syrup': 23, 'extra virgin olive oil': 23, 'cumin': 22, 'simply nature extra virgin olive oil': 20, 'water': 19, 'ginger': 19, 'rinsed': 17, 'baking powder': 17, 'yellow': 17, 'kosher salt sea salt': 16, 'vegetable broth': 16, 'cloves': 15, 'lemon juice': 15, 'baking soda': 14, 'onion powder': 14, 'olive oil': 13, 'turmeric': 13, 'kosher': 12, 'red pepper flakes': 12, 'smoked paprika': 11, 'squeezed lemon juice': 11, 'lemon': 11, 'flour': 11, 'dried': 10, 'garlic powder': 10, 'sugar': 10, 'garam masala': 10, 'bay leaf': 10, 'paprika': 9, 'garlic roughly': 9, 'tahini': 9, 'cashews': 9, 'kosher salt black pepper': 9, 'squeezed lime juice': 9, 'apple cider vinegar': 8, 'grated nutmeg': 8, 'cumin seeds': 8, 'lime juice': 7, 'allspice': 7, 'chili powder': 7, 'dijon mustard': 7, 'piece grated': 7, 'bay leaves': 7, 'coconut milk': 7, 'walnuts': 6, 'extra virgin olive':

In [12]:
def proportion(ingredient_dict): 
  for key in ingredient_dict.keys(): 
    proportion = (ingredient_dict[key]/ total_len)
    proportion = '{:.4f}'.format(proportion)
    return proportion

In [13]:
# saving dictionary results to the result csv
with open('result.csv', 'w') as f:
    for key in ingredient_dict.keys():
        f.write("%s,%s,%s\n,"%(key,ingredient_dict[key], proportion(ingredient_dict)))