In [78]:
import requests 
import pandas as pd
from bs4 import BeautifulSoup

In [79]:
PROJECT_ROOT = r"X:\python\food-scraper\food-scraper\data\\"



class Scraper():
    def __init__(self, base_url):
        self.base_url = base_url
    
    def scrape(self, url=None):
        response = requests.get(url) if url else requests.get(self.base_url)
        return response
        
    def parse(self, url=None):
        soup = BeautifulSoup(self.scrape(url).content, 'html.parser')
        return soup
    
    def list_to_series(self, list=None):
        return pd.Series(list)
    
    def series_to_csv(self, series=None, file_name=None):
        series.to_csv(PROJECT_ROOT + file_name, index=False)


class EnchantedLearningScraper(Scraper):
    def __init__(self, url='https://www.enchantedlearning.com/wordlist/food.shtml'):
        super().__init__(url)
        self.food_list = self.scrape_food_list()
        self.file_name = f'enchanted-learning-food-list.csv'
    
    # Scrape the page for the list of food items and return a python list
    def scrape_food_list(self):
        food_list = []
        for element in self.parse().find_all('div', 'wordlist-item'):
            food_list.append(element.get_text())
        return(food_list)
    
    def download_food_list_csv(self):
        self.series_to_csv(self.list_to_series(self.food_list), file_name=self.file_name)

class EnchantedLearningSpiceScraper(EnchantedLearningScraper):
    def __init__(self, url="https://www.enchantedlearning.com/wordlist/herbs.shtml"):
        super().__init__(url)
        self.file_name = f'enchanted-learning-spice-list.csv'

class FoodWishesScraper(Scraper):
    def __init__(self, url="https://foodwishes.blogspot.com/"):
        super().__init__(url) 
        self.older_posts = self.older_posts_link()
        self.next_page = self.older_posts_link()
        self.recipes = self.find_recipes()
    
    def older_posts_link(self):
        for a in self.parse().find_all('a'):
            for x in range(len(a)+1):
                try: 
                    if a['class'][x] == 'blog-pager-older-link':
                        return a['href']
                except: pass
        return None
    
    def find_recipes(self):
        recipes = []
        for h3 in self.parse().find_all('h3', {'class': 'post-title entry-title'}):
            recipes.append([f"{h3.find('a').text}",f"{h3.find('a')['href']}"])
        return recipes

In [80]:
# Execute this cell to download the food list from Enchanted Learning
els = EnchantedLearningScraper()
els.download_food_list_csv()
elss= EnchantedLearningSpiceScraper()
elss.download_food_list_csv()


ConnectionError: HTTPSConnectionPool(host='www.enchantedlearning.com', port=443): Max retries exceeded with url: /wordlist/food.shtml (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x04D86A50>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))

In [88]:
food_list_data_sources = [
    f'{PROJECT_ROOT}{els.file_name}',
    f'{PROJECT_ROOT}{elss.file_name}'
]

food_list_data = []

for source in food_list_data_sources:
    food_list_data.append(pd.read_csv(source, header=None)) 
    
result = pd.concat([food_list_data[0], food_list_data[1]], ignore_index=True)

print(result)



                   0
0       acorn squash
1    alfalfa sprouts
2             almond
3            anchovy
4              anise
..               ...
614           wasabi
615       watercress
616      wintergreen
617         woodruff
618   yellow mustard

[619 rows x 1 columns]


In [73]:
c = FoodWishesScraper()
recipe_dict = {
    "name": [],
    "link": [],
}
while c.next_page: 
    for recipe in c.recipes:
        recipe_dict["name"].append(recipe[0])
        recipe_dict["link"].append(recipe[1])
    c = FoodWishesScraper(c.next_page)
    print(c.next_page)
    print(c.recipes)
print(c.next_page)
print(c.recipes)

df = pd.DataFrame(recipe_dict)

print(df)
df.to_csv(PROJECT_ROOT + r'recipes.csv', index=False)


https://foodwishes.blogspot.com/search?updated-max=2019-07-19T15:32:00-04:00&max-results=4&start=4&by-date=false
[['Baltimore Peach Cake – Infested with Beauty and Deliciousness', 'https://foodwishes.blogspot.com/2019/07/baltimore-peach-cake-infested-with.html'], ['Fortune Cookies – I See Cookies in Your Future', 'https://foodwishes.blogspot.com/2019/07/fortune-cookies-i-see-cookies-in-your.html'], ['Buttermilk Barbecue Chicken – Sorry, Fried Chicken, Your Secret is Out', 'https://foodwishes.blogspot.com/2019/07/buttermilk-barbecue-chicken-not-just.html'], ['The Best Beef Tri Tip – This Tip is Tops', 'https://foodwishes.blogspot.com/2019/07/the-best-beef-tri-tip-pro-tip-literally.html']]


KeyboardInterrupt: 

                                                   name  \
0     Easy Chicken Enchiladas – Flatter Wasn’t Faste...   
1     Confetti Rice Salad – Celebrating Your Improve...   
2     Penang Pork Satay – Maybe Just Like the One at...   
3     Baltimore Peach Cake – Infested with Beauty an...   
4        Fortune Cookies – I See Cookies in Your Future   
...                                                 ...   
2526                       The "Ultimate" Roast Chicken   
2527       Do you wish your restaurant made more money?   
2528  Piquillo Peppers stuffed with Orange and Cumin...   
2529  Food Wishes Cooking Classes – 2 Great Ways to ...   
2530  Fabulous Fashions for Foodies!  Buy a shirt an...   

                                                   link  
0     https://foodwishes.blogspot.com/2019/08/easy-c...  
1     https://foodwishes.blogspot.com/2019/08/confet...  
2     https://foodwishes.blogspot.com/2019/08/penang...  
3     https://foodwishes.blogspot.com/2019/07/baltim...  
4