In [1]:
import urllib.request
from tqdm.notebook import tqdm
import six
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
html_code = '''
<!DOCTYPE html>
<html lang="ru" prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# profile: http://ogp.me/ns/profile# fb: http://ogp.me/ns/fb#">
<head>
    <title> Just a Title</title>
</head>
<body class="layout narrow-view g-desktop " id="hey js-body">
    <p class='text odd'> This is a <b>bold</b> statement</p>
    <p class='text even'> This is my <a href="www.github.com">link</a>, click it</p>
    <p class='list odd'> This is my <a href="www.github.com"><b>bold link</b></a>, click it</p>
</body>
'''

In [3]:
soup = BeautifulSoup(html_code, 'html.parser') # lxml, html5lib

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="ru" prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# profile: http://ogp.me/ns/profile# fb: http://ogp.me/ns/fb#">
 <head>
  <title>
   Just a Title
  </title>
 </head>
 <body class="layout narrow-view g-desktop" id="hey js-body">
  <p class="text odd">
   This is a
   <b>
    bold
   </b>
   statement
  </p>
  <p class="text even">
   This is my
   <a href="www.github.com">
    link
   </a>
   , click it
  </p>
  <p class="list odd">
   This is my
   <a href="www.github.com">
    <b>
     bold link
    </b>
   </a>
   , click it
  </p>
 </body>
</html>


In [5]:
# Хочу получить первый тег p
type(soup.body)

bs4.element.Tag

In [6]:
soup.body

<body class="layout narrow-view g-desktop" id="hey js-body">
<p class="text odd"> This is a <b>bold</b> statement</p>
<p class="text even"> This is my <a href="www.github.com">link</a>, click it</p>
<p class="list odd"> This is my <a href="www.github.com"><b>bold link</b></a>, click it</p>
</body>

In [7]:
soup.p

<p class="text odd"> This is a <b>bold</b> statement</p>

In [8]:
# Хочу получить первый тег p и взять значение class (get)
soup.p['class']

['text', 'odd']

In [9]:
soup.body['id']

'hey js-body'

In [10]:
soup.p.next.next.next.next

' statement'

In [11]:
soup.p.parent.parent

<html lang="ru" prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# profile: http://ogp.me/ns/profile# fb: http://ogp.me/ns/fb#">
<head>
<title> Just a Title</title>
</head>
<body class="layout narrow-view g-desktop" id="hey js-body">
<p class="text odd"> This is a <b>bold</b> statement</p>
<p class="text even"> This is my <a href="www.github.com">link</a>, click it</p>
<p class="list odd"> This is my <a href="www.github.com"><b>bold link</b></a>, click it</p>
</body>
</html>

In [12]:
# parents
[tag.name for tag in soup.p.parents]

['body', 'html', '[document]']

In [13]:
soup.p.contents

[' This is a ', <b>bold</b>, ' statement']

In [14]:
# children, contents
[tag.name for tag in soup.p.children]

[None, 'b', None]

In [15]:
# next sibling, next siblings, find next siblings
list(soup.p.next_siblings)

['\n',
 <p class="text even"> This is my <a href="www.github.com">link</a>, click it</p>,
 '\n',
 <p class="list odd"> This is my <a href="www.github.com"><b>bold link</b></a>, click it</p>,
 '\n']

In [16]:
soup.p.find_next_siblings(class_='list')

[<p class="list odd"> This is my <a href="www.github.com"><b>bold link</b></a>, click it</p>]

In [17]:
# find all
soup.find_all('p', 'odd')

[<p class="text odd"> This is a <b>bold</b> statement</p>,
 <p class="list odd"> This is my <a href="www.github.com"><b>bold link</b></a>, click it</p>]

In [18]:
soup


<!DOCTYPE html>

<html lang="ru" prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# profile: http://ogp.me/ns/profile# fb: http://ogp.me/ns/fb#">
<head>
<title> Just a Title</title>
</head>
<body class="layout narrow-view g-desktop" id="hey js-body">
<p class="text odd"> This is a <b>bold</b> statement</p>
<p class="text even"> This is my <a href="www.github.com">link</a>, click it</p>
<p class="list odd"> This is my <a href="www.github.com"><b>bold link</b></a>, click it</p>
</body>
</html>

In [19]:
soup.find_all('p')[1].a['href']

'www.github.com'

In [20]:
soup.find_all('p')[-1].b.text

'bold link'

## Задача про Еду 🍔

Хотим научиться по названию блюда определять, какая это кухня.

Пример: категоризация ресторанов по меню.

Будем парсить [eda.ru](eda.ru)

In [21]:
import requests

In [22]:
cousine_code = 'italyanskaya-kuhnya'
page_num = 200
url = f'https://eda.ru/recepty/{cousine_code}?page={page_num}'

In [23]:
response = requests.get(url)

In [24]:
response

<Response [200]>

In [25]:
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
print(soup.prettify())

In [27]:
cousines_html_block = soup.find_all('div', 'select-suggest js-select-suggest')[1]
cousines = cousines_html_block.find_all('li')[2:]
cousine_names = [cousine.text.strip() for cousine in cousines]
cousine_codes = [cousine['data-select-suggest-value'] for cousine in cousines]
cousine_code_to_name = dict(zip(cousine_codes, cousine_names))
cousine_code_to_name

{'italyanskaya-kuhnya': 'Итальянская',
 'gruzinskaya-kuhnya': 'Грузинская',
 'kitayskaya-kuhnya': 'Китайская',
 'francuzskaya-kuhnya': 'Французская',
 'russkaya-kuhnya': 'Русская',
 'yaponskaya-kuhnya': 'Японская',
 'indiyskaya-kuhnya': 'Индийская',
 'meksikanskaya-kuhnya': 'Мексиканская',
 'armyanskaya-kuhnya': 'Армянская',
 'amerikanskaya-kuhnya': 'Американская',
 'ispanskaya-kuhnya': 'Испанская',
 'nemeckaya-kuhnya': 'Немецкая',
 'grecheskaya-kuhnya': 'Греческая',
 'azerbaydzhanskaya-kuhnya': 'Азербайджанская',
 'evropeyskaya-kuhnya': 'Европейская',
 'evreyskaya-kuhnya': 'Еврейская',
 'koreyskaya-kuhnya': 'Корейская',
 'tayskaya-kuhnya': 'Тайская',
 'panaziatskaya-kuhnya': 'Паназиатская',
 'tureckaya-kuhnya': 'Турецкая',
 'uzbekskaya-kuhnya': 'Узбекская',
 'tatarskaya-kuhnya': 'Татарская',
 'sredizemnomorskaya-kuhnya': 'Средиземноморская',
 'arabskaya-kuhnya': 'Арабская',
 'ukrainskaya-kuhnya': 'Украинская',
 'polskaya-kuhnya': 'Польская',
 'britanskaya-kuhnya': 'Британская',
 'belo

In [28]:
import numpy as np
cousine_codes = list(set(cousine_codes) - {'all'})

In [29]:
meal_soup = soup('div', 'horizontal-tile__content')[0]
meal_tags = [tag.text for tag in meal_soup('li')]
meal_name = meal_soup.h3.text.strip().replace('\xa0', ' ')
meal_url = f'https://eda.ru{meal_soup.h3.a["href"]}' 
meal_tags, meal_name, meal_url

(['Закуски', 'Итальянская кухня'],
 'Винный цикорий',
 'https://eda.ru/recepty/zakuski/vinnij-cikorij-35571')

In [55]:
%%time

meal_names = []
meal_tags = []
meal_urls = []
meal_cousine_names = []
meal_images = []
MAX_NUM_PAGES = 1000
GOOD_RESPONSE_STATUS = 200

for cousine_code in tqdm(cousine_codes):
    print(cousine_code)
    for page_num in range(1, MAX_NUM_PAGES):
        url = f'https://eda.ru/recepty/{cousine_code}?page={page_num}'
        try:
            response = requests.get(url, timeout=20)
        except:
            break
        if response.status_code != GOOD_RESPONSE_STATUS:
            break
        soup = BeautifulSoup(response.content, 'html.parser')
        meal_soups = soup('div', 'horizontal-tile__content')
        # Чтобы быстро работало важно дописать это:
        if len(meal_soups) == 0:
            break
        
        image_links = [
            image.get('xlink:href')
            for image in soup('image')
        ]
        assert len(image_links) == len(meal_soups)
        
        meal_images += image_links
        for meal_soup in meal_soups:
            tags = [tag.text for tag in meal_soup('li')]
            name = meal_soup.h3.text.strip().replace('\xa0', ' ')
            url = f'https://eda.ru{meal_soup.h3.a["href"]}' 
            meal_tags.append(tags)
            meal_urls.append(url)
            meal_names.append(name)
            meal_cousine_names.append(
                cousine_code_to_name[cousine_code]
            )

  0%|          | 0/75 [00:00<?, ?it/s]

italyanskaya-kuhnya
belorusskaya-kuhnya
avstraliyskaya-kuhnya
vostochno-indiyskaya-kuhnya
odesskaya-kuhnya
evropeyskaya-kuhnya
latinoamerikanskaya-kuhnya
chechenskaya-kuhnya
kazahskaya-kuhnya
estonskaya-kuhnya
yugoslavskaya-kuhnya
turkmenskaya-kuhnya
argentinskaya-kuhnya
azerbaydzhanskaya-kuhnya
kirgizskaya-kuhnya
kitayskaya-kuhnya
severno-indiyskaya-kuhnya
avstriyskaya-kuhnya
tayskaya-kuhnya
afganskaya-kuhnya
bolgarskaya-kuhnya
shvedskaya-kuhnya
maltiyaskaya-kuhnya
arabskaya-kuhnya
irlandskaya-kuhnya
katalonskaya-kuhnya
evreyskaya-kuhnya
moldavskaya-kuhnya
tadzhikskaya-kuhnya
finskaya-kuhnya
uzbekskaya-kuhnya
bashkirskaya-kuhnya
francuzskaya-kuhnya
tureckaya-kuhnya
vengerskaya-kuhnya
vesterosskaya-kuhnya
belgiyskaya-kuhnya
kubinskaya-kuhnya
vetnamskaya-kuhnya
pendzhabskaya-kuhnya
datskaya-kuhnya
malaziyskaya-kuhnya
yaponskaya-kuhnya
marokkanskaya-kuhnya
livanskaya-kuhnya
abkhaziancuisine
krymskaya-kuhnya
amerikanskaya-kuhnya
tatarskaya-kuhnya
polskaya-kuhnya
peruanskaya_kuhnya
singapu

In [60]:
import pandas as pd

df = pd.DataFrame(
    {
        'meal_names': meal_names,
        'meal_tags': meal_tags,
        'meal_urls': meal_urls,
        'meal_cousine_names': meal_cousine_names,
        'meal_images': meal_images
    }
)
df.drop_duplicates(subset=['meal_urls'], inplace=True)
df

Unnamed: 0,meal_names,meal_tags,meal_urls,meal_cousine_names,meal_images
0,Спагетти карбонара с красным луком,"[Пошаговые рецепты, Паста и пицца, Итальянская...",https://eda.ru/recepty/pasta-picca/spagetti-ka...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
1,Лазанья классическая с мясом,"[Пошаговые рецепты, Паста и пицца, Итальянская...",https://eda.ru/recepty/pasta-picca/lazanja-kla...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
2,Салат из красной фасоли с творожным сыром…,"[Пошаговые рецепты, Салаты, Итальянская кухня]",https://eda.ru/recepty/salaty/salat-iz-krasnoj...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
3,Брускетта с помидорами,"[Пошаговые рецепты, Закуски, Сэндвичи, Итальян...",https://eda.ru/recepty/zakuski/brusketta-s-pom...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
4,Фриттата с брокколи и сладким перцем,"[Пошаговые рецепты, Завтраки, Итальянская кухня]",https://eda.ru/recepty/zavtraki/frittata-s-bro...,Итальянская,https://eda.ru/img/eda/c20x20i/s1.eda.ru/Stati...
...,...,...,...,...,...
33678,Салат из зелени с ореховой заправкой,"[Салаты, Авторская кухня, Веганская еда]",https://eda.ru/recepty/salaty/salat-iz-zeleni-...,Авторская,https://eda.ru/img/eda/c20x20i/s1.eda.ru/Stati...
33679,Коктейль «Личи с малиной — тини»,"[Напитки, Авторская кухня]",https://eda.ru/recepty/napitki/kokteyl-lichi-s...,Авторская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
33680,Коктейль «Лавандовый сауэр»,"[Напитки, Авторская кухня]",https://eda.ru/recepty/napitki/kokteyl-lavando...,Авторская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
33681,Треска по рецепту Эммы,"[Основные блюда, Авторская кухня]",https://eda.ru/recepty/osnovnye-blyuda/treska-...,Авторская,https://eda.ru/img/eda/c20x20i/s1.eda.ru/Stati...


In [61]:
df.to_csv('meal_new_db.csv', encoding='utf-8', index=False)
df = pd.read_csv('meal_new_db.csv')
df.head()

Unnamed: 0,meal_names,meal_tags,meal_urls,meal_cousine_names,meal_images
0,Спагетти карбонара с красным луком,"['Пошаговые рецепты', 'Паста и пицца', 'Италья...",https://eda.ru/recepty/pasta-picca/spagetti-ka...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
1,Лазанья классическая с мясом,"['Пошаговые рецепты', 'Паста и пицца', 'Италья...",https://eda.ru/recepty/pasta-picca/lazanja-kla...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
2,Салат из красной фасоли с творожным сыром…,"['Пошаговые рецепты', 'Салаты', 'Итальянская к...",https://eda.ru/recepty/salaty/salat-iz-krasnoj...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
3,Брускетта с помидорами,"['Пошаговые рецепты', 'Закуски', 'Сэндвичи', '...",https://eda.ru/recepty/zakuski/brusketta-s-pom...,Итальянская,https://eda.ru/img/eda/c20x20i/s2.eda.ru/Stati...
4,Фриттата с брокколи и сладким перцем,"['Пошаговые рецепты', 'Завтраки', 'Итальянская...",https://eda.ru/recepty/zavtraki/frittata-s-bro...,Итальянская,https://eda.ru/img/eda/c20x20i/s1.eda.ru/Stati...


In [62]:
indexes_match_queries = df.apply(
    lambda row: 'брокколи' in row['meal_names'].lower(),
    axis=1,
)
df[indexes_match_queries].sample(1)

Unnamed: 0,meal_names,meal_tags,meal_urls,meal_cousine_names,meal_images
18978,"Салат с брокколи, медом и горчицей","['Салаты', 'Французская кухня', 'Вегетарианска...",https://eda.ru/recepty/salaty/salat-s-brokkoli...,Французская,https://eda.ru/img/eda/c20x20i/s1.eda.ru/Stati...


In [63]:
class SimilarMealFinder:
    
    def __init__(self, data: pd.DataFrame):
        self.data = data
        
    @staticmethod
    def from_csv_path(csv_path: str):
        return SimilarMealFinder(
            pd.read_csv(csv_path),
        )

    @staticmethod
    def from_excel_path(excel_path: str):
        return SimilarMealFinder(
            pd.read_excel(excel_path),
        )    

    def __call__(self, meal_query: str) -> str:
        indexes_match_queries = self.data.apply(
            lambda row: meal_query in row['meal_names'].lower(),
            axis=1,
        )
        #return indexes_match_queries
        data_sample = self.data[indexes_match_queries]
        #return data_sample
        most_likely_cousine = data_sample.meal_cousine_names.mode().iloc[0]
        # Pick an arbitrary meal that matches the query.
        random_query_response = data_sample[
            data_sample['meal_cousine_names'] == most_likely_cousine
        ].sample(1)
        return random_query_response

In [64]:
similar_meal_finder = SimilarMealFinder.from_csv_path('meal_new_db.csv')

In [66]:
similar_meal_finder('креветки')

Unnamed: 0,meal_names,meal_tags,meal_urls,meal_cousine_names,meal_images
5399,Креветки со сморчками и грибным маслом,"['Закуски', 'Европейская кухня']",https://eda.ru/recepty/zakuski/krevetki-so-smo...,Европейская,https://eda.ru/img/eda/c20x20i/s1.eda.ru/Stati...


In [67]:
similar_meal_finder('брокколи')

Unnamed: 0,meal_names,meal_tags,meal_urls,meal_cousine_names,meal_images
5930,Киш с брокколи и томатами,"['Выпечка и десерты', 'Европейская кухня']",https://eda.ru/recepty/vypechka-deserty/kish-s...,Европейская,https://eda.ru/img/eda/c20x20i/s1.eda.ru/Stati...


# 3. API  
__API (Application Programming Interface)__ — this is a ready-made code that you can insert into your code! Many services, including Google and Vkontakte, provide their own ready-made solutions for your development

Examples: 

* [VK API](https://vk.com/dev/methods)
* [API Twitter](https://developer.twitter.com/en/docs.html) ,
* [API YouTube](https://developers.google.com/youtube/v3/)
* [API Google Maps](https://developers.google.com/maps/documentation/) 
* [Aviasales](https://www.aviasales.ru/API)
* [Yandex Translate](https://yandex.ru/dev/translate/)

In [68]:
with open('google-dev-token.txt') as f:
    google_token = f.read()

Работать со своими API токенами можно тут: https://console.cloud.google.com/apis/credentials

In [None]:
mainpage = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?'

location = '55.86,37.54'
radius = '3000'
keyword = 'креветки' # coffee shop

parameters = 'location='+location+'&radius='+radius+'&keyword='+keyword+'&language=ru-Ru'+'&key=' + google_token

final_url = mainpage + parameters 

# Your Google API token should appear below, don't disclose it. Disable the token in case it's compromised.
final_url

In [72]:
response = requests.get(final_url)

response.json()['results']

[{'business_status': 'OPERATIONAL',
  'geometry': {'location': {'lat': 55.883247, 'lng': 37.498813},
   'viewport': {'northeast': {'lat': 55.88450327989272,
     'lng': 37.50019707989271},
    'southwest': {'lat': 55.88180362010728, 'lng': 37.49749742010727}}},
  'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/shopping-71.png',
  'name': 'Рыбоедовъ',
  'opening_hours': {'open_now': False},
  'photos': [{'height': 800,
    'html_attributions': ['<a href="https://maps.google.com/maps/contrib/108999907613617595257">A Google User</a>'],
    'photo_reference': 'Aap_uEBS4AkIzETFbxa2X_-BaW32ZT2IRWbesFRLXvFZBTJRw-djIO7Czmm7MtH1XVDKYGCQQoU4fssM3WXKS8r2iQr-OCz9ijzjN7G8-opRdkKQfASDe-Ww5_EkvBDzlSGsHwI6zjGKP2ZvhwZOmOCgf7OXQxOpaKbg17mrFvKrMGqCJLw3',
    'width': 800}],
  'place_id': 'ChIJ0cKUycw5tUYRrFbh3yjs7Aw',
  'plus_code': {'compound_code': 'VFMX+7G р-н Западное Дегунино, Москва',
   'global_code': '9G7VVFMX+7G'},
  'rating': 5,
  'reference': 'ChIJ0cKUycw5tUYRrFbh3yjs7Aw',