In [1]:
!pip install fake_useragent

Collecting fake_useragent
  Downloading fake_useragent-1.5.1-py3-none-any.whl (17 kB)
Installing collected packages: fake_useragent
Successfully installed fake_useragent-1.5.1


In [2]:
import re
import csv
import time
import json
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import concurrent.futures
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
# https://github.com/Ledka17/Parsing_banki_ru/blob/master/Parsing_banki_ru.ipynb
# https://github.com/kotikkonstantin/bankiru/blob/master/bankiru_parsing.ipynb
# https://github.com/KKQUEN/banki.ru_reviews_parser/blob/main/parser_banki.ru_ModeratedReviews.ipynb

# про многопоточность
# https://habr.com/ru/companies/otus/articles/771346/

Сначала парсим только ссылки, потом парсим текст, гуляя по ссылкам.

Парсим ссылки: вручную находим нужный диапазон страниц и парсим его. Будут дубликаты, так как отзывы добавляются очень быстро, поэтому после парсинга ссылок открываем файл и убираем дубликаты.
Затем парсим информацию по ссылкам.

In [3]:
# список банков - DONE!

%%time

for year in [2023, 2024]:

  df = pd.DataFrame({'place':[], 'name':[], 'rating':[], 'responses':[], 'answers':[]})

  page = 1
  i = 1
  while True:

    url = f'https://www.banki.ru/services/responses/?date={year}&page={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    items = soup.find_all("script", {"type": "application/ld+json"})

    if len(items) == 1:
      break

    for item in items[:50]:
      data = json.loads(item.text)
      if 'name' in data:
        place = i
        name = data['name']
        rating = data['aggregateRating']['ratingValue']
        responses = data['aggregateRating']['ratingCount']
        answers = data['aggregateRating']['reviewCount']

        df.loc[len(df)] = [i, name, rating, responses, answers]
        i += 1
    page += 1

  df.to_csv(f'banki_banks_{year}.csv', index=False)

In [4]:
# ссылки на отзывы - DONE! - однопотоковый

%%time

base_url = 'https://www.banki.ru'

with open('banki_responses_urls.csv', "w", encoding='utf-8') as w_file:
  file_writer = csv.writer(w_file, delimiter = ",", lineterminator="\n")
  file_writer.writerow(['url'])

  for page in tqdm(range(1,5)):

    url = f'https://www.banki.ru/services/responses/list/?type=all&page={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')


    items = soup.find_all('a', attrs={'class': 'link-simple',
                                      'data-gtm-click': '{"event":"GTM_event","eventCategory":"ux_data","eventAction":"click_responses_response_user_rating_banking_allReviewsPage"}'})

    for item in items:

      url_resp = base_url+item['href']

      file_writer.writerow([url_resp])

    if page % 10 == 0:
      w_file.flush()

100%|██████████| 4/4 [00:07<00:00,  1.75s/it]

CPU times: user 1.2 s, sys: 33.6 ms, total: 1.24 s
Wall time: 7.04 s





In [5]:
# ссылки на отзывы - DONE! (многопоточный)

%%time

ua = UserAgent()
headers = {'user-Agent': ua.google}


def get_urls(url, timeout=30):
  """Fetch one review-listing page and return the absolute URLs of the reviews it links to.

  Args:
    url: address of the listing page to download.
    timeout: seconds to wait for the server before giving up; without it a
      stalled connection would block its worker thread indefinitely.

  Returns:
    A list of strings, one per matching anchor, each being
    'https://www.banki.ru' followed by the anchor's href. Empty when the page
    has no matching anchors.

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.RequestException: the request failed or timed out.
  """
  # `headers` is the module-level dict holding the User-Agent for this cell.
  response = requests.get(url, headers=headers, timeout=timeout)
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')

  # Review links are the anchors with class "link-simple" and this exact GTM click payload.
  items = soup.find_all('a', attrs={'class': 'link-simple',
                                    'data-gtm-click': '{"event":"GTM_event","eventCategory":"ux_data","eventAction":"click_responses_response_user_rating_banking_allReviewsPage"}'})

  return ['https://www.banki.ru' + item['href'] for item in items]


def write_to_csv(urls):
  """Append every URL in `urls` to 'banki_urls.csv', one single-column row per URL.

  Args:
    urls: iterable of strings; an empty iterable leaves the file content unchanged.
  """
  # newline='' hands line-ending control to the csv writer (as the csv module
  # docs require), so rows end with the explicit "\n" on every platform.
  with open('banki_urls.csv', "a", encoding='utf-8', newline='') as w_file:
    file_writer = csv.writer(w_file, delimiter=",", lineterminator="\n")
    file_writer.writerows([url] for url in urls)


# Listing pages to scan; adjust the range to the page interval that is needed.
urls = [f'https://www.banki.ru/services/responses/list/?type=all&page={page}' for page in range(1, 5)]

# The file is opened in append mode so links from earlier runs are kept.
# The header is written only when the file is still empty: re-running the cell
# used to append another 'url' row, which later gets read back as if it were a link.
with open('banki_urls.csv', "a", encoding='utf-8', newline='') as w_file:
    file_writer = csv.writer(w_file, delimiter=",", lineterminator="\n")
    if w_file.tell() == 0:
        file_writer.writerow(['url'])

# Pages are downloaded by 4 worker threads; results are written from this
# (main) thread as each download finishes, so file writes never overlap.
with ThreadPoolExecutor(max_workers=4) as executor:
  futures = [executor.submit(get_urls, url) for url in urls]
  # total= lets tqdm show a real progress bar instead of a bare counter.
  for future in tqdm(as_completed(futures), total=len(futures)):
    results = future.result()  # re-raises any exception from the worker
    write_to_csv(results)

4it [00:02,  1.64it/s]

CPU times: user 1.27 s, sys: 35.2 ms, total: 1.31 s
Wall time: 2.47 s





In [6]:
# Reviews - DONE (multithreaded)

# One User-Agent string is picked here; get_review sends it with every request.
# Note: this rebinds the module-level `headers` used by the link-collecting cell as well.
ua = UserAgent()
headers = {'user-Agent': ua.chrome}

def get_review(url, timeout=30):
  """Download one review page and extract its fields as a flat row.

  Args:
    url: address of the review page.
    timeout: seconds to wait for the server before giving up; without it a
      stalled connection would block its worker thread indefinitely.

  Returns:
    A list of 18 values in this order: url, date_review, time_review,
    user_name, user_city, review_title, review_text, review_status, rating,
    clear_conditions_rating, polite_staff_rating, support_rating,
    app_site_rating, bank_name, is_bank_ans, time_bank_ans, date_bank_ans,
    bank_text_ans. When the page has no bank answer, is_bank_ans is 'no' and
    the three answer fields are 'unk'.

  Raises:
    requests.HTTPError: the server answered with a 4xx/5xx status.
    requests.RequestException: the request failed or timed out.
    AttributeError, TypeError: a required element was not found on the page
      (the lookup returned None and the following access failed).
  """
  # `headers` is the module-level dict holding the User-Agent for this cell.
  response = requests.get(url, headers=headers, timeout=timeout)
  response.raise_for_status()
  soup = BeautifulSoup(response.content, 'html.parser')

  # Date and time are sliced out of the same span, so it is looked up once.
  # NOTE(review): the fixed offsets assume a short prefix before a DD.MM.YYYY HH:MM stamp — confirm.
  published = soup.find('span', {'class':'l10fac986'}).text
  date_review = published[4:14]
  time_review = published[15:20]
  user_name = soup.find('span', {'class':'l17191939'}).text.strip()
  user_city = soup.find('span', {'class':'l3a372298'}).text
  review_title = soup.find('h1', {'class':'text-header-0 le856f50c'}).text.strip()
  review_text = soup.find("div", {"class":"lf4cbd87d ld6d46e58 lfd76152f"}).text.strip()
  rating = soup.find('div', {'class':'lbb810226'}).text.strip()
  # Reviews without a grade have no status section to read.
  if rating == 'Без оценки':
    review_status = 'unk'
  else:
    review_status = soup.find('section', {'class':'lf4cbd87d l9656ec89 lfd76152f'}).text.strip()

  # Every 'text-size-6' div contributes a label, defaulting to 0.
  additional_grades = {}
  for txts6 in soup.find_all('div', {"class": 'text-size-6'}):
      additional_grades[txts6.text] = 0
  # Labels and grade rows are paired by position. zip() stops at the shorter
  # sequence, so an extra grade row no longer raises IndexError.
  # The grade is the number of "l61f54b7b" occurrences in the row's markup —
  # presumably the class of a filled star; TODO confirm.
  grade_rows = soup.find_all('div', {'class': 'ld017b199'})
  for label, grade in zip(list(additional_grades), grade_rows):
      additional_grades[label] = str(grade).count("l61f54b7b")
  clear_conditions_rating = additional_grades.get('Прозрачные условия', 0)
  polite_staff_rating = additional_grades.get('Вежливые сотрудники', 0)
  support_rating = additional_grades.get('Доступность и поддержка', 0)
  app_site_rating = additional_grades.get('Удобство приложения, сайта', 0)

  bank_name = soup.find('img', {'class':'lazy-load'})['alt']
  bank_ans = soup.find('div', {'class':'l0e7bcaa7'})
  if bank_ans is None:
    is_bank_ans = 'no'
    date_bank_ans = 'unk'
    time_bank_ans = 'unk'
    bank_text_ans = 'unk'
  else:
    is_bank_ans = 'yes'
    # Reuse the answer node found above instead of searching the page again.
    answered_at = bank_ans.find('div', {'class':'l46c44745'}).text
    date_bank_ans = answered_at[:10].strip()
    time_bank_ans = answered_at[11:19].strip()
    bank_text_ans = bank_ans.find('div', {'class':'lb1789875'}).text.strip()

  # Field order matches the header row written to 'banki_reviews.csv'.
  review_row = [url, date_review, time_review, user_name, user_city, review_title, review_text, review_status,
               rating, clear_conditions_rating, polite_staff_rating, support_rating, app_site_rating,
               bank_name, is_bank_ans, time_bank_ans, date_bank_ans, bank_text_ans]

  return review_row


def write_to_csv(review_row):
  """Append one review as a single CSV row to 'banki_reviews.csv'.

  Args:
    review_row: sequence of field values for one review.
  """
  # newline='' hands line-ending control to the csv writer (as the csv module
  # docs require); otherwise line breaks inside quoted review text and the row
  # terminator can be rewritten by platform newline translation.
  with open('banki_reviews.csv', "a", encoding='utf-8', newline='') as w_file:
    file_writer = csv.writer(w_file, delimiter=",", lineterminator="\n")
    file_writer.writerow(review_row)

# Load the collected review links from the first column of the file.
# Listing pages shift while they are being scanned, so the same link can be
# collected more than once; duplicates are dropped here, keeping first-seen order.
urls = pd.read_csv('banki_urls.csv')
urls = urls.iloc[:,0].dropna().drop_duplicates().tolist()
# A file that was appended to across several runs may contain extra 'url' header rows.
urls = [link for link in urls if link != 'url']

# Start the output file from scratch with the header row; rows are appended by write_to_csv.
with open('banki_reviews.csv', "w", encoding='utf-8', newline='') as w_file:
  file_writer = csv.writer(w_file, delimiter = ",", lineterminator="\n")
  file_writer.writerow(['url', 'date_review', 'time_review', 'user_name', 'user_city', 'review_title', 'review_text', 'review_status',
                        'rating', 'clear_conditions_rating', 'polite_staff_rating', 'support_rating', 'app_site_rating',
                        'bank_name', 'is_bank_ans', 'time_bank_ans', 'date_bank_ans', 'bank_text_ans'])

# Reviews are downloaded by 4 worker threads; rows are written from this
# (main) thread as each download finishes, so file writes never overlap.
with ThreadPoolExecutor(max_workers=4) as executor:
  futures = [executor.submit(get_review, url) for url in urls]
  # total= lets tqdm show a real progress bar instead of a bare counter.
  for future in tqdm(as_completed(futures), total=len(futures)):
    results = future.result()  # re-raises any exception from the worker
    write_to_csv(results)

100it [00:43,  2.28it/s]
