In [1]:
import pandas as pd 
import lxml
from lxml import html
import bs4 as bs
import requests
from io import StringIO, BytesIO
from PIL import Image
from pathlib import Path
from selenium import webdriver
import time


# %%
# URL on vybory.izbirkom.ru kept as a module-level constant.
# NOTE(review): no function in the visible cells reads this variable — confirm it is still needed.
main_url = 'http://www.vybory.izbirkom.ru/region/izbirkom'


# %%
def detect_captcha_text(image):
    '''
    Run OCR over a captcha picture.

    Input: the picture as raw bytes
    Output: the string produced by pytesseract for the grayscale picture

    '''
    import pytesseract
    # Decode the bytes fully (convert() forces the load) before the buffer is closed.
    with BytesIO(image) as buffer:
        rgba_picture = Image.open(buffer).convert("RGBA")
    grayscale_picture = rgba_picture.convert('L')
    return pytesseract.image_to_string(grayscale_picture)


# %%
def replace_o_as_0(list_of_char):
    '''
    Swap every lowercase 'o' for the digit '0'.

    Input: list of single characters (modified in place)
    Output: the same list object that was passed in
    '''
    for position, char in enumerate(list_of_char):
        if char == 'o':
            list_of_char[position] = '0'
    return list_of_char


# %%
def find_captcha(driver):
    '''
    Read the captcha shown on the current page.

    Screenshots the element with id "captchaImg", runs it through
    detect_captcha_text and replaces 'o' with '0' in the recognised text.

    Input: selenium driver positioned on a page with a captcha
    Output: tuple (element with id 'captcha', prepared captcha text)

    '''
    captcha_png = driver.find_element_by_id("captchaImg").screenshot_as_png
    recognised_chars = list(detect_captcha_text(captcha_png))
    cleaned_text = ''.join(replace_o_as_0(recognised_chars))
    # The input field is looked up only after OCR, same order as before.
    input_field = driver.find_element_by_id('captcha')
    return input_field, cleaned_text


# %%
def get_html_source(url):
    '''
    Open ``url`` in a fresh Chrome session, keep submitting the recognised
    captcha until the results page is shown, and return that page's html.

    Input: url of a page protected by the captcha form
    Output: html source (str) of the page reached after the captcha

    The browser started here is always closed before returning (also when
    an exception is raised), so no Chrome/chromedriver process is left behind.
    The captcha loop has no attempt limit: it repeats until the marker text
    appears in the page source.

    '''
    import time
    from selenium import webdriver

    # Text whose presence in the page source is treated as "captcha passed"
    # (Russian for "RESULTS OF THE ALL-RUSSIAN VOTE").
    results_marker = 'РЕЗУЛЬТАТЫ ОБЩЕРОССИЙСКОГО ГОЛОСОВАНИЯ'

    driver = webdriver.Chrome('./chromedriver')
    try:
        driver.get(url)
        time.sleep(1)
        # A click on the link to the all-Russian vote on the constitutional
        # amendments used to happen here; it is disabled, only the pauses remain.
        time.sleep(1)

        while True:
            captcha_input_field, text_prep = find_captcha(driver)
            time.sleep(1)

            captcha_input_field.send_keys(text_prep)
            time.sleep(1)
            driver.find_element_by_id('send').click()
            time.sleep(1)
            html_source = driver.page_source
            if results_marker in html_source:
                return html_source
    finally:
        # The driver is local to this function, so nobody else can close it.
        driver.quit()


# %%
def get_region_result_links(html_page):
    '''
    Collect region names and their links from the <option> tags of a page.

    The first <option> is skipped. When two options share the same text,
    the value of the later one is kept.

    Input: html page
    Output: pandas dataframe with columns 'Region' and 'Url'

    '''
    from bs4 import BeautifulSoup
    parsed_page = BeautifulSoup(html_page, 'html.parser')
    region_options = parsed_page.find_all('option')[1:]
    url_by_region = {opt.text: opt['value'] for opt in region_options}
    return pd.DataFrame(list(url_by_region.items()), columns=['Region', 'Url'])

In [2]:
def get_election_result(url, driver):
    '''
    Open a region page in the given driver, pass the captcha and click
    through to the print version of the summary vote table.

    Input: region url, an already started selenium driver
    Output: the 'Версия для печати' link element (after it has been clicked)

    '''
    import time

    driver.get(url)
    time.sleep(1)

    captcha_passed = False
    while not captcha_passed:
        input_field, recognised_text = find_captcha(driver)
        time.sleep(1)

        input_field.send_keys(recognised_text)
        time.sleep(1)
        driver.find_element_by_id('send').click()
        time.sleep(1)
        # HACK: success is detected by searching the page source for the results heading.
        captcha_passed = 'РЕЗУЛЬТАТЫ ОБЩЕРОССИЙСКОГО ГОЛОСОВАНИЯ' in driver.page_source

    driver.find_element_by_link_text('Сводная таблица итогов голосования').click()
    print_link = driver.find_element_by_link_text('Версия для печати')
    print_link.click()
    return print_link

In [3]:
# Load the table of region links from all_levels_links.csv; its 'Unnamed: 0' column becomes the index.
links = pd.read_csv('all_levels_links.csv', index_col='Unnamed: 0')

In [4]:
# Show the first row to check which columns were loaded.
links.head(1)

Unnamed: 0,REGION_LVL_1_NAME,REGION_LVL_1_URL,REGION_LVL_0_NAME,REGION_LVL_0_URL
0,1 Адыгейская,http://www.vybory.izbirkom.ru/region/izbirkom?...,Республика Адыгея (Адыгея),http://www.vybory.izbirkom.ru/region/izbirkom?...


In [5]:
def load_election_results(urls_df, region_level=0, max_urls=1):
    '''
    Open region pages in Chrome and trigger the download of their
    summary vote tables.

    Input:
        urls_df: dataframe with a ``REGION_LVL_{region_level}_URL`` column
        region_level: which level's URL column to read (default 0)
        max_urls: how many urls from that column to process; the default 1
            keeps the previous behaviour (only the first url is visited),
            ``None`` processes every url
    Output: None; Chrome is configured to save downloads into
        ``<current working directory>/election results level_<region_level>``

    Raises: KeyError if the expected URL column is missing (checked before
        the browser is started)

    '''
    import os  # not imported at the top of the notebook; needed for getcwd()

    # Fail fast on a wrong region_level instead of after Chrome has been launched.
    urls = urls_df[f'REGION_LVL_{region_level}_URL']

    # Folder for downloaded files, relative to the current working directory.
    # This function does not create or clean the folder itself.
    saving_folder = f'/election results level_{region_level}'

    # Chrome download preferences.
    # NOTE(review): an earlier comment insisted on a trailing slash for this path,
    # but none has ever been appended — confirm Chrome accepts the path as is.
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.default_content_settings.popups": 0,
             "download.default_directory": os.getcwd() + saving_folder,
             "directory_upgrade": True}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome('./chromedriver', chrome_options=chrome_options)

    # NOTE(review): the browser is left open on return, presumably so a download
    # started by the last click can finish — confirm, then add a wait + driver.quit().
    for processed, url in enumerate(urls):
        if max_urls is not None and processed >= max_urls:
            break
        get_election_result(url, driver)


    

In [6]:
# Start the downloader on the REGION_LVL_1_URL column of `links` (region_level=1).
load_election_results(links, 1)