In [25]:
from bs4 import BeautifulSoup
import urllib.request
import requests
import re

import numpy
import pandas as pd 

import json
import csv
from itertools import chain
from typing import List, Tuple
from multiprocessing.dummy import Pool as ThreadPool 

In [47]:
def get_vacancy_urls(initial_search_url: str) -> List[str]:
    
    def get_vacancy_urls_from_all_pages(search_url: str) -> List[str]:
        total_pages_with_results = max([0] + [int(a.contents[0]) for a in BeautifulSoup(urllib.request.urlopen(search_url)).findAll("a", {"class": "HH-Pager-Control"}) if len(a.contents[0]) <= 3])
        vacancy_ulrs = []
        for i in range(total_pages_with_results):
            results_response = urllib.request.urlopen(search_url + f"&page={i}")
            results_soup = BeautifulSoup(results_response)
            vacancy_divs = results_soup.findAll("div", {"class": "vacancy-serp-item"})
            vacancy_ulrs += [div.findAll("a", {"class":"HH-LinkModifier"})[0]['href'] for div in vacancy_divs if len(div.findAll("a", {"class":"HH-LinkModifier"}))]
        return vacancy_ulrs

    initial_search_results_page = urllib.request.urlopen(initial_search_url)
    initial_search_results_soup = BeautifulSoup(initial_search_results_page)
    clusters = initial_search_results_soup.findAll("div", {"class": "clusters-group-title"})
    divs = [c for c in clusters if c.attrs["data-toggle"] in ["professionalArea", "industry"]]
    cildren_contents = [[c for c in div.findNextSibling().children][0].contents for div in divs]
    urls_with_filters = [initial_search_url.split('?')[0] + url_ending for url_ending in list(chain(*[[el.findAll("a")[0].attrs["href"] for el in [c1 for c1 in c[1:]] if len(el.findAll("a"))] for c in cildren_contents]))]
    return list(set(chain(*[get_vacancy_urls_from_all_pages(url) for url in urls_with_filters])))

In [48]:
def save_vacancy_urls(search_urls: List[str], path: str) -> bool:
    vacancy_urls = set([])
    for search_url in search_urls:
        vacancy_urls = set(list(vacancy_urls) + get_vacancy_urls(search_url))
    with open(path, "w") as file:
        file.write("\n".join(list(vacancy_urls)))
    return vacancy_urls

In [49]:
def get_urls_and_path(city: str) -> Tuple[List[str], str]:
    search_url_beginings = {
        'smr': "https://samara.hh.ru/search/vacancy?text=&area=78",
        'spb': "https://spb.hh.ru/search/vacancy?text=&area=2",
        'kzn': "https://kazan.hh.ru/search/vacancy?text=&area=88",
        'vdk': "https://vladivostok.hh.ru/search/vacancy?text=&area=22"
    }
    
    by_salary_dec_ending = "&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&schedule=fullDay&schedule=flexible&order_by=salary_desc&search_period=&items_on_page=100&no_magic=true"
    by_salary_inc_ending = "&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&schedule=fullDay&schedule=flexible&order_by=salary_asc&search_period=&items_on_page=100&no_magic=true"
    by_match_ending = "&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&schedule=fullDay&schedule=flexible&order_by=relevance&search_period=&items_on_page=100&no_magic=true" 
    by_date_ending = "&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&schedule=fullDay&schedule=flexible&order_by=publication_time&search_period=&items_on_page=100&no_magic=true"

    search_url_endings = [by_salary_dec_ending, by_salary_inc_ending, by_match_ending, by_date_ending]
    
    return [search_url_beginings[city] + end for end in search_url_endings], f"../hh-data/urls/{city}-fulltime-urls.txt"



In [41]:
save_vacancy_urls(*get_urls_and_path('vdk'))
save_vacancy_urls(*get_urls_and_path('smr'))
save_vacancy_urls(*get_urls_and_path('spb'))

{'https://vladivostok.hh.ru/vacancy/30522866',
 'https://vladivostok.hh.ru/vacancy/29805081',
 'https://vladivostok.hh.ru/vacancy/29483509',
 'https://vladivostok.hh.ru/vacancy/30579158',
 'https://vladivostok.hh.ru/vacancy/27259039',
 'https://vladivostok.hh.ru/vacancy/30665077',
 'https://vladivostok.hh.ru/vacancy/30466408',
 'https://vladivostok.hh.ru/vacancy/30515587',
 'https://vladivostok.hh.ru/vacancy/30533062',
 'https://vladivostok.hh.ru/vacancy/30709147',
 'https://vladivostok.hh.ru/vacancy/30318201',
 'https://vladivostok.hh.ru/vacancy/30009793',
 'https://vladivostok.hh.ru/vacancy/30489255',
 'https://vladivostok.hh.ru/vacancy/30380102',
 'https://vladivostok.hh.ru/vacancy/30651617',
 'https://vladivostok.hh.ru/vacancy/30596384',
 'https://vladivostok.hh.ru/vacancy/30621359',
 'https://vladivostok.hh.ru/vacancy/30667029',
 'https://vladivostok.hh.ru/vacancy/28832308',
 'https://vladivostok.hh.ru/vacancy/30715095',
 'https://vladivostok.hh.ru/vacancy/30737055',
 'https://vla

In [50]:
save_vacancy_urls(*get_urls_and_path('kzn'))


{'https://kazan.hh.ru/vacancy/30421395',
 'https://kazan.hh.ru/vacancy/30730520',
 'https://kazan.hh.ru/vacancy/30547289',
 'https://kazan.hh.ru/vacancy/30717414',
 'https://kazan.hh.ru/vacancy/30592768',
 'https://kazan.hh.ru/vacancy/30579601',
 'https://kazan.hh.ru/vacancy/26678077',
 'https://kazan.hh.ru/vacancy/30432799',
 'https://kazan.hh.ru/vacancy/29508916',
 'https://kazan.hh.ru/vacancy/30487037',
 'https://kazan.hh.ru/vacancy/30413278',
 'https://kazan.hh.ru/vacancy/30647685',
 'https://kazan.hh.ru/vacancy/27462032',
 'https://kazan.hh.ru/vacancy/30421662',
 'https://kazan.hh.ru/vacancy/30422896',
 'https://kazan.hh.ru/vacancy/30366725',
 'https://kazan.hh.ru/vacancy/30552551',
 'https://kazan.hh.ru/vacancy/30359309',
 'https://kazan.hh.ru/vacancy/30633706',
 'https://kazan.hh.ru/vacancy/30333340',
 'https://kazan.hh.ru/vacancy/30417502',
 'https://kazan.hh.ru/vacancy/30344174',
 'https://kazan.hh.ru/vacancy/29445942',
 'https://kazan.hh.ru/vacancy/30497579',
 'https://kazan.

{'https://vladivostok.hh.ru/vacancy/30522866',
 'https://vladivostok.hh.ru/vacancy/29805081',
 'https://vladivostok.hh.ru/vacancy/29483509',
 'https://vladivostok.hh.ru/vacancy/30579158',
 'https://vladivostok.hh.ru/vacancy/27259039',
 'https://vladivostok.hh.ru/vacancy/30665077',
 'https://vladivostok.hh.ru/vacancy/30466408',
 'https://vladivostok.hh.ru/vacancy/30515587',
 'https://vladivostok.hh.ru/vacancy/30533062',
 'https://vladivostok.hh.ru/vacancy/30709147',
 'https://vladivostok.hh.ru/vacancy/30318201',
 'https://vladivostok.hh.ru/vacancy/30009793',
 'https://vladivostok.hh.ru/vacancy/30489255',
 'https://vladivostok.hh.ru/vacancy/30380102',
 'https://vladivostok.hh.ru/vacancy/30651617',
 'https://vladivostok.hh.ru/vacancy/30596384',
 'https://vladivostok.hh.ru/vacancy/30621359',
 'https://vladivostok.hh.ru/vacancy/30667029',
 'https://vladivostok.hh.ru/vacancy/28832308',
 'https://vladivostok.hh.ru/vacancy/30715095',
 'https://vladivostok.hh.ru/vacancy/30737055',
 'https://vla

In [None]:
smr_begining = 
vdk_begining = 
spb_begining = "
kzn_begining = 

In [None]:
https://samara.hh.ru/search/vacancy?text=&area=78&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&schedule=fullDay&schedule=flexible&order_by=publication_time&search_period=&items_on_page=100&no_magic=true

In [92]:
search_in_smr_url = "https://samara.hh.ru/search/vacancy?text=&area=78&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&employment=part&employment=project&schedule=fullDay&schedule=shift&schedule=flexible&schedule=flyInFlyOut&order_by=relevance&search_period=&items_on_page=100&no_magic=true"
search_in_spb_url = "https://spb.hh.ru/search/vacancy?text=&area=2&salary=&currency_code=RUR&only_with_salary=true&experience=doesNotMatter&employment=full&employment=part&employment=project&schedule=fullDay&schedule=shift&schedule=flexible&schedule=flyInFlyOut&order_by=salary_desc&search_period=&items_on_page=100&no_magic=true"
search_in_kzn_url = "https://kazan.hh.ru/search/vacancy?text=&area=88&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&employment=part&employment=project&schedule=fullDay&schedule=shift&schedule=flexible&schedule=flyInFlyOut&order_by=relevance&search_period=&items_on_page=100&no_magic=true"
search_in_vdk_url = "https://vladivostok.hh.ru/search/vacancy?text=&area=22&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&employment=part&employment=project&schedule=fullDay&schedule=shift&schedule=flexible&schedule=flyInFlyOut&order_by=relevance&search_period=&items_on_page=100&no_magic=true"
search_spb_url_2 = "https://spb.hh.ru/search/vacancy?text=&area=2&salary=&currency_code=RUR&experience=doesNotMatter&employment=full&employment=part&employment=project&schedule=fullDay&order_by=salary_asc&search_period=&items_on_page=100&no_magic=true"

smr_urls = get_vacancy_urls(search_in_smr_url)
vdk_urls = get_vacancy_urls(search_in_vdk_url)
kzn_urls = get_vacancy_urls(search_in_kzn_url)
spb_urls = get_vacancy_urls(search_in_spb_url)
urls2 = get_vacancy_urls(search_spb_url_2)
urls2 = [url for url in urls2 if url not in spb_urls]

with open("../hh-data/urls/smr-urls.txt", "w") as file:
    file.write("\n".join(smr_urls))
with open("../hh-data/urls/vdk-urls.txt", "w") as file:
    file.write("\n".join(vdk_urls))
with open("../hh-data/urls/spb-urls.txt", "w") as file:
    file.write("\n".join(spb_urls))
with open("../hh-data/urls/spb-urls-2.txt", "w") as file:
    file.write("\n".join(urls2))
with open("..hh-data/urls/kzn-urls.txt", "w") as file:
    file.write("\n".join(kzn_urls))

In [15]:
set(list(set([2])) + list(set([1, 2])))

{1, 2}