# Data scraping from hh.ru and preparation

Main goals of this file are:  
- to get hands-on experience on scraping raw data from the web  
- to get hands-on experience on basic processing / organizing datasets

Secondary goals:  
- to understand how one can get data about vacancies from the biggest Russian job site - hh.ru  
- to make a snapshot of DS vacancies, process it and save in .csv for further analysis

In [2]:
#import requests
#import urllib.request
from datetime import datetime as dt
#import re
from bs4 import BeautifulSoup
import json
import numpy as np
import pandas as pd

from time import sleep

from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

import sys

In [3]:
import logging

# Notebook-wide logger; DEBUG level so that the page-by-page progress messages are shown
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Attach the stdout handler only once: logging.getLogger returns the same object on every call,
# so re-running this cell would otherwise add one more handler and duplicate every log line
if not logger.handlers:
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(logging.Formatter(fmt='[%(asctime)s: %(funcName)s: %(levelname)s] %(message)s'))
    logger.addHandler(handler)

## Preparations

In [51]:
# Common prefix of the output file names; the capture date and the extension are appended when saving
output_filename_base = 'vacancies-data-scientist-'

URLs to search vacancies by words "аналитик данных" / "data scien*" in hh.ru look like:  
https://hh.ru/search/vacancy?  
    text=data+scien*&                   # will be format parameter  
    search_field=name&search_field=description&  
    area=1&                             # will be format parameter, if is missing => search in all areas, can contain several values (area=1&area=2&...)  
    salary=150000&currency_code=RUR&    # will be format parameter, if set to '' (salary=&...) => search for all salary amounts  
    only_with_salary=true&              # leave only vacancies with salary  
    label=not_from_agency&              # leave direct employers only  
    experience=doesNotMatter&  
    professional_role=96&               # if filter by specialization needed (96 = programmer, developer)
    order_by=relevance&  
    search_period=0&  
    items_on_page=100&  
    no_magic=true&  
    L_save_area=true  
    page=1&                         # these 2 parameters are used when navigating to different pages  
    hhtmFrom=vacancy_search_list

In [40]:
"""
Artefacts from v1

search_url_template = "https://hh.ru/search/vacancy?text={search_text}{search_fields}{areas}&salary={salary}&currency_code=RUR&experience=doesNotMatter{other}&order_by=relevance&search_period=0&items_on_page=100"
salary_level = '' #'150000'
search_texts = [
    "data+scien*",
    "%D0%B0%D0%BD%D0%B0%D0%BB%D0%B8%D1%82%D0%B8%D0%BA+%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85",
    "data analy*",
    "{}",
    "разработчик"
]
search_fields = {
    'vac_name': '&search_field=name',
    'vac_descr': '&search_field=description'
}
areas = {
    '#later': '{areas}', # left to configure later
    'All': '',
    'Moscow': '&area=1',
    'SPb': '&area=2',
    'Ekaterinburg': '&area=3',
    'Novosib': '&area=4',
    'Austria': '&area=7',
    'Erevan': '&area=13',
    'NNovgorod': '&area=66',
    'RostovND': '&area=76',
    'Samara': '&area=78',
    'Saratov': '&area=79',
    'Kazan': '&area=88',
    'Chelyabinsk': '&area=104',
    '???': '&area=159',
    'Almaty': '&area=160',
    'Minsk': '&area=1002',
    'Nur-Sultan': '&area=159',
    'Tbilisi': '&area=2758',
    'Tashkent': '&area=2759',
}
url_tail = '&page={}&hhtmFrom=vacancy_search_list'
"""

In [38]:
"""
Artefacts from v1
def combine_base_url(template=None, search_text='', search_field_keys=['vac_name'], area_keys=[], salary='', only_with_salary=False, other=''): #, items_per_page=100
    if template is None:
        return None
    search_fields_str = ''
    for key in search_field_keys:
        search_fields_str += search_fields[key]
    areas_str = ''
    for key in area_keys:
        areas_str += areas[key]
    if salary == '' or salary is None:
        salary_str = ''
    else:
        salary_str = str(salary)
    if only_with_salary:
        salary_str +=  '&only_with_salary=true'
    return template.format(search_text=search_text, search_fields=search_fields_str, areas=areas_str, salary=salary_str, other=other) #, items_per_page
"""

In [41]:
"""
base_url = combine_base_url(
    template = search_url_template,
    search_text=search_texts[4],
    area_keys=['Moscow'],
    salary=salary_level, only_with_salary=True,
    other='&label=not_from_agency&professional_role=96' # only for programmers vacancies
)
base_url
"""

'https://hh.ru/search/vacancy?text=разработчик&search_field=name&area=1&salary=&only_with_salary=true&currency_code=RUR&experience=doesNotMatter&label=not_from_agency&professional_role=96&order_by=relevance&search_period=0&items_on_page=100'

Simple class to construct the search URL

In [24]:
class HhRuUrlConstructor():
    """Builds hh.ru vacancy-search URLs.

    The constructor stores default search settings; every ``get_url`` argument
    can override the matching default for a single call.
    """

    # query-string fragment for each supported search scope
    map_search_fields = {
        'vac_name': '&search_field=name',
        'vac_descr': '&search_field=description'
    }

    # query-string fragment for each known area name
    map_areas = {
        None: '{areas}', # left to configure later
        'All': '',
        'Moscow': '&area=1',
        'SPb': '&area=2',
        'Ekaterinburg': '&area=3',
        'Novosib': '&area=4',
        'Austria': '&area=7',
        'Erevan': '&area=13',
        'NNovgorod': '&area=66',
        'RostovND': '&area=76',
        'Samara': '&area=78',
        'Saratov': '&area=79',
        'Kazan': '&area=88',
        'Chelyabinsk': '&area=104',
        '???': '&area=159',
        'Almaty': '&area=160',
        'Minsk': '&area=1002',
        'Nur-Sultan': '&area=159',
        'Tbilisi': '&area=2758',
        'Tashkent': '&area=2759',
    }

    base_part = 'https://hh.ru/search/vacancy?'
    url_tail = '&experience=doesNotMatter{other}&order_by=relevance&search_period=0&items_on_page=100'
    search_paging = '&page={}&hhtmFrom=vacancy_search_list'


    def __init__(self,
                    vacancy_name=None,
                    search_fields=['vac_name', 'vac_descr'],
                    areas=['Moscow'],
                    salary_level=None, # -1 or None = all available, but -1 is easier to check and overwrite later
                    currency='RUR',
                    only_with_salary=False,
                    other=''
                ) -> None:
        """Store the default search settings.

        Args:
            vacancy_name: search text, inserted into the URL as given (no URL-encoding is applied).
            search_fields: one key of ``map_search_fields`` or a list/tuple of them.
            areas: one key of ``map_areas`` or a list/tuple of them.
            salary_level: minimal salary; ``None`` or ``-1`` means no salary filter.
            currency: currency code for the salary filter; ``None`` omits the parameter.
            only_with_salary: default value of the ``only_with_salary`` URL flag.
            other: extra raw query-string fragment (expected to start with ``&``).
        """
        self.vacancy_name = vacancy_name
        # sequences are copied so that the shared default lists can never be mutated through an instance
        self.search_fields = list(search_fields) if isinstance(search_fields, (list, tuple)) else search_fields
        self.areas = list(areas) if isinstance(areas, (list, tuple)) else areas
        self.salary_level = salary_level
        self.currency = currency
        self.only_with_salary = only_with_salary
        self.other = other

    @staticmethod
    def _join_fragments(mapping, keys):
        """Concatenate the fragments mapped to ``keys`` (a single key or a list/tuple); unknown keys add nothing."""
        if isinstance(keys, (list, tuple)):
            return ''.join(mapping.get(key, '') for key in keys)
        return mapping.get(keys, '')

    def _get_url_vacancy(self, vacancy_name):
        """Return the ``text=...`` part; ``None`` falls back to the instance default."""
        if vacancy_name is None:
            vacancy_name = self.vacancy_name
        if vacancy_name is None:
            return ''
        # use the resolved value, not self.vacancy_name, so that an explicit argument is honoured
        return 'text=' + vacancy_name


    def _get_url_search_fields(self, sfields):
        """Return the ``&search_field=...`` parts; ``None`` falls back to the instance default."""
        if sfields is None:
            sfields = self.search_fields
        if sfields is None:
            return ''
        return self._join_fragments(self.map_search_fields, sfields)


    def _get_url_areas(self, areas):
        """Return the ``&area=...`` parts; ``None`` falls back to the instance default."""
        if areas is None:
            areas = self.areas
        if areas is None:
            return ''
        return self._join_fragments(self.map_areas, areas)


    def _get_url_salary(self, salary, currency, only_with_salary): # salary == -1 -> get class default value (received when instantiated)
        """Return the salary-related parts of the URL.

        ``-1`` for ``salary`` / ``currency`` and ``None`` for ``only_with_salary``
        mean "use the value given to the constructor".
        """
        parts = []
        if salary == -1: # use default from class when instantiated
            salary = self.salary_level
        # after the fallback both None and -1 mean "no salary filter"
        is_salary_given = (salary is not None) and (salary != -1)
        if is_salary_given:
            parts.append('&salary={}'.format(salary))
        if currency == -1: # use default from class when instantiated
            currency = self.currency
        if (currency is not None) and (currency != -1):
            parts.append('&currency_code=' + currency)

        if is_salary_given: # if salary was given we can't ignore it even if [self.]only_with_salary set to False
            parts.append('&only_with_salary=true')
        else:
            if only_with_salary is None: # use default from class when instantiated
                only_with_salary = self.only_with_salary
            if only_with_salary is not None:
                parts.append('&only_with_salary=' + str(only_with_salary).lower())

        return ''.join(parts)


    def get_url(self, sfields=None, areas=None, salary=-1, currency=-1, only_with_salary=None, other='', page=None):
        """Build the search URL.

        Args:
            sfields, areas, salary, currency, only_with_salary: per-call overrides of the constructor defaults.
            other: extra raw query-string fragment; when empty, the value given to the constructor is used.
            page: ``None`` or ``0`` -> URL without paging; ``'param'`` -> URL ending with the
                unformatted paging template (``&page={}``...); any other value is inserted as the page number.

        Returns:
            The URL as a string.
        """
        if not other:
            # fall back to the fragment stored by the constructor
            other = self.other or ''
        result = ''.join([
            self.base_part,
            self._get_url_vacancy(vacancy_name=None),
            self._get_url_search_fields(sfields=sfields),
            self._get_url_areas(areas=areas),
            self._get_url_salary(salary=salary, currency=currency, only_with_salary=only_with_salary),
            self.url_tail.format(other=other),
        ])
        if (page is None) or (page == 0):
            return result
        if page == 'param':
            return result + self.search_paging
        return result + self.search_paging.format(page)

Testing created class on different search queries:

In [40]:
search_query = HhRuUrlConstructor(vacancy_name="data+scien*", search_fields='vac_name')
# print the URL for the instance defaults and for a few per-call overrides
for overrides in (
    {},
    {'areas': 'SPb'}, # area=2 instead of 1
    {'salary': 200000},
    {'salary': 4000, 'currency': 'USD'},
):
    print(search_query.get_url(**overrides))

https://hh.ru/search/vacancy?text=data+scien*&search_field=name&area=1&currency_code=RUR&only_with_salary=false&experience=doesNotMatter&order_by=relevance&search_period=0&items_on_page=100
https://hh.ru/search/vacancy?text=data+scien*&search_field=name&area=2&currency_code=RUR&only_with_salary=false&experience=doesNotMatter&order_by=relevance&search_period=0&items_on_page=100
https://hh.ru/search/vacancy?text=data+scien*&search_field=name&area=1&salary=200000&currency_code=RUR&only_with_salary=true&experience=doesNotMatter&order_by=relevance&search_period=0&items_on_page=100
https://hh.ru/search/vacancy?text=data+scien*&search_field=name&area=1&salary=4000&currency_code=USD&only_with_salary=true&experience=doesNotMatter&order_by=relevance&search_period=0&items_on_page=100


Making the query URL to receive the full pool of data (with or without salary mentioned):  
- vacancy name: data analyst in Russian (there is no equivalent for 'data scientist')
- search by: vacancy name only (do not search in vacancy description)
- search area: Moscow only (the most vacancies are there)

Preparing browser under selenium control to collect data

In [9]:
# Set to 'headless' to run Chrome without a window; any other value starts it in standard mode (handy for debugging)
chrome_mode = 'headed'
chrome_options = Options()
if chrome_mode == 'headless':
    for flag in ('--disable-extensions', '--disable-gpu', '--headless'):
        chrome_options.add_argument(flag)
# NOTE(review): machine-specific absolute driver path — adjust before running elsewhere
service = Service(executable_path="d:\\Applications\\WebDriver\\chromedriver-107-x32.exe")
browser = webdriver.Chrome(service=service, options=chrome_options)

In [41]:
browser.get(search_query.get_url())

The page source code analysis shows that all info about vacancies is located under 'template' key. This 1-row data looks like json-parseable multilevel structure. So let's feed it to json module and analyse more deeply.

In [42]:
soup = BeautifulSoup(browser.page_source, 'html.parser')
# Inspecting the page source showed a single <template> tag on the page.
# Its text is one large dictionary-like JSON document (possibly consumed by the site's JS)
# that includes the vacancies list, so the first match is parsed directly.
json_parsed = json.loads(soup.find_all('template')[0].text)
print('"Template" tag contains {} keys'.format(len(json_parsed)))

"Template" tag contains 525 keys


In [43]:
json_parsed

{'authPhone': None,
 'authNewEmployerAreaIdsToRedirect': [],
 'authNewEmployerCategories': [],
 'authNewEmployerFields': [],
 'authNewEmployerInitialValues': {},
 'authNewEmployerPhoneMask': None,
 'activeResumeAccessType': None,
 'accountTemporarilyLocked': {},
 'accountPhoneVerification': None,
 'applicantSignup': {'fields': [{'name': 'login',
    'conditions': {'minCount': 1,
     'maxCount': 1,
     'parts': [{'minLength': 0, 'maxLength': 127}]},
    'simpleValues': None,
    'customValues': None,
    'error': None,
    'empty': True},
   {'name': 'password',
    'conditions': {'minCount': 1,
     'maxCount': 1,
     'parts': [{'minLength': 6, 'maxLength': 32}]},
    'simpleValues': None,
    'customValues': None,
    'error': None,
    'empty': True},
   {'name': 'firstName',
    'conditions': {'minCount': 1,
     'maxCount': 1,
     'parts': [{'minLength': 1, 'maxLength': 255}]},
    'simpleValues': None,
    'customValues': None,
    'error': None,
    'empty': True},
   {'name'

#### Analyzing data structure

We've got a huge amount of 'empty' data structures after translating page to json format. There are empty dictionaries and dictionaries containing 'empty' data structures. So we have to clean out these artifacts to make the visual analysis of data more efficient.

In [13]:
def is_dict_empty(input_dict):
    """Tell whether a dictionary holds nothing but 'empty' values.

    A value counts as empty when it is None, when its string form is '{}', '[]'
    or the empty string, or when it is a non-empty plain dict that is itself
    empty by the same rule (checked recursively).

    Returns True for an empty dictionary and False as soon as one non-empty
    value is met.
    """
    for value in input_dict.values():
        if type(value) is dict and value:
            # nested dictionary with content: decided by its own values
            if not is_dict_empty(value):
                return False
        elif value is not None and str(value) not in ('{}', '[]', ''):
            return False
    return True

In [44]:
# keep only the top-level keys whose value carries some data
clean_dict = {}
for key, value in json_parsed.items():
    if type(value) is dict:
        keep = not is_dict_empty(value)
    else:
        # scalars / lists: drop None, '{}', '[]' and empty strings
        keep = (value is not None) and (str(value) not in ('{}', '[]', ''))
    if keep:
        clean_dict[key] = value
print('{} non-empty keys in result'.format(len(clean_dict)))
print('======================= Example of data =============================')
# show the first few entries only
for i, (key, value) in enumerate(clean_dict.items()):
    print('{} ====> {}'.format(key, value))
    if i > 5:
        break

203 non-empty keys in result
applicantPackageType ====> basic
applicantPaymentSource ====> desktop
accountHistoryReplenishments ====> {'bills': [], 'documentLinksVisibility': False, 'currency': 'RUR'}
accountDelete ====> {'applicantName': '', 'resumesList': {'resumes': {'published': [], 'unpublished': []}, 'count': 0}}
adsSearchParams ====> {'puid11': 'searchVacancy', 'puid23': 'Москва', 'puid14': 'data scien*', 'puid29': '', 'puid30': '', 'puid12': '', 'puid13': ''}
anonymousUserType ====> applicant


Now the *clean_dict* variable contains only the keys that have non-empty data bound to them.  
Keys analysis shows **main search results** are under _'vacancySearchResult'->'vacancies'_ keys.  
Other useful keys are:  
_'searchClusters'_ contains grouping characteristics ('industry', 'groups')  
_'searchClustersDicts'_ contains options to split data further ('area', 'compensation', etc.)

In [15]:
# Total number of search results, read from two places as a cross-check
print(clean_dict['searchCounts'])
# the same total as reported inside the search result itself
print(clean_dict['vacancySearchResult']['totalResults'])

{'isLoad': False, 'value': 250}
250


In [17]:
# additional info about 'searchClustersDicts' key
clean_dict['searchClustersDicts'].keys()

dict_keys(['area', 'clusters'])

In case total search results number exceeds 2000 (seems to be hardcoded limit) it is possible to use 'searchClusters'->'area' to implement partial searches

In [18]:
clean_dict['searchClusters']['area']

{'groups': {'1': {'count': 250,
   'seoDomain': 'hh.ru',
   'order': 2,
   'title': 'Москва',
   'id': '1'},
  '232': {'count': 256,
   'seoDomain': 'hh.ru',
   'order': 1,
   'title': 'Центральный округ',
   'id': '232'},
  '2019': {'count': 6,
   'seoDomain': 'hh.ru',
   'order': 3,
   'title': 'Московская область',
   'id': '2019'}},
 'selectedValues': [1],
 'disableQueries': [],
 'order': 3}

The key _'count'_ contains number of vacancies found in area, _'id'_ - area id. We can parse _'area'->'groups'_ keys to search in regions separately. It gives a way to bypass the max vacancies limitation mentioned above.  
But for now the number of vacancies in Moscow exceeds the page capacity (250 vs 100)

In [45]:
areas_to_crawl = []
area_ids_excluded = ['113'] # grouping ids: 113 = Russia
print('Search result num of vacancies: ', json_parsed['vacancySearchResult']['totalResults'])
# the per-area split is prepared only when the total exceeds 2000 results
if json_parsed['vacancySearchResult']['totalResults'] > 2000:
    search_json_obj = json_parsed['searchClusters']['area']['groups']
    # keep every non-excluded area that reports at least one vacancy
    areas_to_crawl.extend(
        key for key, group in search_json_obj.items()
        if (key not in area_ids_excluded) and (group['count'] > 0)
    )
    print('Total areas with vacancies num: {}'.format(len(areas_to_crawl)))
    print("First 10 values of areas' ids: ", areas_to_crawl[:10])
print('Areas to crawl:', areas_to_crawl)

Search result num of vacancies:  191
Areas to crawl: []


Let's check vacancies number per page to ensure _'items_on_page'_ parameter works fine

In [46]:
# Vacancies listed on the page that was loaded above (the first page of the search results)
vacancies_info = clean_dict['vacancySearchResult']['vacancies']
print('Vacancies data type: {}'.format(type(vacancies_info)))
print('Num of vacancies: {}'.format(len(vacancies_info)))

Vacancies data type: <class 'list'>
Num of vacancies: 100


There are 100 records containing vacancies info from the 1st search page. So this parameter is OK. And now I want to find the total number of pages to get all of them.  
The _'paging'_ key seems a right place for this info

In [47]:
clean_dict['vacancySearchResult']['paging']

{'previous': {'page': -1, 'disabled': True},
 'pages': [{'text': '1', 'page': 0, 'selected': True, 'inShortRange': True},
  {'text': '2', 'page': 1, 'selected': False, 'inShortRange': True}],
 'next': {'page': 1, 'disabled': False},
 'os': 'Win'}

Here is the pagination data indeed. The last page number is contained in the _'lastPage'->'page'_ key (NOTE: in case there are 2 or 3 pages the key _'lastPage'_ is absent). The total search results are split into N+1 pages, as page numbers are 0-indexed.  
So if _'lastPage'_ is absent we should determine last page number from _'pages'_ list  
Now we have all the necessary info to get all vacancies into 1 place.  
> [!!!] In case of only 1 page the key _'paging'_ will contain _null_ value  


### Harvesting vacancies info from hh.ru

It's time to combine all pieces of information about the harvesting procedure to make it correct:  
(1) if total number of results exceeds 2000 I have to get info partially from different areas (which are in *areas_to_crawl* variable). In this case there will possibly be duplicates as I don't know display algorithm in details. If *areas_to_crawl* has length == 0 then there are less than 2000 vacancies found (if it's not enough we can split further by underground lines or stations in Moscow and St.Petersburg (parameter _metro=9&_ or _metro=9.37&_ in address line) -> need to implement parser additionally);  
(2) if _'lastPage'_ key is absent we can determine max number of pages by parsing _'pages'_ key  
(3) we have to loop through all the pages  
(4) we don't need to clean json results as we know all the necessary keys

In [22]:
def get_num_pages(json_dump, logger=logger):
    """Return the 0-based number of the last results page.

    Args:
        json_dump: parsed page JSON; must contain the 'vacancySearchResult' key.
        logger: accepted for signature symmetry with the parsing functions; not used here.

    Returns:
        ``None`` when 'paging' is missing or null, otherwise 'lastPage'->'page' when
        'lastPage' is present, otherwise the largest 'page' value among 'pages'.
    """
    paging = json_dump['vacancySearchResult'].get('paging')
    if paging is None:
        return None
    last_page = paging.get('lastPage')
    if last_page is not None:
        return last_page['page']
    # no 'lastPage' entry: derive the last page from the explicit page list
    return max(entry['page'] for entry in paging['pages'])

In [28]:
def _load_template_json(browser):
    """Return the JSON document stored in the first <template> tag of the page currently open in ``browser``."""
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    return json.loads(soup.find_all('template')[0].text)


def parse_pages(browser, search_url_obj, parse_by_underground=False, logger=logger):
    """Collect the vacancy records from every results page of a search.

    Args:
        browser: selenium driver used to open the pages.
        search_url_obj: URL builder providing ``get_url(page=...)`` and the
            ``search_paging`` template (as HhRuUrlConstructor does).
        parse_by_underground: when True and the search reports more than 2000
            results, delegate to ``parse_pages_by_underground``.
        logger: logger for progress messages.

    Returns:
        List of raw vacancy dicts taken from 'vacancySearchResult'->'vacancies'.
    """
    logger.debug('Flow control received')
    result = []
    browser.get(search_url_obj.get_url())
    max_available_records = 2000
    # read the totals from the page that has just been loaded, not from a notebook-level variable
    json_dump = _load_template_json(browser)
    total_results = json_dump['vacancySearchResult']['totalResults']
    if (total_results > max_available_records) and parse_by_underground:
        logger.debug('More than {} records detected => passing flow control to "parse_pages_by_undeground"'.format(max_available_records))
        # the browser already shows the first page; pass its URL and the paging template along
        result += parse_pages_by_underground(
            browser, search_url_obj.get_url(), search_url_obj.search_paging, logger=logger
        )
    else:
        if parse_by_underground:
            logger.debug('Less than {} records detected => beginning parsing pages'.format(max_available_records))
        else:
            logger.debug('Skipping parse_pages_by_underground => beginning parsing pages')
        result += json_dump['vacancySearchResult']['vacancies']
        num_pages = get_num_pages(json_dump)
        if num_pages is None:
            # no paging info => the search fits on a single page
            num_pages = 0
        logger.debug('Page 1 / {} parsed'.format(num_pages+1))
        # pages are 0-indexed and page 0 is already collected
        for page in range(1, num_pages+1):
            browser.get(search_url_obj.get_url(page=page))
            json_dump = _load_template_json(browser)
            result += json_dump['vacancySearchResult']['vacancies']
            logger.debug('Page {} / {} parsed'.format(page+1, num_pages+1))
    return result

In [20]:
# page address to filter search results by underground lines/stations
"""
https://hh.ru/search/vacancy?
                            area=1&
                            metro=9.37&
                            search_field=name&search_field=description&
                            text=%D0%B0%D0%BD%D0%B0%D0%BB%D0%B8%D1%82%D0%B8%D0%BA+%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85&
                            from=suggest_post&
                            clusters=true&
                            no_magic=true&
                            ored_clusters=true&
                            items_on_page=100&
                            enable_snippets=true
"""

'\nhttps://hh.ru/search/vacancy?\n                            area=1&\n                            metro=9.37&\n                            search_field=name&search_field=description&\n                            text=%D0%B0%D0%BD%D0%B0%D0%BB%D0%B8%D1%82%D0%B8%D0%BA+%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85&\n                            from=suggest_post&\n                            clusters=true&\n                            no_magic=true&\n                            ored_clusters=true&\n                            items_on_page=100&\n                            enable_snippets=true\n'

In [52]:
def parse_pages_by_underground(browser, start_page_url, tail_url, logger=logger):
    """Collect vacancy records separately for every underground line listed on the current page.

    At the moment the function is called the browser has already opened
    ``start_page_url``, so the page is not loaded again.

    Args:
        browser: selenium driver already showing ``start_page_url``.
        start_page_url: search URL without the 'page=' parameter.
        tail_url: paging template with one ``{}`` placeholder for the page number.
        logger: logger for progress messages.

    Returns:
        List of raw vacancy dicts gathered over all lines and pages
        (the same vacancy may appear more than once).
    """
    def _page_json():
        # JSON document stored in the first <template> tag of the page currently open
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        return json.loads(soup.find_all('template')[0].text)

    logger.debug('Flow control received')
    json_dump = _page_json()
    # get all underground lines offered by the search clusters
    underground_dump = json_dump['searchClustersBasic']['metro']['groups']
    underground_lines = {
        key: group['title']
        for key, group in underground_dump.items()
        if group['type'] == 'line'
    }
    new_url_template = start_page_url + '&metro={}'
    logger.debug('Got {} underground lines. Begin parsing'.format(len(underground_lines)))

    result = []
    for line, title in underground_lines.items():
        logger.debug('Underground line: {}'.format(title))
        url = new_url_template.format(line)
        browser.get(url)
        json_dump = _page_json()
        result += json_dump['vacancySearchResult']['vacancies']
        num_pages = get_num_pages(json_dump)
        if num_pages is None:
            # no paging info => this line's results fit on a single page
            num_pages = 0
        logger.debug('Page 1 / {} parsed'.format(num_pages+1))
        # pages are 0-indexed and page 0 is already collected
        for page in range(1, num_pages+1):
            browser.get(url+tail_url.format(page))
            json_dump = _page_json()
            result += json_dump['vacancySearchResult']['vacancies']
            logger.debug('Page {} / {} parsed'.format(page+1, num_pages+1))
    return result

In [48]:
# Harvest the vacancies; when no per-area split was prepared, a single entry ('1') is used
# so that the loop below runs exactly once
vacancies_info = []
if len(areas_to_crawl) == 0:
    areas_to_crawl.append('1')
if len(areas_to_crawl) > 0:
    for area in areas_to_crawl:
        # NOTE(review): `area` is not passed on, so every iteration requests the same search_query;
        # with several areas the same vacancies would be collected repeatedly — confirm intent
        vacancies_info += parse_pages(browser, search_query, parse_by_underground=False)

print('Final vacancies info contains {} record(s)'.format(len(vacancies_info)))

[2022-11-27 15:21:44,968: parse_pages: DEBUG] Flow control received
[2022-11-27 15:21:47,828: parse_pages: DEBUG] Skipping parse_pages_by_underground => beginning parsing pages
[2022-11-27 15:21:48,650: parse_pages: DEBUG] Page 1 / 2 parsed
[2022-11-27 15:21:51,901: parse_pages: DEBUG] Page 2 / 2 parsed
Final vacancies info contains 191 record(s)


In [49]:
len(vacancies_info)

191

We've got info from all pages and now can close browser

In [50]:
browser.quit()

Dumping data to have an opportunity to restore raw data later...

In [52]:
# Date-stamped base name (prefix + today's date as YYYY-MM-DD)
filename = output_filename_base + dt.now().date().strftime('%Y-%m-%d')
# NOTE(review): the 'datasets/' directory is assumed to exist already — confirm before a fresh run
with open('datasets/'+filename+'.json', 'w') as f:
    json.dump(vacancies_info, f)

And now it's time to select fields to fill in DataFrame

In [53]:
# list every field of the first harvested record together with its value
for key, value in vacancies_info[0].items():
    print('{} =======> {}'.format(key, value))



Useful info keys:
- 'vacancyId' - unique vacancy id which can be used to see description on https://hh.ru/vacancy/[vacancyId]
- 'name' - no comments )
- 'company'->'visibleName' + 'company'->'department'->'@name' - department info (if exists)
- 'area'->'@id', 'area'->'name' - id and name of the geographic search area
- 'address'->'displayName', 'address'->'marker'->('@lat', '@lng') - displayed address and coordinates for the map
- 'compensation'->{'from', 'to', 'currencyCode', 'gross'=(True="before taxes", False="clean amount")} (or 'compensation'->'noCompensation', if no data) - salary info
- 'workSchedule' - full / shift / remote...
- 'snippet' - vacancy's description pieces: dict('req' - requirements, 'resp' - responsibilities, 'cond' - conditions, 'skill' - ?not used?, 'desc' - ?)
- 'publicationTime'
- 'lastChangeTime'

Let's define a function to get the necessary data from json:

In [33]:
# Column names of the resulting DataFrame, in output order;
# get_record_data stores its values under these same keys
df_column_names = [
    'vacancy_id',
    'vacancy_name',
    'company_name',
    'company_dept',
    'area',
    'address',
    'latitude',
    'longitude',
    'salary_from',
    'salary_to',
    'salary_currency',
    'salary_gross',
    'publication_time',
    'last_changed',
    'schedule',
    'req',
    'resp',
    'cond',
    'skills'
]

In [54]:
def get_record_data(rec):
    """Flatten one raw vacancy record into a dict of DataFrame-ready fields.

    Args:
        rec: vacancy dict as found under 'vacancySearchResult'->'vacancies'.
            'vacancyId', 'name', 'company', 'area', 'compensation',
            'publicationTime', 'lastChangeTime', 'workSchedule' and 'snippet'
            are required; 'address' and the company department are optional.

    Returns:
        Dict with one entry per output column; data that is absent from the
        record is stored as NaN.

    Raises:
        KeyError: when one of the required keys is missing.
    """
    # np.nan is the supported spelling; the np.NAN alias was removed in NumPy 2.0
    missing = np.nan

    result = dict()
    result['vacancy_id'] = rec['vacancyId']
    result['vacancy_name'] = rec['name']
    result['company_name'] = rec['company']['visibleName']

    # the department may be absent or null
    department = rec['company'].get('department')
    if isinstance(department, dict):
        result['company_dept'] = department.get('@name', missing)
    else:
        result['company_dept'] = missing

    result['area'] = rec['area']['@id']

    # address and its map marker may each be absent or null; all three fields are always set
    address = rec.get('address') or {}
    marker = address.get('marker') or {}
    result['address'] = address.get('displayName', missing)
    result['latitude'] = marker.get('@lat', missing)
    result['longitude'] = marker.get('@lng', missing)

    # a present 'noCompensation' value marks a vacancy without salary data
    compensation = rec['compensation']
    has_salary = compensation.get('noCompensation', None) is None
    for column, source_key in (
        ('salary_from', 'from'),
        ('salary_to', 'to'),
        ('salary_currency', 'currencyCode'),
        ('salary_gross', 'gross'),
    ):
        result[column] = compensation.get(source_key, missing) if has_salary else missing

    result['publication_time'] = rec['publicationTime']['@timestamp']
    result['last_changed'] = rec['lastChangeTime']['@timestamp']
    result['schedule'] = rec['workSchedule']

    snippet = rec['snippet']
    result['req'] = snippet.get('req', missing)
    result['resp'] = snippet.get('resp', missing)
    result['cond'] = snippet.get('cond', missing)
    # NOTE(review): the notebook's field list names this snippet key 'skill'; 'skills' may never match — confirm
    result['skills'] = snippet.get('skills', missing)

    return result

In [55]:
# column name -> list of values, one entry per harvested vacancy
raw_parsed_data = {name: [] for name in df_column_names}
for rec in vacancies_info:
    parsed = get_record_data(rec)
    for key in df_column_names:
        # a field the record does not provide becomes NaN
        # (np.nan is used because the np.NAN alias was removed in NumPy 2.0)
        raw_parsed_data[key].append(parsed.get(key, np.nan))
print('Control of num of records created:', len(raw_parsed_data['vacancy_id']))
print('Vacancies ID sample: ', raw_parsed_data['vacancy_id'][:10])

Control of num of records created: 191
Vacancies ID sample:  [71426958, 71757739, 71908944, 72819441, 71523297, 72323871, 71492524, 71484836, 72787688, 70673604]


Converting data structures created to pandas DataFrame. Then making readable data in 'publication_time' and 'last_changed' columns. And finally assigning 'vacancy_id' as primary index (it's unique for each record). It will be useful later if I'll decide to update data so I'll be able to filter out existing data or update existing records

In [56]:
# Build the frame in one chain: both timestamp columns hold seconds since the epoch and are
# converted with a single vectorized call each, then 'vacancy_id' becomes the index.
df = (
    pd.DataFrame(raw_parsed_data)
    .assign(
        publication_time=lambda frame: pd.to_datetime(frame['publication_time'], unit='s'),
        last_changed=lambda frame: pd.to_datetime(frame['last_changed'], unit='s'),
    )
    .set_index('vacancy_id')
)
df.head()

Unnamed: 0_level_0,vacancy_name,company_name,company_dept,area,address,latitude,longitude,salary_from,salary_to,salary_currency,salary_gross,publication_time,last_changed,schedule,req,resp,cond,skills
vacancy_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
71426958,Data Scientist (команда Поиска),HeadHunter::Analytics/Data Science,HeadHunter::Analytics/Data Science,1,"Москва, улица Годовикова, 9с10",55.809343,37.628505,350000.0,,RUR,True,2022-11-22 10:01:59,2022-11-22 13:59:41,FULL_DAY,Опыт реализации и применения моделей машинного...,Развитие моделей поискового ранжирования и рек...,Возможность выбора места работы: удаленно или ...,
71757739,Аналитик/data science,Смартсофт,,1,"Москва, Нижний Сусальный переулок",55.761355,37.662659,,,,,2022-11-27 07:31:08,2022-11-27 08:19:38,FULL_DAY,Высшее техническое образование (IT или физмат)...,"Участие в задачах, связанных с анализом больши...","Работу в центре Москвы возле метро ""Курская"", ...",
71908944,Ведущий специалист отдела математического моде...,Eqvanta,,1,,,,,,,,2022-11-27 07:57:44,2022-11-27 08:20:14,REMOTE,Опыт работы в data science от 2х лет. Высшее м...,Работу над актуальными финтех-продуктами. Полн...,Продуктивную и вдохновляющую атмосферу. Понятн...,
72819441,Data Scientist / Аналитик Data Science,ПроКомплаенс,,1,"Москва, 1-й переулок Тружеников, 14с8",55.737938,37.571516,,,,,2022-11-26 15:14:05,2022-11-26 15:20:30,FULL_DAY,Опыт работы над прикладными задачами или решен...,Выявление и связывание ключевой информации в к...,"Оформление по ТК РФ (оплата отпуска, больничны...",
71523297,Data Scientist (Senior),"билайн: ИТ, Data, Digital","билайн: ИТ, Data, Digital",1,,,,,,,,2022-11-26 16:04:20,2022-11-26 16:19:13,REMOTE,"Мы рассчитываем, что у тебя есть опыт и знания...",Решать бизнес-задачи с помощью анализа данных ...,,


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191 entries, 71426958 to 71923183
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   vacancy_name      191 non-null    object        
 1   company_name      191 non-null    object        
 2   company_dept      75 non-null     object        
 3   area              191 non-null    int64         
 4   address           84 non-null     object        
 5   latitude          82 non-null     float64       
 6   longitude         82 non-null     float64       
 7   salary_from       18 non-null     float64       
 8   salary_to         17 non-null     float64       
 9   salary_currency   23 non-null     object        
 10  salary_gross      23 non-null     object        
 11  publication_time  191 non-null    datetime64[ns]
 12  last_changed      191 non-null    datetime64[ns]
 13  schedule          191 non-null    object        
 14  req           

In [58]:
df.to_csv('datasets/'+filename+'.csv')