# Analysing vacancies description from HH.RU

In [1]:
import http.client
import datetime
import json
import re
import string
from operator import itemgetter
from collections import OrderedDict
import itertools

## Supporting Functions

In [2]:
#Providing headers as recomennded in API Documentation
headers = {"User-Agent": "HH-User-Agent"}

#function to get all the related vacancy ids
def get_vacancy_ids(keyword):
    vacancy_ids = []
    conn = http.client.HTTPSConnection("api.hh.ru")
    per_page = 100 #100 is a maximum allowed by API
    page = 0
    count = per_page
    date_from = (datetime.datetime.now() - datetime.timedelta(days=29)).strftime('%Y-%m-%dT%H:%M:%S')
    date_to = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    area_id = 113 #Russia
    
    while count == per_page:
        path = ("/vacancies?text={}&area={}&per_page={}&date_from={}&date_to={}&page={}"
                .format(keyword,area_id,per_page, date_from, date_to, page))
        
        conn.request("GET", path, headers=headers)
        resp = conn.getresponse()
        if resp.status != 200:
        # something went wrong
            raise ValueError('API error happened.')
        vacancies = resp.read()
        conn.close()

        count = len(json.loads(vacancies)['items'])
        page = page+1
        for item in json.loads(vacancies)['items']:
            vacancy_ids.append(item['id'])
    return vacancy_ids


#function to retrieve vacancy description by vacancy id and save it to a txt file.
def get_vacancies(vacancy_ids, ):
    for vac_id in vacancy_ids:
        conn = http.client.HTTPSConnection("api.hh.ru")
        conn.request("GET", "/vacancies/{}".format(vac_id), headers=headers)
        resp = conn.getresponse()
        if resp.status != 200:
        # something went wrong
            raise ValueError('API error happened.')
        vacancy_txt = resp.read()
        conn.close()
        vacancy = json.loads(vacancy_txt)
        #cleaning description out of html tags and other irrelevant charachters
        clean_desc = ''
        desc = re.sub('<[^>]*>', '', vacancy['description'].lower())
        desc = re.sub('ur[^a-zа-я]+', ' ', desc, re.UNICODE)        
        words = desc.split()
        for word in words:
                    if len(word.strip()) > 2:
                        clean_desc = desc + " " + word
        
        with open('corpus.txt', 'a') as f:
            f.write(" " + clean_desc)
            f.close
    print('file corpus.txt with vacancies descriptions is created in the working directory')

Getting vacancies descriptions.

In [3]:
#getting vacancy ids for Data Scientist keyword search
ids = get_vacancy_ids("Data+Scientist")

In [4]:
#getting vacancies descr file... Warning!!! takes time...
get_vacancies(ids)

file corpus.txt with vacancies descriptions is created in the working directory


In [5]:
document_text = open('corpus.txt', 'r')
text_string = document_text.read()
wordlist = re.findall(r'\b[a-zа-я]{3,15}\b', text_string)

frequency_dict = {}

for word in wordlist:
    count = frequency_dict.get(word,0)
    frequency_dict[word] = count + 1
     
frequency_list = frequency_dict.keys()

In [6]:
# dictionary sorted by value in descending order
sorted_frequency_dict = OrderedDict(sorted(frequency_dict.items(), key=lambda t: t[1],reverse=True))

In [7]:
# slicing 50 most frequent words
first_50 = itertools.islice(sorted_frequency_dict.items(), 0, 49)
for key, value in first_50:
    print (key, value)

and 704
данных 567
для 561
data 545
опыт 542
работы 518
the 395
знание 278
python 255
обучения 228
with 223
моделей 212
машинного 191
анализа 176
for 165
компании 164
или 161
задач 155
learning 152
области 147
задачи 145
алгоритмов 136
sql 133
бизнес 128
experience 127
что 125
анализ 122
machine 121
science 117
требования 116
spark 116
решения 115
условия 114
разработка 113
участие 112
умение 103
построение 103
возможность 102
работа 102
big 102
как 101
you 97
team 96
будет 94
нас 93
образование 88
есть 88
are 87
business 86


### TODO next: Remove English and Russian stop words.