#### requirements
```
fasttext==0.9.2
gensim==3.6.0
nltk==3.4
pandas==0.23.4
tqdm==4.47.0
```

In [None]:
import gzip
import json

def read_vacancies_part(part_num):
    """Load one gzipped JSON shard of vacancies.

    The shard is read from ``vacancies-NN.json.gz`` (relative to the current
    working directory), where ``NN`` is ``part_num`` zero-padded to two digits.

    Returns the decoded JSON document.
    """
    archive_name = f'vacancies-{part_num:02}.json.gz'
    with gzip.open(archive_name) as archive:
        raw_bytes = archive.read()
    # json.loads accepts bytes directly and detects the UTF encoding itself.
    return json.loads(raw_bytes)

In [None]:
from nltk.corpus import stopwords
from gensim.parsing import preprocessing

# Russian stop words from the NLTK corpus, frozen into a set so that the
# per-token membership test in extract_text is a hash lookup.
stop_words = frozenset(stopwords.words('russian'))


def generate_features(data):
    """Yield the text fragments that describe one vacancy record.

    ``data`` is indexed by the keys ``employer``, ``area_id``,
    ``compensation_from``, ``compensation_to``, ``work_schedule``,
    ``currency``, ``name``, ``key_skills`` and ``description``.

    Fragments are produced lazily, in this order: the first 7 characters of
    the employer, an ``ar<area_id>`` token, salary bucket tokens (only for the
    bounds that are truthy), the work schedule (or ``nosch``), the currency
    (or ``rur``), the vacancy name repeated five times, every key skill and
    finally the description.
    """
    employer_prefix = data['employer'][:7]
    yield employer_prefix

    yield f"ar{data['area_id']}"

    # Salary bounds are bucketed in steps of 10 000; a falsy bound (None or 0)
    # produces no token at all.
    bucket_size = 10_000
    salary_from = data['compensation_from']
    salary_to = data['compensation_to']
    if salary_from:
        yield f'lo{salary_from // bucket_size}'
    if salary_to:
        yield f'hi{salary_to // bucket_size}'
        if salary_from:
            # The spread token is emitted only when both bounds are present.
            yield f'spr{(salary_to - salary_from) // bucket_size}'

    schedule = data['work_schedule']
    yield schedule if schedule else 'nosch'

    currency = data['currency']
    yield currency if currency else 'rur'

    # The name is repeated so that it carries more weight than other fields.
    title = data['name']
    yield ' '.join((title,) * 5)

    for skill in data['key_skills']:
        yield skill

    yield data['description']
        

def extract_text(vacancy_data):
    """Turn a vacancy record into one cleaned, space-separated string.

    The fragments from ``generate_features`` are joined with spaces, newlines
    are replaced by spaces, and the result is lower-cased, stripped of tags
    and punctuation, whitespace-normalised and tokenised by gensim. Tokens
    found in ``stop_words`` are dropped before the tokens are re-joined.
    """
    raw_text = ' '.join(generate_features(vacancy_data))
    single_line = raw_text.replace('\n', ' ')

    # Applied by gensim in this order before it splits the text into tokens.
    cleaning_steps = (
        str.lower,
        preprocessing.strip_tags,
        preprocessing.strip_punctuation,
        preprocessing.strip_multiple_whitespaces,
    )
    tokens = preprocessing.preprocess_string(single_line, filters=cleaning_steps)

    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
import pandas as pd

# Map every training vacancy id to its list of integer specialization ids.
# Each 'specializations' cell is handled as text: its first and last
# characters are dropped and the remainder is split on commas.
train_specializations = {
    vacancy_id: [int(spec) for spec in specs[1:-1].split(',')]
    for vacancy_id, specs in (
        pd.read_csv('train_labels.csv.gz', compression='gzip')
        .set_index('vacancy_id')['specializations']
        # Series.items() replaces iteritems(), which was removed in pandas 2.0;
        # it is available in the pinned 0.23.4 as well.
        .items()
    )
}

# Flat array of the vacancy ids to predict, in file order.
test_ids = pd.read_csv('test_vacancy_ids.csv.gz', compression='gzip').values.ravel()

In [None]:
%%time
from tqdm import tqdm

# Write the fastText training file: one line per labelled vacancy, made of its
# `__label__<id>` tokens followed by the cleaned vacancy text.
# The encoding is pinned to UTF-8 so the file does not depend on the
# platform's default locale encoding.
with open('train.txt', 'w', encoding='utf-8') as fp, tqdm(total=len(train_specializations)) as pbar:
    for part_num in range(1, 11):
        part = read_vacancies_part(part_num)

        for key, data in part.items():
            vacancy_id = int(key)
            labels = train_specializations.get(vacancy_id)
            if labels is None:
                # Not a labelled training vacancy.
                continue

            text = extract_text(data)

            # print() separates the label tokens and the text with single
            # spaces and terminates the record with a newline.
            print(
                *(f'__label__{label}' for label in labels),
                text,
                file=fp,
            )

            pbar.update(1)

In [None]:
%%time
import fasttext

# Hyper-parameters for the supervised fastText classifier trained on train.txt.
fasttext_params = {
    'input': 'train.txt',
    'lr': 0.65,
    'epoch': 15,
    'wordNgrams': 4,
    'dim': 150,
    'loss': 'ova',
}
model = fasttext.train_supervised(**fasttext_params)

In [None]:
# Predicted specialization ids per test vacancy id.
predicted = {}

# `in` on the test_ids array scans it linearly for every vacancy of every
# shard; a set built once gives a hash lookup with the same result.
test_id_set = set(test_ids)

with tqdm(total=len(test_ids)) as pbar:
    for part_num in range(1, 11):
        part = read_vacancies_part(part_num)
        for key, data in part.items():
            vacancy_id = int(key)
            if vacancy_id not in test_id_set:
                continue

            text = extract_text(data)
            # First pass: up to 6 labels, filtered by the 0.13 threshold.
            labels, _ = model.predict(text, k=6, threshold=0.13)
            if not labels:
                # Nothing passed the threshold: fall back to the top 3 labels
                # without a threshold so every vacancy gets a prediction.
                labels, _ = model.predict(text, k=3)

            # Strip the `__label__` prefix to recover the integer ids.
            predicted[vacancy_id] = [int(label.split('__label__')[1]) for label in labels]
            pbar.update(1)

In [None]:
# One row per test vacancy, in the order of test_ids, paired with its
# predicted specialization list.
submission_rows = [(test_id, predicted[test_id]) for test_id in test_ids]
submission = pd.DataFrame(submission_rows, columns=['vacancy_id', 'specializations'])

# Written gzip-compressed and without the DataFrame index column.
submission.to_csv('submission.csv.gz', compression='gzip', index=False)