In [305]:
import lightgbm as lgb, catboost as cb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

import os
import networkx as nx
from collections import Counter

from tqdm import tqdm
import pickle

pd.set_option('display.max_columns', None)



In [306]:
# from google.colab import drive
# drive.mount('/content/drive')
# root_dir = "/kaggle/input/nsu-jobs/"

In [307]:
!pip install pymorphy2
!pip install cytoolz



In [308]:
X_train = pd.read_csv("/kaggle/input/nsu-jobs/nsu-bda-2023-jobs/X_train.csv")
y_train = pd.read_csv("/kaggle/input/nsu-jobs/nsu-bda-2023-jobs/y_train.csv")
X_train=X_train.merge(y_train,how='left', on='id')

In [309]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27934 entries, 0 to 27933
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        27934 non-null  int64  
 1   name                      27934 non-null  object 
 2   has_test                  27934 non-null  bool   
 3   response_letter_required  27934 non-null  bool   
 4   salary_from               23902 non-null  float64
 5   salary_currency           27934 non-null  object 
 6   salary_gross              27786 non-null  object 
 7   published_at              27934 non-null  object 
 8   created_at                27934 non-null  object 
 9   employer_name             27934 non-null  object 
 10  description               27933 non-null  object 
 11  area_id                   27934 non-null  int64  
 12  area_name                 27934 non-null  object 
 13  salary_to                 27934 non-null  float64
dtypes: boo

In [310]:
X_train.description = X_train.description.fillna('')
#Все времена индентичны, чекал X_train[X_train.published_at!=X_train.created_at]
X_train.published_at = pd.to_datetime(X_train.published_at, errors='raise')
X_train.created_at = pd.to_datetime(X_train.created_at, errors='raise')
X_train['timestamp'] = (X_train.published_at.astype('int64') / 10**9)
X_train.salary_gross=X_train.salary_gross.astype('bool')

In [311]:
X_train.loc[X_train.salary_gross, 'salary_from'] = X_train.loc[X_train.salary_gross].salary_from*1.13

# Features from title

In [312]:
import nltk
nltk.download('stopwords')

def int_to_roman(x):
    """
    Normalizing titles like software engineer 3

    Also filters out numbers that are not likely part of a seniority description, i.e. 2000
    """

    if not x.isnumeric():
        return x
    x = int(x)
    ints = (1000, 900,  500, 400, 100,  90, 50,  40, 10,  9,   5,  4,   1)
    nums = ('M',  'CM', 'D', 'CD','C', 'XC','L','XL','X','IX','V','IV','I')
    result = []
    for i in range(len(ints)):
        count = int(x / ints[i])
        result.append(nums[i] * count)
        x -= ints[i] * count
    result = ''.join(result).lower()
    if any([n in result for n in ['M', 'C', 'D', 'X', 'L']]):
        return ""
    return ''.join(result).lower()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [313]:
from pymorphy2 import MorphAnalyzer
from nltk import wordpunct_tokenize
from cytoolz import memoize
import re
from nltk.corpus import stopwords
from string import punctuation

morph = MorphAnalyzer()

stops = set(stopwords.words('russian'))
stops.update(set(punctuation))
stopwords = ['ул', 'пр', 'кт']
stops.update(set(stopwords))


functors_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}  # function words

def preprocess_title(title:str):
    if pd.isna(title) or title == "":
        return []
    tokens = wordpunct_tokenize(title)
    tokens = [token.lower() for token in tokens]
    tokens = [re.sub(r"[^\w\s]", "", token) for token in tokens]
    # tokens = [re.sub(r"\d+", "", token) for token in tokens]
  ## lemmatization
    tokens = [morph.normal_forms(token)[0] for token in tokens]

  # удалить дубликаты
    tokens = [k for k, v in Counter(tokens).items() if v == 1]

  # удалить междометия, частицы, союзы и предлоги
    functors_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}  # function words
    tokens = [token for token in tokens if morph.parse(token)[0].tag.POS not in functors_pos]

  # удалить слова из 1 символа кроме
    good_words = {'c'}
    tokens = [token for token in tokens if len(token) >= 2 or token in good_words]

  #remove stop words
    tokens = [token for token in tokens if token not in stops]

    tokens = [int_to_roman(token) for token in tokens]
    tokens = list(filter(bool, tokens))
  # print(f"{title} -> {tokens}")
    return tokens

In [314]:
titles = ['директора 2', 'дата-аналитик', 'Менеджер по продажам (брокер) элитной городской недвижимости', 'Продавец-кассир (Москва, Вернадского, 93)']
for t in titles:
    print(preprocess_title(t))

['директор']
['дата', 'аналитик']
['менеджер', 'продажа', 'брокер', 'элитный', 'городской', 'недвижимость']
['продавец', 'кассир', 'москва', 'вернадский', 'xciii']


In [315]:
from tqdm import tqdm
names1 = []
names2 = []
names3 = []
for t in tqdm(X_train.name):
    title = preprocess_title(t)
    if(len(title)>=3):
        names1.append(title[0])
        names2.append(title[1])
        names3.append(title[2])
    elif(len(title)>=2):
        names1.append(title[0])
        names2.append(title[1])
        names3.append('')
    elif(len(title)>=1):
        names1.append(title[0])
        names2.append('')
        names3.append('')
    else:
        names1.append('')
        names2.append('')
        names3.append('')

100%|██████████| 27934/27934 [00:59<00:00, 472.22it/s]


In [316]:
X_train['start_name1'] = names1
X_train['start_name2'] = names2
X_train['start_name3'] = names3

In [317]:
title_counts = Counter(X_train['name'].apply(lambda x: " ".join(preprocess_title(x))).values)
del title_counts['']
title_counts.most_common(10)

[('менеджер продажа', 344),
 ('менеджер работа клиент', 203),
 ('уборщица уборщик', 197),
 ('юрист', 163),
 ('помощник юрист', 149),
 ('аналитик', 146),
 ('оператор call центр', 136),
 ('frontend разработчик', 124),
 ('копирайтер', 115),
 ('офис менеджер', 108)]

In [318]:
len(title_counts)

14968

In [319]:
from cytoolz import isdistinct, topk
from itertools import product
from operator import itemgetter

def get_gram_counts(tokens, best_grams):
    results = []
    seen = set()
    if not best_grams:
        gram_product = product(tokens, repeat=2)
    else:
        gram_product = product(tokens, best_grams)
    for title_grams in gram_product:
        if not isdistinct(title_grams):
            continue
        title = " ".join(title_grams)
        if title in seen:
            continue
        else:
            seen.add(title)
        count = title_counts.get(title, 0)
        results.append((title, count))
    return sorted(results, key=itemgetter(1), reverse=True)


def optimize_title(x:str, topn=3, title_counts=title_counts):
    tokens = preprocess_title(x)
    if not tokens:
        return x
    if len(tokens)==1:
        return tokens[0]

    starting_score = title_counts.get(x, 1)
    best_ngrams = [(token, title_counts.get(token, 0)) for token in tokens]
    gram_counter = 2
    while gram_counter <= len(tokens): # Continue chaining tokens to get the highest score
        gram_counts = get_gram_counts(tokens, [token for token, score in best_ngrams])
        best_ngrams.extend(gram_counts)
        best_ngrams = list(topk(topn, best_ngrams, key=itemgetter(1)))
        if not any([g in best_ngrams for g in gram_counts]):  # The most recent get_gram_counts did not 'make the cut'
            best_ngram_found = topk(1, best_ngrams, key=itemgetter(1))[0]
            best_ngram, best_ngram_score = best_ngram_found
            if best_ngram_score > starting_score:
                return best_ngram
            else:
                return x
        gram_counter += 1
    return best_ngrams[0][0]

In [320]:
optimum_titless = []
optimum_countss = []

for title in tqdm(X_train.name):
    optimum_title = optimize_title(title, title_counts=title_counts)
    optimum_count = title_counts[optimum_title]
    optimum_titless.append(optimum_title)
    optimum_countss.append(optimum_count)

100%|██████████| 27934/27934 [01:03<00:00, 437.23it/s]


In [321]:
X_train['optimum_title'] = optimum_titless
X_train[['name','start_name1','start_name2','start_name3','optimum_title']].sample(10)

Unnamed: 0,name,start_name1,start_name2,start_name3,optimum_title
12805,"Продавец-консультант (Санкт-Петербург, пр-кт Э...",продавец,консультант,санкт,продавец консультант
11492,Менеджер по продажам (ГПД),менеджер,продажа,гпд,менеджер продажа
20261,Middle+ React разработчик (удалённо),middle,react,разработчик,разработчик
14812,Повар,повар,,,повар
1166,Backend Python / Django разработчик (удаленно),backend,python,django,python разработчик
25707,Укладчик-упаковщик,укладчик,упаковщик,,укладчик упаковщик
19913,Уборщик производственных и служебных помещений,уборщик,производственный,служебный,уборщик помещение
24122,"Продавец-кассир (Москва, ул Дубнинская, 44А)",продавец,кассир,москва,продавец кассир
27396,Педагог-хореограф,педагог,хореограф,,педагог хореограф
26134,Помощник менеджера проекта,помощник,менеджер,проект,менеджер проект


In [322]:
# X_train[X_train.employer_name=='Астор']

# Features description

In [323]:
from pymorphy2 import MorphAnalyzer
from nltk import wordpunct_tokenize, word_tokenize
from cytoolz import memoize
import re
from nltk.corpus import stopwords
from string import punctuation
from collections import Counter

morph = MorphAnalyzer()

stops = set(stopwords.words('russian'))
stops.update(set(punctuation))
stopwords = ['ул', 'пр', 'кт']
stops.update(set(stopwords))

functors_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}  # function words

def is_rus(s: str):
    rus = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя1234567890')
    return all(map(lambda symbol: symbol in rus, s))

def tokenize(desc:str,is_lemmatization=False, mod=1):
    if pd.isna(desc) or desc == "":
        return []
    
    if mod == 1:
        tokens = wordpunct_tokenize(desc)
    else:
        tokens = word_tokenize(desc)
    
    tokens = [token.lower() for token in tokens]
    
    if is_lemmatization:
        tokens = [morph.normal_forms(token)[0] for token in tokens]
    
    return tokens
        
    
def desc_to_tokens(tokens:list, delete_non_unction_words=False, only_words=True):

  if only_words:
      tokens = [re.sub(r"[^\w\s]", "", token) for token in tokens]
  # tokens = [re.sub(r"\d+", "", token) for token in tokens]
  # lemmatization не делаю, т.к. высшее становится высокий
#   if is_lemmatization:
#       tokens = [morph.normal_forms(token)[0] for token in tokens]

  # удалить дубликаты
  # tokens = [k for k, v in Counter(tokens).items() if v == 1]

#   удалить междометия, частицы, союзы и предлоги
  if delete_non_unction_words:
      functors_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}  # function words
      tokens = [token for token in tokens if morph.parse(token)[0].tag.POS not in functors_pos]

  # удалить слова из 1 символа кроме
  # good_words = {'c', '.'}
  # tokens = [token for token in tokens if len(token) >= 2 or token in good_words]

  #remove stop words
  # tokens = [token for token in tokens if token not in stops]

  tokens = list(filter(bool, tokens))
  # print(f"{desc} -> {tokens}")
  return tokens

In [324]:
"""
Проверяет, есть ли токен X в delta токенов окресности списка токенов 
m - объект match
"""
def is_word_near_eng(x: str, spisok: list, position: int, delta=6):
    
    if position >= len(spisok):
        return False
    
    okresnost = (spisok[:position])[-delta:] + (spisok[position+1:])[:delta]
    
    return x in okresnost

"""
Возвращает строку из eng_levels = ['A', 'B', 'C'] или None
"""
def get_english(desc_tokens: list):
    
    eng_levels = ['A', 'B', 'C']
    
    eng_levels_matching_strong = {
        'с1': eng_levels[2],
        'с2': eng_levels[2],
        'c1': eng_levels[2],
        'c2': eng_levels[2],
        'b1': eng_levels[1],
        'b2': eng_levels[1],
        'a1': eng_levels[0],
        'a2': eng_levels[0],
        'intermediate': eng_levels[1],
        'advanced': eng_levels[2],
        'proficient': eng_levels[2],
        'elementary': eng_levels[0],
    }
    
    eng_levels_matching = {
        'разговорный': eng_levels[2],
        'свободный': eng_levels[2],
        'свободно': eng_levels[2],
        'письменный': eng_levels[1],
        'уверенный':eng_levels[1],
        'чтение': eng_levels[1], 
        'технический': eng_levels[1],
        'литература': eng_levels[1],
        'средний': eng_levels[1],
        'хороший': eng_levels[1],
        'написание': eng_levels[1],
        'знать': eng_levels[1],
        'уровень': eng_levels[1],
        'общение': eng_levels[2],
    }
    
    angliiskii = 'английский'
    
    for token in desc_tokens:
        for eng_level in eng_levels_matching_strong.keys():
            if eng_level in token:
                return eng_levels_matching_strong[eng_level]
    
    for i in range(len(desc_tokens)):
        if angliiskii in desc_tokens[i]:
            for eng_level in eng_levels_matching.keys():
                if is_word_near_eng(position=i, spisok=desc_tokens,x=eng_level):
                    return eng_levels_matching[eng_level]
     
    return None

In [325]:
eng_test = ['Английский или немецкий язык на разговорном уровне ',
'Английский за счет компании;',
'Английский язык (письменный профессиональный must have),',
'если знаете английский язык и управляли небольшой командой.',
'Уверенный английский для изучения научной литературы.',
'английский язык - чтение технической литературы',
'Желательно английский язык для чтения тех. Документации',
'Английский не ниже Intermediate.',
'английский (средний, профессиональная терминология);',
'Команда русскоговорящая, но хороший английский может Вам пригодиться',
'Английский язык (свободно).',
'Технический английский язык (чтение документации',
'Английский язык - на уровне, достаточном для',
'английский будет плюсом',
'ной язык общения — английский',
'Английский язык не ниже уровня Upper Intermediate.',
'Английский язык Intermediate и выше',
'Корпоративный английский язык',
'Английский от upper-intermediate',
'нужно знать английский язык, а также',
'Английский язык от <strong>Intermediate<',
'Английский разговорный',
'Английский язык технический',
'Английский язык (на уровне чтения технической документации)',
'Свободный английский язык',
'Английский язык - intermediate',
'Английский не -ниже уровня B2 (Upper-Intermediate);',
'Английский, письменный и устный, свободное общение<',
'Хороший технический и разговорный английский (Intermediate +)<',
'Английский язык на уровне чтения профессиональной литературы',
'Английский язык на уровне написания текстов.',
'Разговорный английский язык.',
'Английский язык - не ниже С1;']

# h = [desc_to_tokens(a, True, True, mod = 0) for a in eng_test]

tokens_eng=[tokenize(a,is_lemmatization=True,mod = 0) for a in eng_test]
h = [desc_to_tokens(a, True) for a in tokens_eng]
h1 = [get_english(t) for t in h]

for lvl, desc in zip(h1, eng_test):
    print (f'{desc}\n -> {lvl}')

Английский или немецкий язык на разговорном уровне 
 -> C
Английский за счет компании;
 -> None
Английский язык (письменный профессиональный must have),
 -> B
если знаете английский язык и управляли небольшой командой.
 -> B
Уверенный английский для изучения научной литературы.
 -> B
английский язык - чтение технической литературы
 -> B
Желательно английский язык для чтения тех. Документации
 -> B
Английский не ниже Intermediate.
 -> B
английский (средний, профессиональная терминология);
 -> B
Команда русскоговорящая, но хороший английский может Вам пригодиться
 -> B
Английский язык (свободно).
 -> C
Технический английский язык (чтение документации
 -> B
Английский язык - на уровне, достаточном для
 -> B
английский будет плюсом
 -> None
ной язык общения — английский
 -> C
Английский язык не ниже уровня Upper Intermediate.
 -> B
Английский язык Intermediate и выше
 -> B
Корпоративный английский язык
 -> None
Английский от upper-intermediate
 -> B
нужно знать английский язык, а также
 ->

In [326]:
"""
Проверяет, есть ли токен X в delta токенов окресности списка токенов 
m - объект match
"""
def is_word_near_exp(x: str, spisok: list, position: int, delta=6):
    
    if position >= len(spisok):
        return False
    
    okresnost = (spisok[:position])[-delta:] + (spisok[position+1:])[:delta]
    
    return x in okresnost

def get_number(t: str):
    regular_digit = r'\b\d-й\b'
    if t.replace(',','',1).isdigit() or t.replace('.','',1).isdigit():
        return float(t.replace(',','.',1))           
    if re.match(regular_digit, t):  # 1-й 3-й 2-й
        return int(''.join(filter(str.isdigit, t)))
    return None
    

def get_experience(desc_tokens: list, debug=False):
#     опыт + число + год/месяц (рядом)
#     опыт + год
#     опыт работа

    year_time = ['год', 'лет']
    mounth_time = ['месяц']
    
    need_exp = ['только', 'обязательный']
    no_exp = 'без'
    
    delta = 15
    
    default_exp = 1
    
    is_need_exp = 0
    
    for i in range(len(desc_tokens)):
        if 'опыт' in desc_tokens[i]: # нашел опыт
            if debug:
                print(f'check: {desc_tokens[i+1:i+1+delta]}')
        
            next_tokens = desc_tokens[i+1:i+1+delta]
            for j in range(len(next_tokens)):
                if debug:
                    print(f'check: {next_tokens[j]}')
                num = get_number(next_tokens[j]) # в следующих токенах есть число
                if debug:
                    print(f'num: {num}')
                if num:
                    if num >= 15: # не то
                        continue
                    try:
                        if debug:
                            print(f'desc_tokens[j+1]: {next_tokens[j+1]}')
                        # после числа должен быть год или месяц

                        if next_tokens[j+1] in year_time: 
                            # опыт + число + год/месяц (рядом)
                            return num
                        if next_tokens[j+1] in mounth_time:
                            return num/12
                    except Exception:
                        continue
                else:
                    if debug:
                        print(f'{next_tokens[j]} - не число')
                    
            # в следующих токенах нет числа, поищем слово год
            for value in desc_tokens[i+1:i+1+delta]:
                if 'год' in value:
                    return 1
        
            # вероятно, это не тот опыт.
            # в окресности опыт поищем 'без'
            if is_word_near_exp(no_exp, desc_tokens, i):
                # пока запомним
                is_need_exp = 0
            
            # в окресности опыт поищем 'только' или 'обязателен'
            for h in need_exp:
                if is_word_near_exp(h, desc_tokens, i, 20):
                    # пока запомним
                    is_need_exp = default_exp
            
            # опыт работа - считаем что нужен
            for value in desc_tokens[i+1:i+1+2]:
                if 'работа' in value:
                    is_need_exp = default_exp
            
    return is_need_exp
            

In [327]:
from tqdm import tqdm
exp_test = ['Опыт работы с Java от 3-х лет;',
'опыт работы не менее 2-х лет',
'аналогичный опыт работы от года;',
'опыт работы в продажах электро и другого сложного оборудования обязателен.',
'тебя есть опыт преподавания от 6 месяцев;',
'Возможно оформление без опыта работы',
'Опыт работы желателен.',
'опыт работы по аналогичному направлению деятельности не менее 3-х лет;',
'Опыт программирования в 1С 8 не менее 3-х лет;',
'Имеется опыт в IT recruiting (инхаус от 1 года или агентство от 6 месяцев),',
'Возможно без опыта, но большое желание его получить',
'На вакансию рассматриваются только кандидаты с опытом работы в сфере HR',
'Опыт работы <strong>в найме персонала</strong> от 2-х лет Опыт найма сотрудников, которые работают в компании год и более;<',
'Релевантный опыт работы как преимущество;</p> <p>- Приветствуется опыт в сетевых компаниях;',
'отделом технической поддержки (опыт от 1-ого года) ',
'Имеете опыт общения с конечным потребителем софта',
'опыт работы с детьми от 1 года;',
'с художественным академическим образованием и опытом более 5 лет',
'Рассматриваем только с опытом продажи',
'Опыт работы от года;',
'Управленческий опыт работы от 0,5 года (желательно)',
'опыт работы в PR от 2-х лет;',
'Опыт работы с sql, kotlin. Опыт работы на руководящей должность от 1 года',
'Опыт работы в данной сфере обязателен, будет плюсом опыт в химчистке v Опыт работы с sql, kotlin.',
'Имеет опыт работы в интернет-маркетинге от 2-х лет']

# ttttt = df.loc[0:1000, 'description']

# ttttt - токенизировано mod=0

tokens_exp=[tokenize(a,is_lemmatization=True,mod = 0) for a in exp_test]

h = [desc_to_tokens(a,  False, False) for a in tokens_exp]
h1 = [get_experience(u) for u in h]


for i, j in zip(exp_test, h1):
    print(f'{i}\n -> {j}')

Опыт работы с Java от 3-х лет;
 -> 3
опыт работы не менее 2-х лет
 -> 2
аналогичный опыт работы от года;
 -> 1
опыт работы в продажах электро и другого сложного оборудования обязателен.
 -> 1
тебя есть опыт преподавания от 6 месяцев;
 -> 0.5
Возможно оформление без опыта работы
 -> 1
Опыт работы желателен.
 -> 1
опыт работы по аналогичному направлению деятельности не менее 3-х лет;
 -> 3
Опыт программирования в 1С 8 не менее 3-х лет;
 -> 3
Имеется опыт в IT recruiting (инхаус от 1 года или агентство от 6 месяцев),
 -> 1.0
Возможно без опыта, но большое желание его получить
 -> 0
На вакансию рассматриваются только кандидаты с опытом работы в сфере HR
 -> 1
Опыт работы <strong>в найме персонала</strong> от 2-х лет Опыт найма сотрудников, которые работают в компании год и более;<
 -> 2
Релевантный опыт работы как преимущество;</p> <p>- Приветствуется опыт в сетевых компаниях;
 -> 1
отделом технической поддержки (опыт от 1-ого года) 
 -> 1
Имеете опыт общения с конечным потребителем софта


In [328]:
test_df = X_train.loc[0:99,['id','description' ]]
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           100 non-null    int64 
 1   description  100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [329]:
tokens_description_test=[tokenize(a,is_lemmatization=True,mod = 0) for a in tqdm(test_df.description)]
len(tokens_description_test)

100%|██████████| 100/100 [00:04<00:00, 22.73it/s]


100

In [330]:
h_exp_test = [desc_to_tokens(a, False, False) for a in tokens_description_test]
h1_exp_test = [get_experience(u) for u in h_exp_test]
len(h_exp_test),len(h1_exp_test)

(100, 100)

In [331]:
h_eng_test = [desc_to_tokens(a, True) for a in tqdm(tokens_description_test)]
h1_eng_test = [get_english(t) for t in h_eng_test]
len(h_eng_test ),len(h1_eng_test)

100%|██████████| 100/100 [00:03<00:00, 25.03it/s]


(100, 100)

In [332]:
test_df['exp'] = h1_exp_test
test_df['eng'] = h1_eng_test
test_df

Unnamed: 0,id,description,exp,eng
0,29083,<strong>Обязанности:</strong> <p>​​​​​- Прием ...,1.0,
1,26052,<p><strong>CityAds Media</strong> – один из ли...,2.0,B
2,24055,<p>В перспективный международный Fintech старт...,2.0,
3,4408,<p><strong>Обязанности:</strong></p><ul><li>Об...,0.0,
4,16856,<p><strong>Компания MALL DECOR</strong> - лиде...,0.0,
...,...,...,...,...
95,17956,<strong>Обязанности:</strong> <ul> <li>Разрабо...,0.0,
96,16629,<p><strong>Темп в поиске харизматичных зумеров...,1.0,
97,22786,<p><strong>Обязанности:</strong></p> <ul> <li>...,1.0,
98,35738,<p><strong>Обязанности:</strong></p> <ul> <li>...,1.0,


In [333]:
if(os.path.exists("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0.pkl")):
    with open("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0.pkl", 'rb') as file:
        tokens_description_lemm_mod0 = pickle.load(file)
else:
    tokens_description_lemm_mod0=[tokenize(a,is_lemmatization=True,mod = 0) for a in tqdm(X_train.description)]
    with open('tokens_description_plus_lemment_mod0.pkl', 'wb') as file:
        pickle.dump(tokens_description_lemm_mod0, file)

In [334]:
# h_exp_full = [desc_to_tokens(a, False, False) for a in tokens_description_lemm_mod0]
# h1_exp_full = [get_experience(u) for u in h_exp_full]
# len(h_exp_full),len(h1_exp_full)

In [335]:
if(os.path.exists("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0_eng.pkl")):
    with open("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0_eng.pkl", 'rb') as file:
        h_eng_full = pickle.load(file)
else:
    h_eng_full = [desc_to_tokens(a, True) for a in tqdm(tokens_description_lemm_mod0)]
    with open('tokens_description_plus_lemment_mod0_eng.pkl', 'wb') as file:
        pickle.dump(h_eng_full, file)

In [336]:
# h_eng_full = [desc_to_tokens(a, True) for a in tokens_description_lemm_mod0]
# h1_eng_full = [get_english(t) for t in h_eng_full]
# len(h_eng_full ),len(h1_eng_full)

In [337]:
if(os.path.exists("/kaggle/input/nsu-jobs/Exp_Eng.csv")):
    Eng_Exp = pd.read_csv("/kaggle/input/nsu-jobs/Exp_Eng.csv")
    X_train=X_train.merge(Eng_Exp ,how='left', on='id')
    X_train.eng = X_train.eng.fillna('')
else:
    h_exp_full = [desc_to_tokens(a, False, False) for a in tokens_description_lemm_mod0]
    h1_exp_full = [get_experience(u) for u in h_exp_full]
    
    h1_eng_full = [get_english(t) for t in h_eng_full]
    
    X_train['exp'] = h1_exp_full
    X_train['eng'] = h1_eng_full
    X_train[['id','exp','eng']].to_csv('Exp_Eng.csv', index=False)
    X_train.eng = X_train.eng.fillna('')

In [338]:
# X_train['exp'] = h1_exp_full
# X_train['eng'] = h1_eng_full
# X_train[['id','exp','eng']].to_csv('Exp_Eng.csv', index=False)

In [339]:
# Eng_Exp = pd.read_csv("/kaggle/input/nsu-jobs/Exp_Eng.csv")
# X_train=X_train.merge(Eng_Exp ,how='left', on='id')
# X_train.eng = X_train.eng.fillna('')

In [340]:
X_train[['id','exp','eng']]

Unnamed: 0,id,exp,eng
0,29083,1.0,
1,26052,2.0,B
2,24055,2.0,
3,4408,0.0,
4,16856,0.0,
...,...,...,...
27929,16850,3.0,B
27930,6265,0.0,
27931,11284,1.0,
27932,860,0.0,B


In [341]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27934 entries, 0 to 27933
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        27934 non-null  int64              
 1   name                      27934 non-null  object             
 2   has_test                  27934 non-null  bool               
 3   response_letter_required  27934 non-null  bool               
 4   salary_from               23902 non-null  float64            
 5   salary_currency           27934 non-null  object             
 6   salary_gross              27934 non-null  bool               
 7   published_at              27934 non-null  datetime64[ns, UTC]
 8   created_at                27934 non-null  datetime64[ns, UTC]
 9   employer_name             27934 non-null  object             
 10  description               27934 non-null  object             
 11  area_id        

In [342]:
def is_bonus(tokens_lemm: list):
    bonus_words = ['бонус', 'премия', 'процент']
    return any(map(lambda bonus_word: bonus_word in tokens_lemm, bonus_words))

In [343]:
desc_is_bonus=[is_bonus(t) for t in tokens_description_lemm_mod0]
X_train['is_bonus']=desc_is_bonus
len(desc_is_bonus)

27934

In [344]:
X_train[X_train.loc[:, X_train.dtypes == object].columns] = X_train.loc[:, X_train.dtypes == object].astype('category')

# Parse Salary from description

In [345]:
def find_salary(tokens: list):
    
    def is_real_zp(zp: int):
        
        salary_theoretical_max = 500000
        salary_theoretical_min = 10000
        
        return zp <= salary_theoretical_max and zp >= salary_theoretical_min
            
    stopwords_salary = ['партнер', 'клиент', 'партнёр', 'ученик', 'компания', 'сотрудник']    
        
    regular_digit = r'\b[1-9]\d{1,2}(к|т|тыс){1}\b'   # 50к 500к 50тыс
    
    # слово тыс слово 50 100 слово слово тыс -> слово тыс слово 50000 100000 слово слово тыс
    for position in range(len(tokens)):
        if tokens[position] == 'тыс' or tokens[position] == 'к' or tokens[position] == 'т':
            how_many = 0
            for i in range(5):
                if position-i-1 < 0:
                    break
                if tokens[position-i-1].isdigit():
                    how_many += 1
                    tokens[position-i-1] += "000"
                    if how_many == 2:
                        break
    
    tokens = list(filter(lambda t: not (t == 'тыс' or t == 'к' or t == 'т') , tokens))      
    
    # 50к -> 50000   50тыс -> 50000   50т -> 50000  000к -> ничего
    for i in range(len(tokens)):
        if re.match(regular_digit, tokens[i]):
            tokens[i] = ''.join(filter(str.isdigit, tokens[i])) + "000"
            
    # 50 000 -> 50000    50 000р -> 50000    50 000руб -> 50000   50 000к -> 50000 
    for i in range(len(tokens)):
        if re.search(r'\b000(?:р|руб|к)*\b', tokens[i]):
            if i-1 >= 0 and i-1 < len(tokens) and tokens[i-1].isdigit():
                tokens[i-1] = tokens[i-1] + "000"
                tokens[i] = ""
#             if i-2 >= 0 and i-2 < len(tokens) and tokens[i-2].isdigit():
#                 tokens[i-2] = tokens[i-2] + "000"
#                 tokens[i] = ""
#             if i-3 >= 0 and i-3 < len(tokens) and tokens[i-3].isdigit():
#                 tokens[i-3] = tokens[i-3] + "000"
#                 tokens[i] = ""

    tokens = list(filter(bool, tokens)) 

    
    # 50000р -> 50000   50000руб -> 50000 
    for i in range(len(tokens)):
        if re.search(r'\b\d{5,7}(?:р|(?:руб)?)\b', tokens[i]):
            tokens[i] = ''.join(filter(str.isdigit, tokens[i]))
            
    
    min_zp_set = set()
    max_zp_set = set()
    
    # ищем до
    for i in range(len(tokens)):
        if tokens[i].isdigit() and is_real_zp(int(tokens[i])):
            if (i-1 < len(tokens)) and tokens[i-1] == 'до' and (((i+1) == len(tokens)) or (tokens[i+1] not in stopwords_salary)):
                max_zp_set.add(int(tokens[i]))

    # ищем от или не менее
    for i in range(len(tokens)):
        if tokens[i].isdigit() and is_real_zp(int(tokens[i])):
            if (i-1 >= 0) and \
            (((i+1) == len(tokens)) or (tokens[i+1] not in stopwords_salary)) and \
            ((tokens[i-1] == 'от') or ((tokens[i-1] == 'менее') and (i-2 >= 0) and (tokens[i-2] == 'не'))):
                min_zp_set.add(int(tokens[i]))

    max_zp = None
    min_zp = None
    
    # оставляем только максимальный
    max_zp = max(max_zp_set, default=None)
    
    if len(min_zp_set) == 1:
        min_zp = min(min_zp_set, default=None)
    
    # если максимум найден, вывожу его
    if max_zp and min_zp:
        return min_zp, max_zp

    if max_zp:
        return None, max_zp
    
    digits = set()
    
    # оставляю уникальные числа, после которых нет стопслов
    for i in range(len(tokens)):
        if (tokens[i].isdigit() and is_real_zp(int(tokens[i]))) and (((i+1) == len(tokens)) or (tokens[i+1] not in stopwords_salary)):
            digits.add(int(tokens[i]))
            
    digits = list(digits)
    
    # сортировка
    digits.sort(reverse=True)
    
    # если я знаю ОТ min_zp, и если максимум больше него, то возвращаю максимум
    if min_zp:
        if len(digits) >= 1 and digits[0] > min_zp:
            max_zp = digits[0]
            return min_zp, max_zp
        else:
             # если максимума нет
            return min_zp, None
        
    # если я не знаю ОТ min_zp, то ищу два максимума
    
    # беру два самых больших
    if len(digits) >= 1:
        max_zp = digits[0]
        
    if len(digits) >= 2:
        min_zp = digits[1]  
        
    return min_zp, max_zp

In [346]:
test_df = X_train[['id','description', 'is_bonus', 'salary_from', 'salary_to']][1000:1200]
descs_tokens = [tokenize(d,is_lemmatization=True,mod = 1) for d in test_df['description']]
descs_tokens = [desc_to_tokens(d) for d in descs_tokens]
oklads = [find_salary(desc_tokens) for desc_tokens in descs_tokens]

In [347]:
test_df['oklad'] = oklads
test_df

Unnamed: 0,id,description,is_bonus,salary_from,salary_to,oklad
1000,1414,"<p>Приглашаем стать членом нашей команды, в От...",False,35000.0,35000.0,"(None, None)"
1001,23209,<p><strong>Обязанности:</strong></p> <ul> <li>...,False,14000.0,28000.0,"(None, None)"
1002,25930,<p><strong>Обязанности:</strong></p> <p>- Выпо...,False,65000.0,70000.0,"(None, None)"
1003,20024,<p><strong>Должностные обязанности:</strong></...,False,45000.0,50000.0,"(None, None)"
1004,36433,<strong>Обязанности:</strong> <p>- -Выполнение...,False,,70000.0,"(None, None)"
...,...,...,...,...,...,...
1195,35973,<p><strong>Digital-агентство Web Generation Gr...,False,,40000.0,"(30000, None)"
1196,13590,<p>В группу хроматоргафии отделения комбинатор...,False,50850.0,70000.0,"(None, None)"
1197,20632,<strong>Обязанности:</strong> <ul> <li>Опыт ра...,False,60000.0,100000.0,"(None, None)"
1198,9459,"<p>В компанию, которая специализируется на ИТ ...",False,90400.0,120000.0,"(None, None)"


In [348]:
salary_min_desc = []
salary_max_desc = []
for ind, oklad in enumerate(oklads):
    salary_from = test_df.salary_from[1000+ind]
    if(oklad[1]==salary_from):
        salary_min_desc.append(oklad[1])
        salary_max_desc.append(None)
    else:
        salary_min_desc.append(oklad[0])
        salary_max_desc.append(oklad[1])

In [349]:
test_df['salary_min_desc']=salary_min_desc
test_df['salary_max_desc']=salary_max_desc

In [350]:
test_df.loc[((test_df.salary_from>test_df.salary_max_desc) | (abs(test_df['salary_min_desc']-test_df.salary_from)>test_df.salary_from*0.1)),['salary_min_desc', 'salary_max_desc']] = None 
test_df.loc[1190:1200]

Unnamed: 0,id,description,is_bonus,salary_from,salary_to,oklad,salary_min_desc,salary_max_desc
1190,20055,<strong>Обязанности:</strong> <ul> <li> <p>Экс...,False,85000.0,100000.0,"(None, None)",,
1191,4907,<p>Yva.ai – компания спин-офф от ABBYY Давида ...,True,40000.0,50000.0,"(10000, 40000)",40000.0,
1192,18236,"<p>М13 - компания, которая занимается разработ...",False,130000.0,170000.0,"(None, None)",,
1193,24621,<p><strong>Обязанности</strong>:</p> <ul> <li>...,False,73836.46,84945.0,"(None, None)",,
1194,367,<p> Группа компаний «Велунд Сталь» зарекомендо...,False,96050.0,240000.0,"(None, 30000)",,
1195,35973,<p><strong>Digital-агентство Web Generation Gr...,False,,40000.0,"(30000, None)",30000.0,
1196,13590,<p>В группу хроматоргафии отделения комбинатор...,False,50850.0,70000.0,"(None, None)",,
1197,20632,<strong>Обязанности:</strong> <ul> <li>Опыт ра...,False,60000.0,100000.0,"(None, None)",,
1198,9459,"<p>В компанию, которая специализируется на ИТ ...",False,90400.0,120000.0,"(None, None)",,
1199,36140,<p>Контроль размещения товара на складе</p> <p...,False,33900.0,50000.0,"(None, None)",,


In [351]:
import pickle
if(os.path.exists("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod1.pkl")):
    with open("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod1.pkl", 'rb') as file:
        tokens_description_lemm_mod1 = pickle.load(file)
else:
    tokens_description_lemm_mod1 = [tokenize(d,is_lemmatization=True,mod = 1) for d in tqdm(X_train['description'])]
    with open('tokens_description_plus_lemment_mod1.pkl', 'wb') as file:
        pickle.dump(tokens_description_lemm_mod1, file)

In [352]:
descs_tokens_rus = list(map(lambda desc: list(filter(lambda token: is_rus(token), desc)), tqdm(tokens_description_lemm_mod1)))
len(descs_tokens_rus)

100%|██████████| 27934/27934 [00:35<00:00, 788.49it/s]


27934

In [353]:
descs_tokens_full = [desc_to_tokens(d) for d in tqdm(descs_tokens_rus)]

100%|██████████| 27934/27934 [00:04<00:00, 5919.14it/s]


In [354]:
oklads_full = [find_salary(descs_token) for descs_token in tqdm(descs_tokens_full)]

100%|██████████| 27934/27934 [00:15<00:00, 1848.68it/s]


In [355]:
salary_min_desc_full = []
salary_max_desc_full = []
for ind, oklad in enumerate(oklads_full):
    salary_from = X_train.salary_from[ind]
    if(oklad[1]==salary_from):
        if(oklad[0]==salary_from):
            salary_min_desc_full.append(oklad[1])
            salary_max_desc_full.append(oklad[1])
        else:
            salary_min_desc_full.append(oklad[1])
            salary_max_desc_full.append(None)
    else:
        salary_min_desc_full.append(oklad[0])
        salary_max_desc_full.append(oklad[1])

In [356]:
X_train['salary_min_desc']=salary_min_desc_full
X_train['salary_max_desc']=salary_max_desc_full
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27934 entries, 0 to 27933
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        27934 non-null  int64              
 1   name                      27934 non-null  category           
 2   has_test                  27934 non-null  bool               
 3   response_letter_required  27934 non-null  bool               
 4   salary_from               23902 non-null  float64            
 5   salary_currency           27934 non-null  category           
 6   salary_gross              27934 non-null  bool               
 7   published_at              27934 non-null  datetime64[ns, UTC]
 8   created_at                27934 non-null  datetime64[ns, UTC]
 9   employer_name             27934 non-null  category           
 10  description               27934 non-null  category           
 11  area_id        

In [357]:
X_train.loc[((X_train.salary_from>=X_train.salary_max_desc) | (abs(X_train['salary_min_desc']-X_train.salary_from)>X_train.salary_from*0.1)),['salary_min_desc', 'salary_max_desc']] = None 

In [358]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27934 entries, 0 to 27933
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        27934 non-null  int64              
 1   name                      27934 non-null  category           
 2   has_test                  27934 non-null  bool               
 3   response_letter_required  27934 non-null  bool               
 4   salary_from               23902 non-null  float64            
 5   salary_currency           27934 non-null  category           
 6   salary_gross              27934 non-null  bool               
 7   published_at              27934 non-null  datetime64[ns, UTC]
 8   created_at                27934 non-null  datetime64[ns, UTC]
 9   employer_name             27934 non-null  category           
 10  description               27934 non-null  category           
 11  area_id        

# Считаем средние зп, по ним заполняем недостающие.

In [359]:
agg_prof1_emp = X_train.groupby(['start_name1','employer_name'],sort=False).agg(
    salary_mean_prof1_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_prof2_emp = X_train.groupby(['start_name2','employer_name'],sort=False).agg(
    salary_mean_prof2_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_prof3_emp = X_train.groupby(['start_name3','employer_name'],sort=False).agg(
    salary_mean_prof3_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_emp_area = X_train.groupby(['employer_name','area_id'],sort=False).agg(
    salary_mean_emp_area = ('salary_from', lambda x: np.mean(x)),
)
agg_prof1_area = X_train.groupby(['start_name1','area_id'],sort=False).agg(
    salary_mean_prof1_area = ('salary_from', lambda x: np.mean(x)),
)
agg_emp = X_train.groupby(['employer_name',],sort=False).agg(
    salary_mean_emp = ('salary_from', lambda x: np.mean(x)),
)
agg_prof = X_train.groupby(['start_name1',],sort=False).agg(
    salary_mean_prof = ('salary_from', lambda x: np.mean(x)),
)
agg_area = X_train.groupby(['area_id',],sort=False).agg(
    salary_mean_area = ('salary_from', lambda x: np.mean(x)),
)
agg_all = X_train.salary_from.mean()

agg_prof12 = X_train.groupby(['start_name1','start_name2',],sort=False).agg(
    salary_mean_prof12 = ('salary_from', lambda x: np.mean(x)),
)
agg_prof13 = X_train.groupby(['start_name1','start_name3',],sort=False).agg(
    salary_mean_prof13 = ('salary_from', lambda x: np.mean(x)),
)
agg_prof23 = X_train.groupby(['start_name2','start_name3',],sort=False).agg(
    salary_mean_prof23 = ('salary_from', lambda x: np.mean(x)),
)

agg_opt_emp = X_train.groupby(['optimum_title','employer_name'],sort=False).agg(
    salary_mean_opt_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_opt_area = X_train.groupby(['optimum_title','area_id'],sort=False).agg(
    salary_mean_opt_area = ('salary_from', lambda x: np.mean(x)),
)

agg_opt = X_train.groupby(['optimum_title'],sort=False).agg(
    salary_mean_opt = ('salary_from', lambda x: np.mean(x)),
)
agg_opt_prof1 = X_train.groupby(['optimum_title', 'start_name1'],sort=False).agg(
    salary_mean_opt_prof1 = ('salary_from', lambda x: np.mean(x)),
)

In [360]:
agg_prof12 = X_train.groupby(['start_name1','start_name2',],sort=False).agg(
    salary_mean_prof12 = ('salary_from', lambda x: np.mean(x)),
)
agg_prof13 = X_train.groupby(['start_name1','start_name3',],sort=False).agg(
    salary_mean_prof13 = ('salary_from', lambda x: np.mean(x)),
)
agg_prof23 = X_train.groupby(['start_name2','start_name3',],sort=False).agg(
    salary_mean_prof23 = ('salary_from', lambda x: np.mean(x)),
)

In [361]:
agg_opt_emp = X_train.groupby(['optimum_title','employer_name'],sort=False).agg(
    salary_mean_opt_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_opt_area = X_train.groupby(['optimum_title','area_id'],sort=False).agg(
    salary_mean_opt_area = ('salary_from', lambda x: np.mean(x)),
)

agg_opt = X_train.groupby(['optimum_title'],sort=False).agg(
    salary_mean_opt = ('salary_from', lambda x: np.mean(x)),
)
agg_opt_prof1 = X_train.groupby(['optimum_title', 'start_name1'],sort=False).agg(
    salary_mean_opt_prof1 = ('salary_from', lambda x: np.mean(x)),
)

In [362]:
agg_opt_emp.reset_index(inplace=True)
agg_opt_area.reset_index(inplace=True)
agg_opt.reset_index(inplace=True)
agg_opt_prof1.reset_index(inplace=True)

agg_prof1_emp.reset_index(inplace=True)
agg_prof2_emp.reset_index(inplace=True)
agg_prof3_emp.reset_index(inplace=True)
agg_emp_area.reset_index(inplace=True)
agg_prof1_area.reset_index(inplace=True)
agg_emp.reset_index(inplace=True)
agg_prof.reset_index(inplace=True)
agg_area.reset_index(inplace=True)
agg_prof12.reset_index(inplace=True)
agg_prof13.reset_index(inplace=True)
agg_prof23.reset_index(inplace=True)

In [363]:
agg_opt_emp=agg_opt_emp.dropna()
agg_opt_area=agg_opt_area.dropna()
agg_opt=agg_opt.dropna()
agg_opt_prof1=agg_opt_prof1.dropna()

agg_prof1_emp = agg_prof1_emp.dropna()
agg_prof2_emp = agg_prof2_emp.dropna()
agg_prof3_emp = agg_prof3_emp.dropna()
agg_emp_area = agg_emp_area.dropna()
agg_prof1_area = agg_prof1_area.dropna()
agg_emp = agg_emp.dropna()
agg_prof = agg_prof.dropna()
agg_area = agg_area.dropna()
agg_prof12 = agg_prof12.dropna()
agg_prof13 = agg_prof13.dropna()
agg_prof23 = agg_prof23.dropna()

In [364]:
len(agg_prof1_emp), len(agg_emp_area), len(agg_prof1_area), len(agg_emp),len(agg_prof),len(agg_area),len(agg_prof12), len(agg_prof13), len(agg_prof23),

(16774, 13112, 2200, 11980, 925, 150, 5025, 6440, 6632)

In [365]:
X_train2 = X_train.copy()
without_salary = X_train2[X_train2.salary_from.isnull()]

In [366]:
without_salary = without_salary.merge(agg_opt_emp, on=['optimum_title','employer_name'], how='left')
without_salary = without_salary.merge(agg_opt_area, on=['optimum_title','area_id'], how='left')
without_salary = without_salary.merge(agg_opt, on=['optimum_title'], how='left')
without_salary = without_salary.merge(agg_opt_prof1, on=['optimum_title', 'start_name1'], how='left')

without_salary = without_salary.merge(agg_prof12, on=['start_name1','start_name2'], how='left')
without_salary = without_salary.merge(agg_prof13, on=['start_name1','start_name3'], how='left')
without_salary= without_salary.merge(agg_prof23, on=['start_name2','start_name3'], how='left')

without_salary = without_salary.merge(agg_prof1_emp, on=['start_name1','employer_name'], how='left')
without_salary = without_salary.merge(agg_prof2_emp, on=['start_name2','employer_name'], how='left')
without_salary = without_salary.merge(agg_prof3_emp, on=['start_name3','employer_name'], how='left')

without_salary = without_salary.merge(agg_emp_area, on=['employer_name','area_id'], how='left')
without_salary = without_salary.merge(agg_prof1_area, on=['start_name1','area_id'], how='left')
without_salary = without_salary.merge(agg_emp, on=['employer_name',], how='left')
without_salary = without_salary.merge(agg_prof, on=['start_name1',], how='left')
without_salary = without_salary.merge(agg_area, on=['area_id',], how='left')

In [367]:
without_salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4032 entries, 0 to 4031
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        4032 non-null   int64              
 1   name                      4032 non-null   category           
 2   has_test                  4032 non-null   bool               
 3   response_letter_required  4032 non-null   bool               
 4   salary_from               0 non-null      float64            
 5   salary_currency           4032 non-null   category           
 6   salary_gross              4032 non-null   bool               
 7   published_at              4032 non-null   datetime64[ns, UTC]
 8   created_at                4032 non-null   datetime64[ns, UTC]
 9   employer_name             4032 non-null   category           
 10  description               4032 non-null   category           
 11  area_id          

In [368]:
# without_salary[['salary_from','salary_to','salary_mean_opt_emp','salary_mean_opt_prof1','salary_mean_opt_area','salary_mean_opt','salary_mean_prof12', 'salary_mean_prof13', 'salary_mean_prof23','salary_mean_prof1_emp','salary_mean_prof2_emp', 'salary_mean_prof3_emp','salary_mean_emp_area','salary_mean_prof1_area','salary_mean_emp', 'salary_mean_prof','salary_mean_area']].sample(12)

In [369]:
from tqdm import tqdm
median_emp=[]
median_prof = []
for i in tqdm(range(len(without_salary))):
    row_values_emp = without_salary.loc[i, ['salary_mean_prof1_emp','salary_mean_prof2_emp', 'salary_mean_prof3_emp','salary_mean_opt_emp']]
    row_values_prof = without_salary.loc[i, ['salary_mean_opt_prof1','salary_mean_opt_area', 'salary_mean_opt','salary_mean_prof12','salary_mean_prof13','salary_mean_prof23','salary_mean_emp_area','salary_mean_prof1_area','salary_mean_emp','salary_mean_prof']]
    median_emp.append(row_values_emp.mean())
    median_prof.append(row_values_prof.mean())

100%|██████████| 4032/4032 [00:06<00:00, 660.93it/s]


In [370]:
without_salary['median_emp'] = median_emp
without_salary['median_prof'] = median_prof

In [371]:
without_salary.loc[(without_salary.salary_from.isnull()),'salary_from'] = without_salary.loc[(without_salary.salary_from.isnull())].median_emp
without_salary.loc[(without_salary.salary_from.isnull()),'salary_from'] = without_salary.loc[(without_salary.salary_from.isnull())].median_prof
without_salary.loc[(without_salary.salary_from.isnull()),'salary_from'] = without_salary.loc[(without_salary.salary_from.isnull())].salary_mean_area

In [372]:
without_salary = without_salary.rename (columns= {'salary_from': 'salary_from_without'})

In [373]:
without_salary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4032 entries, 0 to 4031
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        4032 non-null   int64              
 1   name                      4032 non-null   category           
 2   has_test                  4032 non-null   bool               
 3   response_letter_required  4032 non-null   bool               
 4   salary_from_without       4032 non-null   float64            
 5   salary_currency           4032 non-null   category           
 6   salary_gross              4032 non-null   bool               
 7   published_at              4032 non-null   datetime64[ns, UTC]
 8   created_at                4032 non-null   datetime64[ns, UTC]
 9   employer_name             4032 non-null   category           
 10  description               4032 non-null   category           
 11  area_id          

# Добавляем в трейн

In [374]:
X_train2 = X_train.copy()
X_train2 = X_train2.merge(agg_opt_emp, on=['optimum_title','employer_name'], how='left')
X_train2 = X_train2.merge(agg_opt_area, on=['optimum_title','area_id'], how='left')
X_train2 = X_train2.merge(agg_opt, on=['optimum_title'], how='left')
X_train2 = X_train2.merge(agg_opt_prof1, on=['optimum_title', 'start_name1'], how='left')

X_train2 = X_train2.merge(agg_prof12, on=['start_name1','start_name2'], how='left')
X_train2 = X_train2.merge(agg_prof13, on=['start_name1','start_name3'], how='left')
X_train2 = X_train2.merge(agg_prof23, on=['start_name2','start_name3'], how='left')

X_train2 = X_train2.merge(agg_prof1_emp, on=['start_name1','employer_name'], how='left')
X_train2 = X_train2.merge(agg_prof2_emp, on=['start_name2','employer_name'], how='left')
X_train2 = X_train2.merge(agg_prof3_emp, on=['start_name3','employer_name'], how='left')

X_train2 = X_train2.merge(agg_emp_area, on=['employer_name','area_id'], how='left')
X_train2 = X_train2.merge(agg_prof1_area, on=['start_name1','area_id'], how='left')
X_train2 = X_train2.merge(agg_emp, on=['employer_name',], how='left')
X_train2 = X_train2.merge(agg_prof, on=['start_name1',], how='left')
X_train2 = X_train2.merge(agg_area, on=['area_id',], how='left')

In [375]:
X_train2= X_train2.merge(without_salary[['id','salary_from_without']], on='id', how='left')
X_train2.loc[(X_train2.salary_from.isnull()),'salary_from'] = X_train2.salary_from_without

In [376]:
X_train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27934 entries, 0 to 27933
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        27934 non-null  int64              
 1   name                      27934 non-null  category           
 2   has_test                  27934 non-null  bool               
 3   response_letter_required  27934 non-null  bool               
 4   salary_from               27934 non-null  float64            
 5   salary_currency           27934 non-null  category           
 6   salary_gross              27934 non-null  bool               
 7   published_at              27934 non-null  datetime64[ns, UTC]
 8   created_at                27934 non-null  datetime64[ns, UTC]
 9   employer_name             27934 non-null  category           
 10  description               27934 non-null  category           
 11  area_id        

In [377]:
# X_train2['difff'] = abs(X_train1.salary_from - X_train1.salary_to)
# pd.DataFrame({'from':X_train1.salary_from, 'to': X_train1.salary_to, 'difff': X_train1.difff}).sort_values(by='difff').tail(10)

In [378]:
X_train2[X_train2.salary_from<5000]

Unnamed: 0,id,name,has_test,response_letter_required,salary_from,salary_currency,salary_gross,published_at,created_at,employer_name,description,area_id,area_name,salary_to,timestamp,start_name1,start_name2,start_name3,optimum_title,exp,eng,is_bonus,salary_min_desc,salary_max_desc,salary_mean_opt_emp,salary_mean_opt_area,salary_mean_opt,salary_mean_opt_prof1,salary_mean_prof12,salary_mean_prof13,salary_mean_prof23,salary_mean_prof1_emp,salary_mean_prof2_emp,salary_mean_prof3_emp,salary_mean_emp_area,salary_mean_prof1_area,salary_mean_emp,salary_mean_prof,salary_mean_area,salary_from_without
466,35514,Мастер маникюра и педикюра (м),False,False,60.0,RUR,False,2020-07-12 10:27:10+00:00,2020-07-12 10:27:10+00:00,HEIM beauty studio,<p><strong>В салон красоты HEIM BEAUTY требует...,1,Москва,100000.0,1.594550e+09,мастер,маникюр,педикюр,мастер маникюр,0.0,,False,,,60.0,50635.384615,46014.444444,47391.764706,47391.764706,44884.444444,44884.444444,60.0,60.0,60.0,60.000000,59413.933444,60.000000,56474.200083,69104.656165,
481,23899,QA engineer (инженер по качеству / тестировщик...,False,False,40.0,RUR,False,2020-04-09 01:55:19+00:00,2020-04-09 01:55:19+00:00,Роадгид,Автомобильные видео-регистраторы Roadgid прода...,2,Санкт-Петербург,55.0,1.586397e+09,qa,engineer,инженер,инженер,0.0,B,False,,,40.0,68485.385417,73966.940963,99460.000000,93485.000000,25020.000000,25020.000000,40.0,40.0,40.0,40.000000,130480.000000,40.000000,129254.137931,55174.750504,
816,25278,Секретарь,False,False,1130.0,RUR,True,2020-04-27 22:10:59+00:00,2020-04-27 22:10:59+00:00,Дельта +,<strong>Обязанности:</strong> <ul> <li>Делопро...,1,Москва,30000.0,1.588025e+09,секретарь,,,секретарь,0.0,,False,,,1130.0,38430.739130,37333.534247,37746.000000,39298.666667,37935.575758,52690.940836,1130.0,1130.0,1130.0,1130.000000,38145.870968,1130.000000,38300.821053,69104.656165,
1105,18751,Копирайтер технических текстов в мототематике,False,False,2000.0,RUR,False,2021-11-15 06:58:28+00:00,2021-11-15 06:58:28+00:00,СМ-Моторс,<p><strong>Компания &quot;SM - Motors&quot;( Ф...,1,Москва,80000.0,1.636960e+09,копирайтер,технический,текст,копирайтер,0.0,,False,,,2000.0,43057.186940,41865.795847,42043.412675,14666.666667,4666.666667,2000.000000,2000.0,2000.0,2000.0,2000.000000,43355.209456,2000.000000,42043.412675,69104.656165,
1924,3831,Стажер SMM,False,False,1000.0,RUR,False,2020-08-28 10:15:03+00:00,2020-08-28 10:15:03+00:00,Романов Дмитрий Алексеевич,"<p>Мы рассматриваем вовлечённого человека, кот...",1,Москва,1500.0,1.598610e+09,стажёр,smm,,стажёр,0.0,,False,,,7750.0,48139.747849,43931.649784,43859.120312,19000.000000,42625.769231,38900.000000,7750.0,1000.0,15500.0,31148.148148,42471.511454,31107.142857,39535.968772,69104.656165,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27242,15034,Investment Analyst/ Investment Associate,False,False,150.0,RUR,False,2021-06-25 12:20:29+00:00,2021-06-25 12:20:29+00:00,Black Lions Capital,<p>Black Lions Capital is a company within Red...,1,Москва,250.0,1.624624e+09,analyst,associate,,analyst,0.0,C,False,,,150.0,96880.555556,110352.923077,66541.666667,150.000000,66541.666667,20075.000000,150.0,150.0,150.0,150.000000,59850.000000,150.000000,66541.666667,69104.656165,
27618,27236,Классный барбер/ стилист по волосам,False,False,56.5,RUR,True,2020-05-13 21:26:58+00:00,2020-05-13 21:26:58+00:00,Glambar,"<p>Glambar -это новый проект, созданный команд...",1,Москва,120000.0,1.589405e+09,классный,барбер,стилист,барбер,1.0,,False,,,56.5,50091.300000,50091.300000,56.500000,56.500000,56.500000,56.500000,56.5,56.5,56.5,28278.250000,56.500000,28278.250000,56.500000,69104.656165,
27645,29592,Шеф-повар,False,False,150.0,RUR,False,2020-06-28 18:21:12+00:00,2020-06-28 18:21:12+00:00,Подъяпольская Татьяна Викторовна,<strong>Обязанности:</strong> <ul> <li>Создани...,1,Москва,300.0,1.593368e+09,шеф,повар,,повар,3.0,,False,,,150.0,44050.455696,41346.057851,75421.428571,75421.428571,73519.230769,67772.222222,150.0,150.0,150.0,150.000000,72975.000000,150.000000,73383.333333,69104.656165,
27728,31616,Администратор в СПА-Студию при фитнес-клубе (г...,False,False,50.0,RUR,False,2020-08-02 14:55:53+00:00,2020-08-02 14:55:53+00:00,Эстетик Фэмили,<strong>Обязанности:</strong> <ul> <li>Презент...,1,Москва,65.0,1.596380e+09,администратор,спа,студия,администратор,1.0,,False,,10000.0,50.0,45812.573150,43432.176750,40255.478562,20025.000000,25010.000000,50.000000,50.0,50.0,50.0,50.000000,42763.253638,50.000000,40179.239459,69104.656165,


In [379]:
# X_train2[['salary_from','salary_to','salary_mean_opt_emp','salary_mean_opt_prof1','salary_mean_opt_area','salary_mean_opt','salary_mean_prof12', 'salary_mean_prof13', 'salary_mean_prof23','salary_mean_prof1_emp','salary_mean_prof2_emp', 'salary_mean_prof3_emp','salary_mean_emp_area','salary_mean_prof1_area','salary_mean_emp', 'salary_mean_prof','salary_mean_area']].sample(12)

# Посмотрим крайные ЗП


In [380]:
X_train_low = X_train[X_train.salary_from<=5000]

In [381]:
X_train_norm = X_train[(X_train.salary_from>5000) & (X_train.salary_from<=300000)]

In [382]:
X_train_high = X_train[X_train.salary_from>5000]

In [383]:
len(X_train_low), len(X_train_norm), len(X_train_high)

(127, 23706, 23775)

In [384]:
tes = X_train_low[['id', 'salary_from', 'salary_to', 'salary_min_desc', 'salary_max_desc']]

In [385]:
tes[(~tes.salary_min_desc.isnull())]

Unnamed: 0,id,salary_from,salary_to,salary_min_desc,salary_max_desc


In [386]:
tes[((~tes.salary_max_desc.isnull()) | (~tes.salary_min_desc.isnull()))]

Unnamed: 0,id,salary_from,salary_to,salary_min_desc,salary_max_desc
4917,906,5000.0,15000.0,,15000.0
14068,6260,35.0,60.0,,137000.0
20184,25928,5000.0,10000.0,,10000.0
20805,34963,1600.0,1800.0,,414000.0
21190,6752,2500.0,2500.0,,50000.0
21938,34492,5000.0,10000.0,,10000.0
21964,16727,5000.0,10000.0,,10000.0
23017,5798,15.0,25.0,,15000.0
23125,30979,50.85,100000.0,,35000.0
27728,31616,50.0,65.0,,10000.0


# Train

In [387]:
X_train2 = X_train2.drop(columns=['salary_from_without'])
X_train = X_train2.copy()

In [388]:
y_true = X_train.salary_to
X = X_train.drop(columns=["id", "description","salary_currency","salary_to", 'published_at','created_at', 'exp', 'eng', 'is_bonus'])

In [389]:
category_feat = X.loc[:, X.dtypes == 'category'].columns.to_list()
category_feat

['name',
 'employer_name',
 'area_name',
 'start_name1',
 'start_name2',
 'start_name3',
 'optimum_title']

In [390]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27934 entries, 0 to 27933
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   name                      27934 non-null  category
 1   has_test                  27934 non-null  bool    
 2   response_letter_required  27934 non-null  bool    
 3   salary_from               27934 non-null  float64 
 4   salary_gross              27934 non-null  bool    
 5   employer_name             27934 non-null  category
 6   area_id                   27934 non-null  int64   
 7   area_name                 27934 non-null  category
 8   timestamp                 27934 non-null  float64 
 9   start_name1               27934 non-null  category
 10  start_name2               27934 non-null  category
 11  start_name3               27934 non-null  category
 12  optimum_title             27934 non-null  category
 13  salary_min_desc           3291 non-null   floa

In [391]:
X_tr, X_val, y_tr, y_val = train_test_split(X, y_true, test_size=0.2, random_state=912, shuffle=True)


In [392]:
from catboost import CatBoostRegressor
cat = CatBoostRegressor(cat_features=category_feat, iterations=600, learning_rate=0.1, depth=10,score_function= 'L2', random_state=912, boosting_type = 'Plain',has_time=True,max_bin=254, random_strength=1)

In [393]:
cat.fit(X_tr,y_tr)

0:	learn: 98584.3987082	total: 72.8ms	remaining: 43.6s
1:	learn: 93947.2761764	total: 140ms	remaining: 41.9s
2:	learn: 89945.4857068	total: 208ms	remaining: 41.4s
3:	learn: 86313.0415324	total: 276ms	remaining: 41.2s
4:	learn: 83166.1458553	total: 343ms	remaining: 40.8s
5:	learn: 80286.6387234	total: 413ms	remaining: 40.9s
6:	learn: 77767.0857814	total: 483ms	remaining: 40.9s
7:	learn: 75625.3386193	total: 554ms	remaining: 41s
8:	learn: 73697.2914719	total: 624ms	remaining: 40.9s
9:	learn: 71887.7796061	total: 694ms	remaining: 40.9s
10:	learn: 70379.9834575	total: 762ms	remaining: 40.8s
11:	learn: 69095.2970549	total: 834ms	remaining: 40.9s
12:	learn: 67898.2060635	total: 911ms	remaining: 41.1s
13:	learn: 66702.3278160	total: 988ms	remaining: 41.4s
14:	learn: 65729.7316718	total: 1.09s	remaining: 42.7s
15:	learn: 64809.1722370	total: 1.17s	remaining: 42.8s
16:	learn: 63968.0981218	total: 1.24s	remaining: 42.5s
17:	learn: 63194.6008213	total: 1.31s	remaining: 42.4s
18:	learn: 62511.9328

<catboost.core.CatBoostRegressor at 0x7fe38bc34310>

In [394]:

pd.DataFrame({'featchure':X_tr.columns, 'imp': cat.feature_importances_})

Unnamed: 0,featchure,imp
0,name,0.701808
1,has_test,0.100207
2,response_letter_required,0.731988
3,salary_from,18.183968
4,salary_gross,1.498029
5,employer_name,3.783938
6,area_id,0.681302
7,area_name,1.19078
8,timestamp,8.646606
9,start_name1,8.109944


In [395]:
y_pred_val = cat.predict(X_val)

In [396]:
import numpy as np

def sym_mean_absolute_percentage_error(actual, predicted):
    return 200*np.mean(np.abs(actual - predicted) / (np.abs(actual) + np.abs(predicted)))


In [397]:
sym_mean_absolute_percentage_error(y_val,y_pred_val)

23.003961296892538

In [398]:
qw = pd.DataFrame({ 'X_from': X_val.salary_from,'y_true':y_val, 'y_pred': y_pred_val,'salary_max_desc':X_val.salary_max_desc, 'y_pred_copy': y_pred_val  })

In [399]:
qw.loc[~qw.salary_max_desc.isnull(),'y_pred'] = qw.loc[~qw.salary_max_desc.isnull()].salary_max_desc
qw.loc[~qw.salary_max_desc.isnull()]

Unnamed: 0,X_from,y_true,y_pred,salary_max_desc,y_pred_copy
5739,37000.000000,45000.0,45000.0,45000.0,51544.207760
16333,30000.000000,35000.0,35000.0,35000.0,42797.243266
25790,55656.171349,20000.0,10000.0,10000.0,120855.195004
21308,50000.000000,70000.0,70000.0,70000.0,97130.012570
2632,60887.304029,80000.0,100000.0,100000.0,83080.243805
...,...,...,...,...,...
8073,50000.000000,70000.0,60000.0,60000.0,62736.572142
11316,39200.000000,40000.0,40000.0,40000.0,40653.999171
27094,50000.000000,110000.0,110000.0,110000.0,104781.841082
1782,38985.000000,43000.0,43000.0,43000.0,40278.768658


In [400]:
y_pred_val = qw.y_pred

In [401]:
smape = sym_mean_absolute_percentage_error(y_val,y_pred_val)
print("SMAPE:", smape)

SMAPE: 22.45448109567663


In [402]:
qw['diff'] = abs(qw.y_true-qw.y_pred)

In [403]:
qw[((~qw.salary_max_desc.isnull()))].sort_values(by = 'diff').tail(20)

Unnamed: 0,X_from,y_true,y_pred,salary_max_desc,y_pred_copy,diff
9263,40000.0,200000.0,352185.0,352185.0,171157.137572,152185.0
3134,100000.0,300000.0,130000.0,130000.0,209409.718892,170000.0
16918,124300.0,130000.0,300000.0,300000.0,248564.282181,170000.0
19418,125000.0,550000.0,378000.0,378000.0,271005.759821,172000.0
5981,90000.0,300000.0,100000.0,100000.0,210521.430404,200000.0
27791,75000.0,150000.0,350000.0,350000.0,217224.403428,200000.0
25259,165038.990802,250000.0,40000.0,40000.0,300266.230763,210000.0
26549,179290.853477,250000.0,25000.0,25000.0,180063.277186,225000.0
26625,100000.0,400000.0,150000.0,150000.0,251723.191828,250000.0
22859,32000.0,300000.0,39000.0,39000.0,112408.793441,261000.0


In [404]:
qw.loc[((~qw.salary_max_desc.isnull()) &(qw.y_pred>qw.X_from*2)  ), 'y_pred'] = qw.loc[((~qw.salary_max_desc.isnull()) &(qw.y_pred>qw.X_from*2))].y_pred_copy

In [405]:
qw['diff'] = abs(qw.y_true-qw.y_pred)

In [406]:
qw[((~qw.salary_max_desc.isnull()) )].sort_values(by = 'diff').tail(20)

Unnamed: 0,X_from,y_true,y_pred,salary_max_desc,y_pred_copy,diff
7236,59140.679906,150000.0,39000.0,39000.0,88777.09,111000.0
20444,131071.461722,400000.0,288360.0,300000.0,288360.0,111640.0
7809,100000.0,150000.0,261646.3,300000.0,261646.3,111646.3
25733,81650.0,150000.0,265422.6,500000.0,265422.6,115422.6
16918,124300.0,130000.0,248564.3,300000.0,248564.3,118564.3
16156,120000.0,260000.0,140000.0,140000.0,180814.9,120000.0
21232,50000.0,100000.0,220392.4,200000.0,220392.4,120392.4
11393,263333.333333,400000.0,250000.0,250000.0,313350.9,150000.0
20004,220000.0,250000.0,400000.0,400000.0,339789.6,150000.0
13342,127193.226478,250000.0,100000.0,100000.0,205258.6,150000.0


In [407]:
sym_mean_absolute_percentage_error(qw[((~qw.salary_max_desc.isnull()))].y_true, qw[((~qw.salary_max_desc.isnull()))].y_pred)

14.313717620526193

In [408]:
qw.loc[qw.X_from<150,'y_pred']  = qw.loc[qw.X_from<=150].X_from*1.2
y_pred_val=qw.y_pred

In [409]:
smape = sym_mean_absolute_percentage_error(y_val,y_pred_val)
print("SMAPE:", smape)

SMAPE: 22.140890728029504


# Train all + predict Test

In [410]:
y_true = X_train.salary_to
X = X_train.drop(columns=["id", "description","salary_currency","salary_to", 'published_at','created_at' ])

In [411]:
category_feat = X.loc[:, X.dtypes == 'category'].columns.to_list()
category_feat

['name',
 'employer_name',
 'area_name',
 'start_name1',
 'start_name2',
 'start_name3',
 'optimum_title',
 'eng']

In [412]:
cat = CatBoostRegressor(cat_features=category_feat, iterations=700, learning_rate=0.1, depth=10,score_function= 'L2', random_state=912, boosting_type = 'Plain',has_time=False,max_bin=254, random_strength=1)


In [413]:
cat.fit(X, y_true)

0:	learn: 100085.4514101	total: 107ms	remaining: 1m 15s
1:	learn: 95512.2620877	total: 208ms	remaining: 1m 12s
2:	learn: 91465.3428053	total: 311ms	remaining: 1m 12s
3:	learn: 87891.7330466	total: 403ms	remaining: 1m 10s
4:	learn: 84803.5566089	total: 508ms	remaining: 1m 10s
5:	learn: 82171.6821106	total: 611ms	remaining: 1m 10s
6:	learn: 79953.9530379	total: 708ms	remaining: 1m 10s
7:	learn: 77901.9137021	total: 817ms	remaining: 1m 10s
8:	learn: 76235.0267165	total: 920ms	remaining: 1m 10s
9:	learn: 74650.1293763	total: 1.02s	remaining: 1m 10s
10:	learn: 73092.9237140	total: 1.14s	remaining: 1m 11s
11:	learn: 71933.0748301	total: 1.24s	remaining: 1m 10s
12:	learn: 70843.7678538	total: 1.33s	remaining: 1m 10s
13:	learn: 69872.6075062	total: 1.44s	remaining: 1m 10s
14:	learn: 69017.1915045	total: 1.58s	remaining: 1m 12s
15:	learn: 68273.3549968	total: 1.71s	remaining: 1m 13s
16:	learn: 67617.4489874	total: 1.84s	remaining: 1m 14s
17:	learn: 66986.0531172	total: 1.99s	remaining: 1m 15s
1

<catboost.core.CatBoostRegressor at 0x7fe38c17b670>

# Test

In [414]:
X_test = pd.read_csv('/kaggle/input/nsu-jobs/nsu-bda-2023-jobs/X_test.csv')
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9312 entries, 0 to 9311
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        9312 non-null   int64  
 1   name                      9312 non-null   object 
 2   has_test                  9312 non-null   bool   
 3   response_letter_required  9312 non-null   bool   
 4   salary_from               7924 non-null   float64
 5   salary_currency           9312 non-null   object 
 6   salary_gross              9263 non-null   object 
 7   published_at              9312 non-null   object 
 8   created_at                9312 non-null   object 
 9   employer_name             9312 non-null   object 
 10  description               9312 non-null   object 
 11  area_id                   9312 non-null   int64  
 12  area_name                 9312 non-null   object 
dtypes: bool(2), float64(1), int64(2), object(8)
memory usage: 818.6

In [415]:
X_test.description = X_test.description.fillna('')
X_test.published_at = pd.to_datetime(X_test.published_at, errors='raise')
X_test.created_at = pd.to_datetime(X_test.created_at, errors='raise')
X_test['timestamp'] = (X_test.published_at.astype('int64') / 10**9)
X_test.salary_gross=X_test.salary_gross.astype('bool')
X_test.loc[X_test.salary_gross, 'salary_from'] = X_test.loc[X_test.salary_gross].salary_from*1.13

In [416]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9312 entries, 0 to 9311
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        9312 non-null   int64              
 1   name                      9312 non-null   object             
 2   has_test                  9312 non-null   bool               
 3   response_letter_required  9312 non-null   bool               
 4   salary_from               7924 non-null   float64            
 5   salary_currency           9312 non-null   object             
 6   salary_gross              9312 non-null   bool               
 7   published_at              9312 non-null   datetime64[ns, UTC]
 8   created_at                9312 non-null   datetime64[ns, UTC]
 9   employer_name             9312 non-null   object             
 10  description               9312 non-null   object             
 11  area_id          

In [417]:
from tqdm import tqdm
names1_test = []
names2_test = []
names3_test = []
for t in tqdm(X_test.name):
    title = preprocess_title(t)
    if(len(title)>=3):
        names1_test.append(title[0])
        names2_test.append(title[1])
        names3_test.append(title[2])
        continue
    elif(len(title)>=2):
        names1_test.append(title[0])
        names2_test.append(title[1])
        names3_test.append('')
        continue
    elif(len(title)>=1):
        names1_test.append(title[0])
        names2_test.append('')
        names3_test.append('')
    else:
        names1_test.append('')
        names2_test.append('')
        names3_test.append('')

100%|██████████| 9312/9312 [00:17<00:00, 539.07it/s]


In [418]:
X_test['start_name1'] = names1_test
X_test['start_name2'] = names2_test
X_test['start_name3'] = names3_test

In [419]:
title_counts_test = Counter(X_test['name'].apply(lambda x: " ".join(preprocess_title(x))).values)
del title_counts_test['']
title_counts_test.most_common(10)

[('менеджер продажа', 110),
 ('менеджер работа клиент', 70),
 ('юрист', 62),
 ('frontend разработчик', 56),
 ('помощник юрист', 54),
 ('уборщица уборщик', 49),
 ('копирайтер', 48),
 ('аналитик', 44),
 ('продавец консультант', 40),
 ('менеджер закупка', 39)]

In [420]:
optimum_titless_test = []
optimum_countss_test = []

for title in tqdm(X_test.name):
    optimum_title = optimize_title(title, title_counts=title_counts_test)
    optimum_count = title_counts[optimum_title]
    optimum_titless_test.append(optimum_title)
    optimum_countss_test.append(optimum_count)

100%|██████████| 9312/9312 [00:18<00:00, 513.12it/s]


In [421]:
X_test['optimum_title'] = optimum_titless_test

In [422]:
if(os.path.exists("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0_test.pkl")):
    with open("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0_test.pkl", 'rb') as file:
        tokens_description_lemm_mod0_test = pickle.load(file)
else:
    tokens_description_lemm_mod0_test=[tokenize(a,is_lemmatization=True,mod = 0) for a in tqdm(X_test.description)]
    with open('tokens_description_plus_lemment_mod0_test.pkl', 'wb') as file:
        pickle.dump(tokens_description_lemm_mod0_test, file)

In [423]:
# h_exp_full_test = [desc_to_tokens(a, False, False) for a in tokens_description_lemm_mod0_test]
# h1_exp_full_test = [get_experience(u) for u in h_exp_full_test]
# len(h_exp_full_test),len(h1_exp_full_test)

In [424]:
if(os.path.exists("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0_eng_test.pkl")):
    with open("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod0_eng_test.pkl", 'rb') as file:
        h_eng_full_test = pickle.load(file)
else:
    h_eng_full_test = [desc_to_tokens(a, True) for a in tqdm(tokens_description_lemm_mod0_test)]
    with open('tokens_description_plus_lemment_mod0_eng_test.pkl', 'wb') as file:
        pickle.dump(h_eng_full_test, file)

In [425]:
if(os.path.exists("/kaggle/input/nsu-jobs/ExpEng_test.csv")):
    EngExp_test = pd.read_csv("/kaggle/input/nsu-jobs/ExpEng_test.csv")
    X_test=X_test.merge(EngExp_test ,how='left', on='id')
    X_test.eng = X_test.eng.fillna('')
else:
    h_exp_full_test = [desc_to_tokens(a, False, False) for a in tokens_description_lemm_mod0_test]
    h1_exp_full_test = [get_experience(u) for u in h_exp_full_test]
    
    h1_eng_full_test = [get_english(t) for t in h_eng_full_test]
    
    X_test['exp'] = h1_exp_full_test
    X_test['eng'] = h1_eng_full_test
    X_test[['id','exp','eng']].to_csv('ExpEng_test.csv', index=False)
    X_train.eng = X_train.eng.fillna('')

In [426]:
# h_eng_full_test = [desc_to_tokens(a, True) for a in tqdm(tokens_description_full_test)]
# h1_eng_full_test = [get_english(t) for t in h_eng_full_test]
# len(h_eng_full_test ),len(h1_eng_full_test)

In [427]:
# X_test['exp'] = h1_exp_full_test
# X_test['eng'] = h1_eng_full_test
# X_test[['id','exp','eng']].to_csv('ExpEng_test.csv', index=False)

In [428]:
# EngExp_test = pd.read_csv("/kaggle/input/nsu-jobs/ExpEng_test.csv")
# X_test=X_test.merge(EngExp_test ,how='left', on='id')
# X_test.eng = X_test.eng.fillna('')

In [429]:
desc_is_bonus_test=[is_bonus(t) for t in tokens_description_lemm_mod0_test]
len(desc_is_bonus_test)

9312

In [430]:
X_test['is_bonus']=desc_is_bonus_test

In [431]:
import pickle
if(os.path.exists("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod1_test.pkl")):
    with open("/kaggle/input/nsu-jobs/tokens_description_plus_lemment_mod1_test.pkl", 'rb') as file:
        tokens_description_lemm_mod1_test = pickle.load(file)
else:
    tokens_description_lemm_mod1_test = [tokenize(d,is_lemmatization=True,mod = 1) for d in tqdm(X_test['description'])]
    with open('tokens_description_plus_lemment_mod1_test.pkl', 'wb') as file:
        pickle.dump(tokens_description_lemm_mod1_test, file)

In [432]:
descs_tokens_rus_test = list(map(lambda desc: list(filter(lambda token: is_rus(token), desc)), tokens_description_lemm_mod1_test))
len(descs_tokens_rus_test)

9312

In [433]:
descs_tokens_full_test = [desc_to_tokens(d) for d in descs_tokens_rus_test]

In [434]:
oklads_full_test = [find_salary(descs_token) for descs_token in descs_tokens_full_test]

In [435]:
salary_min_desc_full_test = []
salary_max_desc_full_test = []
for ind, oklad in enumerate(oklads_full_test):
    salary_from = X_train.salary_from[ind]
    if(oklad[1]==salary_from):
        salary_min_desc_full_test.append(oklad[1])
        salary_max_desc_full_test.append(None)
    else:
        salary_min_desc_full_test.append(oklad[0])
        salary_max_desc_full_test.append(oklad[1])

In [436]:
X_test['salary_min_desc']=salary_min_desc_full_test
X_test['salary_max_desc']=salary_max_desc_full_test
X_test.loc[((X_test.salary_from>X_test.salary_max_desc) | (abs(X_test['salary_min_desc']-X_test.salary_from)>X_test.salary_from*0.1)),['salary_min_desc', 'salary_max_desc']] = None 

In [437]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9312 entries, 0 to 9311
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        9312 non-null   int64              
 1   name                      9312 non-null   object             
 2   has_test                  9312 non-null   bool               
 3   response_letter_required  9312 non-null   bool               
 4   salary_from               7924 non-null   float64            
 5   salary_currency           9312 non-null   object             
 6   salary_gross              9312 non-null   bool               
 7   published_at              9312 non-null   datetime64[ns, UTC]
 8   created_at                9312 non-null   datetime64[ns, UTC]
 9   employer_name             9312 non-null   object             
 10  description               9312 non-null   object             
 11  area_id          

In [438]:
agg_prof1_emp_test = X_test.groupby(['start_name1','employer_name'],sort=False).agg(
    salary_mean_prof1_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_prof2_emp_test = X_test.groupby(['start_name2','employer_name'],sort=False).agg(
    salary_mean_prof2_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_prof3_emp_test = X_test.groupby(['start_name3','employer_name'],sort=False).agg(
    salary_mean_prof3_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_emp_area_test = X_test.groupby(['employer_name','area_id'],sort=False).agg(
    salary_mean_emp_area = ('salary_from', lambda x: np.mean(x)),
)
agg_prof1_area_test = X_test.groupby(['start_name1','area_id'],sort=False).agg(
    salary_mean_prof1_area = ('salary_from', lambda x: np.mean(x)),
)
agg_emp_test = X_test.groupby(['employer_name',],sort=False).agg(
    salary_mean_emp = ('salary_from', lambda x: np.mean(x)),
)
agg_prof_test = X_test.groupby(['start_name1',],sort=False).agg(
    salary_mean_prof = ('salary_from', lambda x: np.mean(x)),
)
agg_area_test = X_test.groupby(['area_id',],sort=False).agg(
    salary_mean_area = ('salary_from', lambda x: np.mean(x)),
)
agg_all_test = X_test.salary_from.mean()

agg_prof12_test = X_test.groupby(['start_name1','start_name2',],sort=False).agg(
    salary_mean_prof12 = ('salary_from', lambda x: np.mean(x)),
)
agg_prof13_test = X_test.groupby(['start_name1','start_name3',],sort=False).agg(
    salary_mean_prof13 = ('salary_from', lambda x: np.mean(x)),
)
agg_prof23_test = X_test.groupby(['start_name2','start_name3',],sort=False).agg(
    salary_mean_prof23 = ('salary_from', lambda x: np.mean(x)),
)

agg_opt_emp_test = X_test.groupby(['optimum_title','employer_name'],sort=False).agg(
    salary_mean_opt_emp = ('salary_from', lambda x: np.mean(x)),
)

agg_opt_area_test = X_test.groupby(['optimum_title','area_id'],sort=False).agg(
    salary_mean_opt_area = ('salary_from', lambda x: np.mean(x)),
)

agg_opt_test = X_test.groupby(['optimum_title'],sort=False).agg(
    salary_mean_opt = ('salary_from', lambda x: np.mean(x)),
)
agg_opt_prof1_test = X_test.groupby(['optimum_title', 'start_name1'],sort=False).agg(
    salary_mean_opt_prof1 = ('salary_from', lambda x: np.mean(x)),
)

In [439]:
agg_opt_emp_test.reset_index(inplace=True)
agg_opt_area_test.reset_index(inplace=True)
agg_opt_test.reset_index(inplace=True)
agg_opt_prof1_test.reset_index(inplace=True)

agg_prof1_emp_test.reset_index(inplace=True)
agg_prof2_emp_test.reset_index(inplace=True)
agg_prof3_emp_test.reset_index(inplace=True)
agg_emp_area_test.reset_index(inplace=True)
agg_prof1_area_test.reset_index(inplace=True)
agg_emp_test.reset_index(inplace=True)
agg_prof_test.reset_index(inplace=True)
agg_area_test.reset_index(inplace=True)
agg_prof12_test.reset_index(inplace=True)
agg_prof13_test.reset_index(inplace=True)
agg_prof23_test.reset_index(inplace=True)

agg_opt_emp_test=agg_opt_emp_test.dropna()
agg_opt_area_test=agg_opt_area_test.dropna()
agg_opt_test=agg_opt_test.dropna()
agg_opt_prof1_test=agg_opt_prof1_test.dropna()

agg_prof1_emp_test = agg_prof1_emp_test.dropna()
agg_prof2_emp_test = agg_prof2_emp_test.dropna()
agg_prof3_emp_test = agg_prof3_emp_test.dropna()
agg_emp_area_test = agg_emp_area_test.dropna()
agg_prof1_area_test = agg_prof1_area_test.dropna()
agg_emp_test = agg_emp_test.dropna()
agg_prof_test = agg_prof_test.dropna()
agg_area_test = agg_area_test.dropna()
agg_prof12_test = agg_prof12_test.dropna()
agg_prof13_test= agg_prof13_test.dropna()
agg_prof23_test = agg_prof23_test.dropna()

In [440]:
X_test2 = X_test.copy()
without_salary_test = X_test2[X_test2.salary_from.isnull()]

In [441]:
without_salary_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1388 entries, 10 to 9310
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        1388 non-null   int64              
 1   name                      1388 non-null   object             
 2   has_test                  1388 non-null   bool               
 3   response_letter_required  1388 non-null   bool               
 4   salary_from               0 non-null      float64            
 5   salary_currency           1388 non-null   object             
 6   salary_gross              1388 non-null   bool               
 7   published_at              1388 non-null   datetime64[ns, UTC]
 8   created_at                1388 non-null   datetime64[ns, UTC]
 9   employer_name             1388 non-null   object             
 10  description               1388 non-null   object             
 11  area_id              

In [442]:
without_salary_test = without_salary_test.merge(agg_opt_emp_test, on=['optimum_title','employer_name'], how='left')
without_salary_test = without_salary_test.merge(agg_opt_area_test, on=['optimum_title','area_id'], how='left')
without_salary_test = without_salary_test.merge(agg_opt_test, on=['optimum_title'], how='left')
without_salary_test = without_salary_test.merge(agg_opt_prof1_test, on=['optimum_title', 'start_name1'], how='left')

without_salary_test = without_salary_test.merge(agg_prof12_test, on=['start_name1','start_name2'], how='left')
without_salary_test = without_salary_test.merge(agg_prof13_test, on=['start_name1','start_name3'], how='left')
without_salary_test= without_salary_test.merge(agg_prof23_test, on=['start_name2','start_name3'], how='left')

without_salary_test = without_salary_test.merge(agg_prof1_emp_test, on=['start_name1','employer_name'], how='left')
without_salary_test = without_salary_test.merge(agg_prof2_emp_test, on=['start_name2','employer_name'], how='left')
without_salary_test = without_salary_test.merge(agg_prof3_emp_test, on=['start_name3','employer_name'], how='left')

without_salary_test = without_salary_test.merge(agg_emp_area_test, on=['employer_name','area_id'], how='left')
without_salary_test = without_salary_test.merge(agg_prof1_area_test, on=['start_name1','area_id'], how='left')
without_salary_test = without_salary_test.merge(agg_emp_test, on=['employer_name',], how='left')
without_salary_test = without_salary_test.merge(agg_prof_test, on=['start_name1',], how='left')
without_salary_test = without_salary_test.merge(agg_area_test, on=['area_id',], how='left')

In [443]:
from tqdm import tqdm
median_emp_test=[]
median_prof_test = []
for i in tqdm(range(len(without_salary_test))):
    row_values_emp_test = without_salary_test.loc[i, ['salary_mean_prof1_emp','salary_mean_prof2_emp', 'salary_mean_prof3_emp','salary_mean_opt_emp']]
    row_values_prof_test = without_salary_test.loc[i, ['salary_mean_opt_prof1','salary_mean_opt_area', 'salary_mean_opt','salary_mean_prof12','salary_mean_prof13','salary_mean_prof23','salary_mean_emp_area','salary_mean_prof1_area','salary_mean_emp','salary_mean_prof']]
    median_emp_test.append(row_values_emp_test.mean())
    median_prof_test.append(row_values_prof_test.mean())

100%|██████████| 1388/1388 [00:01<00:00, 728.69it/s]


In [444]:
without_salary_test['median_emp'] = median_emp_test
without_salary_test['median_prof'] = median_prof_test
without_salary_test.loc[(without_salary_test.salary_from.isnull()),'salary_from'] = without_salary_test.loc[(without_salary_test.salary_from.isnull())].median_emp
without_salary_test.loc[(without_salary_test.salary_from.isnull()),'salary_from'] = without_salary_test.loc[(without_salary_test.salary_from.isnull())].median_prof
without_salary_test.loc[(without_salary_test.salary_from.isnull()),'salary_from'] = without_salary_test.loc[(without_salary_test.salary_from.isnull())].salary_mean_area
without_salary_test = without_salary_test.rename (columns= {'salary_from': 'salary_from_without'})

In [445]:
without_salary_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1388 entries, 0 to 1387
Data columns (total 40 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        1388 non-null   int64              
 1   name                      1388 non-null   object             
 2   has_test                  1388 non-null   bool               
 3   response_letter_required  1388 non-null   bool               
 4   salary_from_without       1388 non-null   float64            
 5   salary_currency           1388 non-null   object             
 6   salary_gross              1388 non-null   bool               
 7   published_at              1388 non-null   datetime64[ns, UTC]
 8   created_at                1388 non-null   datetime64[ns, UTC]
 9   employer_name             1388 non-null   object             
 10  description               1388 non-null   object             
 11  area_id          

In [446]:
X_test2 = X_test.copy()
X_test2 = X_test2.merge(agg_opt_emp_test, on=['optimum_title','employer_name'], how='left')
X_test2 =X_test2.merge(agg_opt_area_test, on=['optimum_title','area_id'], how='left')
X_test2 = X_test2.merge(agg_opt_test, on=['optimum_title'], how='left')
X_test2 =X_test2.merge(agg_opt_prof1_test, on=['optimum_title', 'start_name1'], how='left')

X_test2 = X_test2.merge(agg_prof12_test, on=['start_name1','start_name2'], how='left')
X_test2 = X_test2.merge(agg_prof13_test, on=['start_name1','start_name3'], how='left')
X_test2= X_test2.merge(agg_prof23_test, on=['start_name2','start_name3'], how='left')

X_test2 = X_test2.merge(agg_prof1_emp_test, on=['start_name1','employer_name'], how='left')
X_test2 = X_test2.merge(agg_prof2_emp_test, on=['start_name2','employer_name'], how='left')
X_test2 = X_test2.merge(agg_prof3_emp_test, on=['start_name3','employer_name'], how='left')

X_test2 = X_test2.merge(agg_emp_area_test, on=['employer_name','area_id'], how='left')
X_test2 = X_test2.merge(agg_prof1_area_test, on=['start_name1','area_id'], how='left')
X_test2 = X_test2.merge(agg_emp_test, on=['employer_name',], how='left')
X_test2 = X_test2.merge(agg_prof_test, on=['start_name1',], how='left')
X_test2 = X_test2.merge(agg_area_test, on=['area_id',], how='left')

In [447]:
X_test2= X_test2.merge(without_salary_test[['id','salary_from_without']], on='id', how='left')
X_test2.loc[(X_test2.salary_from.isnull()),'salary_from'] = X_test2.salary_from_without

In [448]:
X_test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9312 entries, 0 to 9311
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   id                        9312 non-null   int64              
 1   name                      9312 non-null   object             
 2   has_test                  9312 non-null   bool               
 3   response_letter_required  9312 non-null   bool               
 4   salary_from               9312 non-null   float64            
 5   salary_currency           9312 non-null   object             
 6   salary_gross              9312 non-null   bool               
 7   published_at              9312 non-null   datetime64[ns, UTC]
 8   created_at                9312 non-null   datetime64[ns, UTC]
 9   employer_name             9312 non-null   object             
 10  description               9312 non-null   object             
 11  area_id          

In [449]:
X_test = X_test2.drop(columns=["id", "description","salary_currency", 'published_at', 'created_at', 'salary_from_without'])

In [450]:
X_test[X_test.loc[:, X_test.dtypes == object].columns] = X_test.loc[:, X_test.dtypes == object].astype('category')

In [451]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9312 entries, 0 to 9311
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   name                      9312 non-null   category
 1   has_test                  9312 non-null   bool    
 2   response_letter_required  9312 non-null   bool    
 3   salary_from               9312 non-null   float64 
 4   salary_gross              9312 non-null   bool    
 5   employer_name             9312 non-null   category
 6   area_id                   9312 non-null   int64   
 7   area_name                 9312 non-null   category
 8   timestamp                 9312 non-null   float64 
 9   start_name1               9312 non-null   category
 10  start_name2               9312 non-null   category
 11  start_name3               9312 non-null   category
 12  optimum_title             9312 non-null   category
 13  exp                       9312 non-null   float6

In [452]:
y_pred_test = cat.predict(X_test)

In [453]:
qw_test = pd.DataFrame({ 'X_from': X_test.salary_from, 'y_pred': y_pred_test,'salary_max_desc':X_test.salary_max_desc, 'y_pred_copy': y_pred_test  })

In [454]:
qw_test.loc[~qw_test.salary_max_desc.isnull(),'y_pred'] = qw_test.loc[~qw_test.salary_max_desc.isnull()].salary_max_desc
qw_test.loc[~qw_test.salary_max_desc.isnull()]

Unnamed: 0,X_from,y_pred,salary_max_desc,y_pred_copy
2,200000.0,200000.0,200000.0,306619.099444
30,80000.0,100000.0,100000.0,139095.500540
33,40000.0,55000.0,55000.0,60825.013931
34,30000.0,40000.0,40000.0,41257.974703
37,50000.0,60000.0,60000.0,53688.364632
...,...,...,...,...
9270,80000.0,80000.0,80000.0,121984.938074
9294,190000.0,340000.0,340000.0,279723.865027
9295,45000.0,75000.0,75000.0,57908.295417
9305,60000.0,140000.0,140000.0,114992.245519


In [455]:
qw_test.loc[((~qw_test.salary_max_desc.isnull()) &(qw_test.y_pred>qw_test.X_from*2)  )]

Unnamed: 0,X_from,y_pred,salary_max_desc,y_pred_copy
132,84000.000000,180000.0,180000.0,170807.751285
166,15000.000000,40000.0,40000.0,32187.595073
213,95000.000000,350000.0,350000.0,257625.069863
250,21000.000000,45000.0,45000.0,39451.934040
259,55000.000000,150000.0,150000.0,117236.694303
...,...,...,...,...
8976,30000.000000,100000.0,100000.0,91322.635511
9064,32200.000000,82000.0,82000.0,45866.102191
9177,55195.569284,238000.0,238000.0,154551.854336
9195,25000.000000,80000.0,80000.0,72265.713587


In [456]:
qw_test.loc[((~qw_test.salary_max_desc.isnull()) &(qw_test.y_pred>qw_test.X_from*2)  ), 'y_pred'] = qw_test.loc[((~qw_test.salary_max_desc.isnull()) &(qw_test.y_pred>qw_test.X_from*2))].y_pred_copy

In [457]:
qw_test.loc[((~qw_test.salary_max_desc.isnull()) &(qw_test.y_pred>qw_test.X_from*2)  )]

Unnamed: 0,X_from,y_pred,salary_max_desc,y_pred_copy
132,84000.000000,170807.751285,180000.0,170807.751285
166,15000.000000,32187.595073,40000.0,32187.595073
213,95000.000000,257625.069863,350000.0,257625.069863
259,55000.000000,117236.694303,150000.0,117236.694303
261,40000.000000,110607.241944,120000.0,110607.241944
...,...,...,...,...
8960,60000.000000,127663.875696,140000.0,127663.875696
8969,70000.000000,193580.217705,235000.0,193580.217705
8976,30000.000000,91322.635511,100000.0,91322.635511
9177,55195.569284,154551.854336,238000.0,154551.854336


In [458]:
qw_test['difff'] = abs(qw_test.salary_max_desc-qw_test.y_pred_copy)
qw_test[((~qw_test.salary_max_desc.isnull())&(qw_test.X_from>qw_test.y_pred )  )].sort_values(by = 'difff').tail(30)

Unnamed: 0,X_from,y_pred,salary_max_desc,y_pred_copy,difff
1737,53257.592913,30000.0,30000.0,90609.852259,60609.852259
1416,133224.153417,100000.0,100000.0,161003.448527,61003.448527
8701,31300.0,24000.0,24000.0,85597.17067,61597.17067
2577,31300.0,24000.0,24000.0,88269.746776,64269.746776
6006,68732.171363,60000.0,60000.0,125716.71351,65716.71351
4624,67993.31605,20000.0,20000.0,89771.639249,69771.639249
7583,91801.844129,40000.0,40000.0,114290.803321,74290.803321
175,101542.553721,60000.0,60000.0,137404.895996,77404.895996
8826,51834.501212,10000.0,10000.0,89967.84469,79967.84469
8075,250000.0,200000.0,200000.0,282997.230938,82997.230938


In [459]:
qw_test.loc[((~qw_test.salary_max_desc.isnull()) &(qw_test.X_from>qw_test.y_pred )& (qw_test.difff>40000)), 'y_pred'] = qw_test.loc[((~qw_test.salary_max_desc.isnull())  &(qw_test.X_from>qw_test.y_pred )& (qw_test.difff>40000))].y_pred_copy

In [460]:
qw_test[((~qw_test.salary_max_desc.isnull()) )].sort_values(by = 'difff').tail(20)

Unnamed: 0,X_from,y_pred,salary_max_desc,y_pred_copy,difff
3877,101700.0,160000.0,160000.0,310677.055458,150677.055458
7822,42838.113455,98726.272158,250000.0,98726.272158,151273.727842
6629,40.0,193176.853203,40000.0,193176.853203,153176.853203
7441,50000.0,146445.240776,300000.0,146445.240776,153554.759224
7243,120000.0,150000.0,150000.0,303641.326796,153641.326796
376,100000.0,100000.0,100000.0,264143.437664,164143.437664
424,114321.334894,205486.522504,25000.0,205486.522504,180486.522504
138,122638.147272,208578.749073,20000.0,208578.749073,188578.749073
1520,130000.0,219732.010009,20000.0,219732.010009,199732.010009
6957,43785.745238,96690.32094,300000.0,96690.32094,203309.67906


In [461]:
qw_test.loc[qw_test.X_from<150]

Unnamed: 0,X_from,y_pred,salary_max_desc,y_pred_copy,difff
466,40.0,98957.23811,,98957.23811,
989,79.1,96536.114655,,96536.114655,
1393,50.0,48319.470532,,48319.470532,
1828,35.0,106984.01026,,106984.01026,
2509,67.8,21803.443379,,21803.443379,
2650,20.0,178641.298801,,178641.298801,
3330,1.0,21959.973799,20000.0,21959.973799,1959.973799
3524,1.13,479102.522752,,479102.522752,
4079,100.0,99970.784607,,99970.784607,
4749,67.8,868.840616,,868.840616,


In [462]:
qw_test.loc[qw_test.X_from<150,'y_pred']  = qw_test.loc[qw_test.X_from<=150].X_from*1.2
y_pred_test=qw_test.y_pred

In [463]:
qw_test.tail(10)

Unnamed: 0,X_from,y_pred,salary_max_desc,y_pred_copy,difff
9302,10000.0,43782.375449,,43782.375449,
9303,67800.0,74677.59609,,74677.59609,
9304,55000.0,106615.123141,,106615.123141,
9305,60000.0,114992.245519,140000.0,114992.245519,25007.754481
9306,60000.0,80000.0,80000.0,128472.344363,48472.344363
9307,40000.0,85806.532278,,85806.532278,
9308,40000.0,97580.807733,,97580.807733,
9309,39550.0,39288.070943,,39288.070943,
9310,52112.2871,68927.627835,,68927.627835,
9311,30000.0,31000.642144,,31000.642144,


In [464]:
df_sub = pd.read_csv('/kaggle/input/nsu-jobs/random_result.csv')

In [465]:
df_sub.salary_to=y_pred_test
df_sub

Unnamed: 0,id,salary_to
0,20978,40576.952958
1,3102,61898.300145
2,14731,200000.000000
3,16113,212027.458584
4,8410,138294.074065
...,...,...
9307,29490,85806.532278
9308,34298,97580.807733
9309,33921,39288.070943
9310,8886,68927.627835


In [466]:
#чуть улучшил скор общей коррректировкой значений
df_sub.loc[df_sub.salary_to>80000,'salary_to' ]=df_sub[df_sub.salary_to>80000].salary_to*0.925
df_sub.loc[df_sub.salary_to>250000,'salary_to' ]=df_sub[df_sub.salary_to>250000].salary_to*0.975
df_sub.loc[df_sub.salary_to>500000,'salary_to' ]=df_sub[df_sub.salary_to>500000].salary_to*0.95
df_sub.loc[df_sub.salary_to<10,'salary_to' ]=df_sub[df_sub.salary_to<10].salary_to*0.83333
df_sub

Unnamed: 0,id,salary_to
0,20978,40576.952958
1,3102,61898.300145
2,14731,185000.000000
3,16113,196125.399190
4,8410,127922.018510
...,...,...
9307,29490,79371.042358
9308,34298,90262.247153
9309,33921,39288.070943
9310,8886,68927.627835


In [467]:
df_sub.to_csv('submission111.csv', index=False)