In [8]:
import pandas as pd
from tqdm import tqdm
import string
import re 
import numpy as np

import spacy
from spacy.tokens import DocBin
from spacy.lang.ru.stop_words import STOP_WORDS
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import warnings
warnings.simplefilter("ignore")

# Предобработка текста

In [9]:
nlp = spacy.load('ru_core_news_lg')

stopwords_nltk = stopwords.words('russian')
stopwords_spacy = STOP_WORDS

In [10]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [11]:
def remove_url(text): 
    url_pattern  = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub(r'', text)

In [12]:
def clean_text(text):
    doc = nlp(text)
    return (" ".join(token.text for token in doc if not token.is_punct)).lower()

In [13]:
def lemma(text):
    doc = nlp(text)
    return " ".join([token.lemma_.lower().strip() for token in doc if token.lemma_ != '-PRON-'])

In [14]:
def stop_words(text):
    doc = nlp(text)
    return ' '.join([token.text for token in doc if token.text not in stopwords_spacy])

In [15]:
def clean_space(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.replace('  ', ' ').replace('   ', ' ')
    return text

# Классификатор (вакансия/невакансия)

## Spacy classifier

In [9]:
# import json
# with open('./all.jsonl', 'r', encoding='utf-8') as f:
#     json_list = list(f)

In [10]:
# main = pd.DataFrame(columns=['data', 'salary'])

In [11]:
# for json_str in json_list:
#     result = json.loads(json_str)
#     df = pd.DataFrame(result)
#     if len(df.label) != 0:
#         df['salary'] = df.data[0][df.label[0][0]:df.label[0][1]]
#     else:
#         df['salary'] = None
#     df = df[['data', 'salary']]
#     main = main.append(df)


In [12]:
# main.columns = ['text', 'salary']
# main

In [13]:
# non_vacancy = pd.read_csv('./non_vacancy.csv')
# vacancy = pd.read_csv('./vacancy.csv')

In [14]:
# it = pd.read_csv('./df_it_short2.csv')
# it.columns = ['text', 'IT'] 

In [15]:
# df = pd.concat([non_vacancy, vacancy]).reset_index(drop=True)
# df = df.drop_duplicates()
# df.shape

In [16]:
# df = df.merge(it, how='left', on='text').drop_duplicates()
# df.shape

In [17]:
# df['vacancy'] =  df.label
# df.loc[df.vacancy != 0, 'vacancy'] = 1

In [18]:
# df  = df.merge(main, how='left', on='text').drop_duplicates()

In [19]:
# df = df.reset_index(drop=True)[['text', 'vacancy', 'IT', 'label', 'salary']]
# df.loc[df.label == 0, 'label'] = np.nan
# df.loc[df.vacancy == 0, ['IT', 'label', 'salary']] = np.nan
# df

In [20]:
# df.to_csv('main.csv', index=False)

In [21]:
df = pd.read_csv('./main.csv')
df.sample(5)

Unnamed: 0,text,vacancy,IT,label,salary
2285,#vacancy #kharkiv #recruiter #fulltime #ваканс...,1,,HR-специалист,
5825,#vacancy #java #techlead #senior #middle #remo...,1,,Java-разработчик,
5878,• Senior Backend Developer (Python-разработчик...,1,,Backend-разработчик,
5163,#php #москва #backend\n\nBackend-разработчик (...,1,,Backend-разработчик,
5474,#вакансия #mobile #developer #senior #middle #...,1,,Android-разработчик,


In [22]:
df.text = df.text.apply(clean_text).apply(clean_space).apply(lemma).apply(stop_words)
df = df[df.text != '']

### DocBin

In [26]:
nlp = spacy.load('ru_core_news_lg')

In [27]:
train, test = train_test_split(df, test_size=0.25, random_state=25)

train_data = []
test_data = []

for i in range(len(train)):
    train_data.append((train.iloc[i, 0], train.iloc[i, 1]))

for i in range(len(test)):
    test_data.append((test.iloc[i, 0], test.iloc[i, 1]))

In [28]:
def make_docs(data):
    docs = []
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
        doc.cats['vacancy'] = label
        docs.append(doc)
    return docs

In [29]:
train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk('./class_text/train.spacy')

test_docs = make_docs(test_data)
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk('./class_text/test.spacy')


100%|██████████| 4957/4957 [01:51<00:00, 44.48it/s]
100%|██████████| 1653/1653 [00:40<00:00, 40.59it/s]


### Проверка качества модели

In [33]:
nlp = spacy.load('./class_text/model-best')
test.reset_index(drop=True, inplace=True)

for i in tqdm(range(len(test))):
    doc = nlp(test.iloc[i, 0])
    test.loc[i, 'predict'] = doc.cats['vacancy']

test.predict = test.predict.round()
test.predict = test.predict.astype('int32')

100%|██████████| 1653/1653 [00:01<00:00, 878.28it/s] 


In [40]:
print(metrics.classification_report(test.predict, test.vacancy))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       547
           1       0.98      0.96      0.97      1106

    accuracy                           0.96      1653
   macro avg       0.95      0.96      0.95      1653
weighted avg       0.96      0.96      0.96      1653



In [43]:
(test.vacancy != test.predict).sum()

68

## SGDClassifier 

In [51]:
df = pd.read_csv('./main.csv')

In [52]:
df.vacancy.value_counts()

1    4388
0    2242
Name: vacancy, dtype: int64

In [53]:
nlp = spacy.load('ru_core_news_lg')

stopwords_nltk = stopwords.words('russian')
stopwords_spacy = STOP_WORDS

In [None]:
df.text = df.text.apply(remove_emoji).apply(clean_text).apply(clean_space).apply(lemma).apply(stop_words)
df = df[df.text != '']

In [40]:
train, test = train_test_split(df, test_size=0.25, random_state=25)

In [55]:
sgd_ppl_clf = Pipeline([('tfidf', TfidfVectorizer()),
                        ('sgd_clf', SGDClassifier(random_state=42))])

sgd_ppl_clf.fit(train.text, train.vacancy)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('sgd_clf', SGDClassifier(random_state=42))])

In [56]:
predicted_sgd = sgd_ppl_clf.predict(test.text)
print(metrics.classification_report(predicted_sgd, test.vacancy))

              precision    recall  f1-score   support

           0       0.91      0.96      0.94       536
           1       0.98      0.95      0.97      1117

    accuracy                           0.96      1653
   macro avg       0.95      0.96      0.95      1653
weighted avg       0.96      0.96      0.96      1653



In [59]:
test['predict'] = predicted_sgd
(test.vacancy != test.predict).sum()

71

# IT / Не IT

In [37]:
df = pd.read_csv('./main.csv')
df = df[df.IT.notna()]

In [38]:
df.text = df.text.apply(clean_text).apply(clean_space).apply(lemma).apply(stop_words)
df = df[df.text != '']

## Spacy classifier

In [39]:
nlp = spacy.load('ru_core_news_lg')

In [40]:
df.IT = df.IT.astype('int')
df.sample(3)

Unnamed: 0,text,vacancy,IT,label,salary
3674,∞ продакт менеджер команда бизнес инкубатор ...,1,1,Product-менеджер,
2354,вакансия qa manual remote fulltime middle regu...,1,1,QA-Инженер,
4280,smm vacancy kharkiv 🖍 the chi software is ...,1,0,non-IT,


In [41]:
train, test = train_test_split(df, test_size=0.25, random_state=25)

train_data = []
test_data = []

for i in range(len(train)):
    train_data.append((train.iloc[i, 0], train.iloc[i, 2]))

for i in range(len(test)):
    test_data.append((test.iloc[i, 0], test.iloc[i, 2]))

In [42]:
def make_docs(data):
    docs = []
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
        doc.cats['IT'] = label
        docs.append(doc)
    return docs

In [43]:
train_docs = make_docs(train_data)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk('./it/train.spacy')

test_docs = make_docs(test_data)
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk('./it/test.spacy')


100%|██████████| 753/753 [00:21<00:00, 35.49it/s]
100%|██████████| 252/252 [00:06<00:00, 36.84it/s]


### Проверка качества модели

In [59]:
nlp = spacy.load('./it/model-best')
test.reset_index(drop=True, inplace=True)

for i in tqdm(range(len(test))):
    doc = nlp(test.iloc[i, 0])
    test.loc[i, 'predict'] = doc.cats['IT']

test.predict = test.predict.round()
test.predict = test.predict.astype('int32')

100%|██████████| 252/252 [00:00<00:00, 563.01it/s]


In [60]:
print(metrics.classification_report(test.predict, test.IT))

              precision    recall  f1-score   support

           0       0.89      0.81      0.85        59
           1       0.94      0.97      0.96       193

    accuracy                           0.93       252
   macro avg       0.92      0.89      0.90       252
weighted avg       0.93      0.93      0.93       252



In [62]:
(test.IT != test.predict).sum()

17

## SGDClassifier 

In [64]:
sgd_ppl_clf = Pipeline([('tfidf', TfidfVectorizer()),
                        ('sgd_clf', SGDClassifier(random_state=42))])

sgd_ppl_clf.fit(train.text, train.IT)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('sgd_clf', SGDClassifier(random_state=42))])

In [67]:
predicted_sgd = sgd_ppl_clf.predict(test.text)
print(metrics.classification_report(predicted_sgd, test.IT))

              precision    recall  f1-score   support

           0       0.87      0.81      0.84        58
           1       0.94      0.96      0.95       194

    accuracy                           0.93       252
   macro avg       0.91      0.89      0.90       252
weighted avg       0.93      0.93      0.93       252



In [70]:
test['predict'] = predicted_sgd
(test.IT != test.predict).sum()

18

# Классификация названиий вакансий

In [71]:
nlp = spacy.load('ru_core_news_lg')

stopwords_nltk = stopwords.words('russian')
stopwords_spacy = STOP_WORDS

In [81]:
df = pd.read_csv('./main.csv')
df = df[df.label.notna()]
df = df[df.label != 'non-IT']

In [82]:
unknown = df.label.value_counts()[df.label.value_counts() < 70].index
df.loc[df.label.isin(unknown), 'label'] = 'unknown'

In [83]:
# mask = df.label.value_counts()[df.label.value_counts() > 100].index
# df = df[df.label.isin(mask)]
# df.reset_index(drop=True, inplace=True)

In [84]:
df.label.value_counts()

unknown                 1064
Java-разработчик         346
iOS-разработчик          238
QA-Инженер               236
Python-разработчик       230
Frontend-разработчик     227
Android-разработчик      224
Product-менеджер         210
1C-разработчик           194
PHP-разработчик          183
Project-менеджер         145
Ruby-разработчик         125
Backend-разработчик      116
Несколько вакансий       100
UX/UI-дизайнер            72
Name: label, dtype: int64

In [85]:
encoder = LabelEncoder()
df.label = encoder.fit_transform(df.label)

In [86]:
df.text = df.text.apply(remove_emoji).apply(clean_text).apply(clean_space).apply(lemma).apply(stop_words)
df = df[df.text != '']

In [87]:
train, test = train_test_split(df, test_size=0.25, random_state=25)

## LogisticRegression

In [88]:
from sklearn.linear_model import LogisticRegression

In [89]:
log_ppl_clf = Pipeline([('tfidf', TfidfVectorizer()),
                        ('log_clf', LogisticRegression(random_state=42, class_weight='balanced'))])

In [90]:
log_ppl_clf.fit(train.text, train.label)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('log_clf',
                 LogisticRegression(class_weight='balanced', random_state=42))])

In [91]:
predicted_log = log_ppl_clf.predict(test.text)
print(metrics.classification_report(predicted_log, test.label))

              precision    recall  f1-score   support

           0       0.95      0.75      0.84        53
           1       0.95      0.91      0.93        43
           2       0.45      0.50      0.47        26
           3       0.93      0.60      0.73        83
           4       0.94      0.89      0.91       106
           5       0.90      0.78      0.84        49
           6       0.88      0.78      0.82        54
           7       0.91      0.69      0.78        45
           8       0.95      0.78      0.86        92
           9       0.93      0.96      0.95        54
          10       0.97      0.82      0.89        44
          11       1.00      0.75      0.86        28
          12       0.98      0.88      0.93        74
          13       0.55      0.87      0.67       167
          14       0.31      0.50      0.38        10

    accuracy                           0.80       928
   macro avg       0.84      0.76      0.79       928
weighted avg       0.85   

In [39]:
parameters = { 
              'log_clf__solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              'log_clf__class_weight':[None, 'balanced'],
              'log_clf__penalty':[None, 'l2', 'l1', 'elasticnet'],
              'tfidf__strip_accents':['ascii', 'unicode', None],
              'tfidf__ngram_range':[(1,2), (2,3), (3,4)]
              }

In [40]:
model = GridSearchCV(log_ppl_clf, parameters, cv=4, n_jobs=-1).fit(train.text, train.label)
print('Best score and parameter combination:')
print(model.best_score_, model.best_params_)

In [None]:
log_ppl_clf = Pipeline([('tfidf', TfidfVectorizer()),
                        ('log_clf', LogisticRegression(random_state=42))])

## SGDClassifier

In [92]:
sgd_ppl_clf = Pipeline([('tfidf', TfidfVectorizer()),
                        ('sgd_clf', SGDClassifier(random_state=42))]) 
                
sgd_ppl_clf.fit(train.text, train.label)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('sgd_clf', SGDClassifier(random_state=42))])

In [93]:
predicted_sgd = sgd_ppl_clf.predict(test.text)
print(metrics.classification_report(predicted_sgd, test.label))

              precision    recall  f1-score   support

           0       0.93      0.81      0.87        48
           1       0.95      0.91      0.93        43
           2       0.21      0.55      0.30        11
           3       0.85      0.71      0.77        65
           4       0.90      0.90      0.90       100
           5       0.81      0.87      0.84        39
           6       0.79      0.86      0.83        44
           7       0.82      0.76      0.79        37
           8       0.91      0.87      0.89        79
           9       0.96      0.98      0.97        55
          10       0.97      0.90      0.94        40
          11       0.95      0.80      0.87        25
          12       0.97      0.93      0.95        69
          13       0.78      0.78      0.78       266
          14       0.25      0.57      0.35         7

    accuracy                           0.84       928
   macro avg       0.80      0.81      0.80       928
weighted avg       0.86   

In [94]:
test['predict'] = predicted_sgd
(test.label != test.predict).sum()

153

In [None]:
parameters = { 
              'sgd_clf__loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
              'sgd_clf__class_weight':[None, 'balanced'],
              'sgd_clf__penalty':[None, 'l2', 'l1', 'elasticnet'],
              'tfidf__strip_accents':['ascii', 'unicode', None],
              'tfidf__ngram_range':[(1,2), (2,3), (3,4)]
              } 
              
model = GridSearchCV(sgd_ppl_clf, parameters, cv=4, n_jobs=-1).fit(train.text, train.label)
print('Best score and parameter combination:')
print(model.best_score_, model.best_params_)

In [25]:
sgd_ppl_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 2), strip_accents='ascii')),
                        ('sgd_clf', SGDClassifier(random_state=42, class_weight='balanced', loss='hinge', penalty='l1'))])

sgd_ppl_clf.fit(train.text, train.label)


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(ngram_range=(1, 2), strip_accents='ascii')),
                ('sgd_clf',
                 SGDClassifier(class_weight='balanced', penalty='l1',
                               random_state=42))])

In [26]:
predicted_sgd = sgd_ppl_clf.predict(test.text)
print(metrics.classification_report(predicted_sgd, test.label))

              precision    recall  f1-score   support

           0       0.83      0.65      0.73        66
           1       1.00      0.97      0.98        59
           2       0.64      0.49      0.55        37
           3       0.89      0.70      0.79        71
           4       0.53      0.56      0.54        18
           5       0.99      0.90      0.94        96
           6       0.91      0.82      0.86        49
           7       0.84      0.73      0.78        56
           8       0.70      0.62      0.66        42
           9       0.92      0.76      0.83        79
          10       0.96      0.93      0.95        59
          11       1.00      0.93      0.96        41
          12       0.94      0.85      0.89        20
          13       1.00      0.95      0.98        64
          14       0.58      0.86      0.70       169
          15       0.22      0.38      0.28        13

    accuracy                           0.80       939
   macro avg       0.81   

In [27]:
test['predict'] = predicted_sgd
(test.label != test.predict).sum()

187

In [95]:
test[test.label != test.predict].label.value_counts()

13    58
2     23
14    12
6     10
4     10
5      8
3      8
8      7
7      6
0      3
1      2
12     2
9      2
11     1
10     1
Name: label, dtype: int64

In [97]:
encoder.inverse_transform([13, 14, 6, 2])


array(['unknown', 'Несколько вакансий', 'Product-менеджер',
       'Backend-разработчик'], dtype=object)