# Краткое описание

Данный ноутбук предназначен для того, чтобы проверять соответствие резюме заданным вакансиям

Система осуществляет подбор кандидату той вакансии, где он сможет наиболее полно раскрыть свой потенциал

Система использует взвешенную оценку семантичности текста для наиболее точного анализа

# Resume-vacancy analysis

In [169]:
import time
import pandas as pd
import json
import random as rd
import numpy as np
from sklearn.metrics import accuracy_score
import re 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [170]:
#!pip install dostoevsky
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel



In [171]:
!python -m dostoevsky download fasttext-social-network-model

## Analysis

In [172]:
with open('drive/MyDrive/LD final/data_file_it.json') as f:
    data = json.load(f)

In [173]:
vacs = pd.read_csv('drive/MyDrive/LD final/vacancy_all_it.csv', index_col=0)
vacs = vacs['description']

In [174]:
res_list = []
for i in range(len(data)):
  resume = re.sub(r"[,\'{}\\\[\]]", "", str(data[i])).replace('xa0',' ')
  res_list.append(resume)

## Sentiment analysis for each resume and vacancy

In [175]:
res_list_for_sentiment = []
for res in data:
  text = res['skills']
  for i in range(len(res['experience'])):
    text = text + ' ' +res['experience'][i]['description']
  res_list_for_sentiment.append(text)

Emotion scoring

In [176]:
tokenizer = RegexTokenizer()
tokens = tokenizer.split('всё очень плохо')  # [('всё', None), ('очень', None), ('плохо', None)]

model = FastTextSocialNetworkModel(tokenizer=tokenizer)

results = model.predict(res_list_for_sentiment, k=2)
res_values = []
for res, sentiment in zip(res_list_for_sentiment, results):
    res_values.append([res_list_for_sentiment.index(res),sentiment])




In [177]:
for i in range(len(res_values)):  
  res_values[i][1] = 1 - res_values[i][1]['neutral']

In [178]:
res_values[0] #emotion score for data[0] resume

[0, 0.2942049503326416]

### Matching resumes

Some functions

emo_weight - это тот вес, который HR задаёт для оценки эмоциональной окраски текста.
Чем выше оценка, тем более "софтскильным" является кандидат

Там, где требуется искать более общительных кандидатов, стоит задавать большой вес для оценки семантичности текста

In [179]:
def index(textcv, textjd):
  '''
  Resume to vacancy score
  '''

  documents = [textjd, textcv]
  count_vectorizer = CountVectorizer()
  sparse_matrix = count_vectorizer.fit_transform(documents)
  doc_term_matrix = sparse_matrix.todense()
  df = pd.DataFrame(doc_term_matrix, 
              columns=count_vectorizer.get_feature_names(), 
              index=['textjd', 'textcv'])
  answer = cosine_similarity(df, df)
  answer = pd.DataFrame(answer)
  answer = answer.iloc[[1],[0]].values[0]
  answer = round(float(answer),4)*100

  return answer

In [180]:
def best_resume_vacancy_score(emo_weight):
  '''
  Searching vest resume for each vacancy
  '''

  best_match = []
  for i in range(len(vacs[:15])):
    vac = vacs[i]
    max_index = 0
    for k in range(len(res_list)):
      resume = res_list[k]
      index_temp = index(resume, vac)*(emo_weight*(1+res_values[k][1]))
      if index_temp > max_index:
        max_index = index_temp
        best_resume_number = k
    best_match.append([i, best_resume_number, max_index])

  return best_match

In [181]:
def all_resumes_to_vacancies(emo_weight):
  '''
  Function that creates dataframe with resume scores to all vacancies
  '''
  scores = []
  for i in range(len(res_list)):
    resume = re.sub(r"[,\'{}\\\[\]]", "", str(data[i])).replace('xa0',' ')
    indexes_temp = []
    for k in range(len(vacs[:20])):
      vac = vacs[k]
      index_temp = index(resume, vac)*(emo_weight*(1+res_values[k][1]))
      indexes_temp.append(index_temp)
    scores.append(indexes_temp)
  
  return pd.DataFrame(scores)

In [182]:
best_resume_vacancy_score(1.125)

[[0, 46, 65.20722660563887],
 [1, 46, 19.195639913901687],
 [2, 46, 67.72801252640784],
 [3, 327, 29.50002345710993],
 [4, 11, 37.099665012359615],
 [5, 11, 38.370571904182434],
 [6, 361, 32.03245085969567],
 [7, 11, 30.02855539083481],
 [8, 253, 25.988436812460424],
 [9, 28, 46.03403934270143],
 [10, 156, 37.53016975164414],
 [11, 11, 42.64298230648041],
 [12, 11, 38.370571904182434],
 [13, 28, 52.02560047388077],
 [14, 156, 38.57963610112667]]

In [183]:
df = all_resumes_to_vacancies(1.125)
df = df.transpose()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389
0,10.191864,17.151451,18.592872,11.516806,18.549192,3.494353,15.71003,16.059466,29.498166,8.808682,5.736563,30.386314,11.385768,8.692204,6.872228,16.263303,21.69411,0.0,5.023133,13.089265,13.48238,12.987347,22.247383,7.614778,11.065452,2.635325,4.120425,14.851002,23.805282,3.916588,6.508233,7.032386,5.358008,5.678324,8.415568,7.571099,15.069399,6.173358,3.188597,6.624712,...,10.774256,23.470407,13.293103,8.299089,6.02776,9.609472,8.109812,13.773576,13.817256,11.502246,7.48374,20.383728,7.556539,17.26793,56.637644,0.0,13.540619,20.034293,5.503607,11.109132,12.201117,5.24153,31.784056,22.422101,29.585525,15.447954,11.99728,10.017146,5.416248,42.325355,16.903934,3.334196,2.446047,14.355968,30.269836,10.089945,6.464554,9.041639,25.683497,14.705404
1,7.330345,7.029097,9.883772,3.141576,8.736164,2.725568,17.443638,5.752384,8.635748,9.783356,0.0,11.275246,4.44698,4.375255,5.1212,5.809764,15.234493,0.286902,1.707067,6.412258,6.61309,8.879615,8.148015,6.928682,4.289184,2.696878,1.692721,9.553835,13.929089,1.534925,11.619528,9.611215,3.873176,3.873176,4.47567,6.899992,6.139701,2.295215,2.180455,7.918493,...,3.500204,11.275246,9.080446,5.293341,4.934713,6.125356,3.68669,7.531176,2.696878,6.426603,2.912055,15.420979,4.131388,8.492297,15.033662,0.086071,11.08876,9.453419,1.649686,7.588556,4.44698,3.571929,13.828673,10.38585,7.172548,8.635748,4.102698,5.766729,8.320156,12.408509,2.95509,1.133263,2.668188,4.088353,15.650501,5.393756,4.131388,3.62931,7.43076,9.223897
2,10.739697,17.457413,17.774559,9.096307,18.538591,3.676004,16.030259,16.909617,24.045389,8.707083,7.553827,27.620482,11.128921,8.663836,7.236682,11.10009,22.53174,0.0,7.049278,12.714648,14.199465,12.498412,22.185763,6.674469,11.402819,3.459768,4.339126,14.257128,17.183515,4.713934,6.847458,6.414987,7.30876,5.9681,8.865656,9.298127,12.829974,7.092525,3.358858,6.977199,...,11.345156,24.708511,12.829974,8.735915,10.148653,12.13802,7.582659,13.060625,15.064407,8.807993,7.121356,19.518859,8.490848,15.641035,57.749297,0.0,12.829974,19.533274,5.506798,11.330741,13.709331,6.890705,31.700126,21.349653,28.643997,14.790509,11.691133,12.772311,6.414987,46.432972,17.342088,3.517431,3.099376,12.599322,26.712293,9.240464,6.818626,10.076575,26.351901,14.228297
3,9.364929,13.303331,14.239824,13.0211,18.396313,3.219996,13.008272,12.418153,22.013998,12.097436,5.567643,20.833761,9.724132,12.418153,8.479751,8.364293,19.473922,2.257846,8.646524,8.890269,14.08588,14.791457,14.38094,9.429073,7.581744,3.399598,6.65808,14.111537,23.951128,5.798559,8.082062,10.90437,7.748517,6.414335,13.303331,6.529793,14.919744,6.17059,4.118003,6.965968,...,12.597754,21.090334,13.957593,11.802377,12.071779,9.108356,10.185964,15.137831,3.194339,12.16158,8.980069,21.770254,11.391859,15.394405,18.370656,3.438084,16.908188,16.779901,7.453458,15.471377,16.561814,9.300786,12.905642,14.611856,8.633695,11.353373,7.953776,10.22445,6.555451,12.456639,9.801104,3.771629,3.797286,19.717666,7.928118,10.493852,6.132104,8.582381,16.035838,14.894086
4,11.427472,19.062099,13.715413,17.593902,17.777426,4.159893,14.559626,16.088999,13.666473,9.690104,6.643594,33.572786,9.4087,8.894831,7.524513,10.779018,19.257859,0.0,5.811616,13.360598,15.69748,16.480518,14.718681,5.872791,7.084054,1.823012,2.667226,21.851675,9.445405,4.147658,7.879327,4.771642,5.554681,5.909495,12.1371,9.934804,8.907066,6.23984,2.214531,5.371156,...,11.403002,17.826366,13.849998,7.683568,3.902959,10.081624,7.084054,16.162409,1.823012,10.167269,8.246377,18.560465,9.800219,17.593902,20.762761,0.110115,14.620801,18.095536,6.582419,8.356492,11.305122,6.668064,18.242356,23.283167,8.845891,10.411968,8.894831,10.252913,8.148497,13.947877,16.859803,2.312411,4.991872,7.206403,2.275706,11.378532,7.989442,7.879327,16.125704,7.732508


In [184]:
#df.to_csv('proba_to_vacancy.csv')

### Гипотеза: специалисты по машинному обучению востребованы больше остальных

In [185]:
ml_ids = []
for i in range(len(res_list)):  
  if ('машинное обучение' or ' ml ' or 'машинному обучению' or 'data science' or 'machine learning') in res_list[i].lower():
    ml_ids.append(i)

In [186]:
ml_score = []
other_score = []
for col in df.columns:
  if col in ml_ids:
    temp = df[col].mean()
    ml_score.append(temp)
  else:
    temp = df[col].mean()
    other_score.append(temp)

In [187]:
print('Средний коэф ML специалистов:', sum(ml_score)/len(ml_score))

Средний коэф ML специалистов: 16.1303584691151


In [188]:
print('Средний коэф других специалистов:', sum(other_score)/len(other_score))

Средний коэф других специалистов: 11.373484977933638


### Гипотеза: успешнее ли резюме наиболее эмоциональных кандидатов?

Для начала выберем оптимальное решение распределения

In [189]:
n_01 = []
n_02 = []
n_03 = []
for res in res_values:
  if res[1]<0.135:
    n_01.append(res_values.index(res))
  elif res[1]>=0.135 and res[1]<0.21:
    n_02.append(res_values.index(res))
  else:
    n_03.append(res_values.index(res))
print(len(n_01), len(n_02), len(n_03))

102 142 146


Сделаем выборку кандидатов без коэффициента шума

In [190]:
scores = []
for i in range(len(res_list)):
  resume = re.sub(r"[,\'{}\\\[\]]", "", str(data[i])).replace('xa0',' ')
  indexes_temp = []
  for k in range(len(vacs[:20])):
    vac = vacs[k]
    index_temp = index(resume, vac)
    indexes_temp.append(index_temp)
  scores.append(indexes_temp)

df_h = pd.DataFrame(scores)

In [191]:
df_h = df_h.transpose()

In [192]:
n1 = []
n2 = []
n3 = []
for col in df_h.columns:
  if col in n_01:
    n1.append(df_h[col].mean())
  elif col in n_02:
    n2.append(df_h[col].mean())
  else:
    n3.append(df_h[col].mean())

In [193]:
print('Среднее у 1 группы: ', sum(n1)/len(n1))

Среднее у 1 группы:  6.57501470588235


In [194]:
print('Среднее у 2 группы: ', sum(n2)/len(n2))

Среднее у 2 группы:  9.890309859154932


In [195]:
print('Среднее у 3 группы: ', sum(n3)/len(n3))

Среднее у 3 группы:  9.110208904109589


In [196]:
all_means = []
for col in df_h.columns:
  all_means.append(df[col].mean())

In [198]:
all_emo_val = []
for val in res_values:
  all_emo_val.append(val[1])

In [204]:
from scipy.stats.stats import pearsonr

print('Коэф. корреляции Пирсона:', pearsonr(all_means, all_emo_val)[0], '\np_value:', pearsonr(all_means, all_emo_val)[1])

Коэф. корреляции Пирсона: 0.13625250602313158 
p_value: 0.007044946369153215


### Выводы:

Есть небольшая корреляция соответствия резюме вакансиям и тональности резюме