## Importing Modules

In [2]:
import pandas as pd
import en_core_web_sm
from spacy.matcher import PhraseMatcher

In [3]:
nlp = en_core_web_sm.load()

## Reading in Resume data

In [4]:
df = pd.read_csv('Resume.csv')
df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [8]:
df.drop('Resume_html', axis=1, inplace=True)

In [9]:
df['Category'].unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

### Selecting only resumes in the IT field

In [10]:
tech_resumes = df[df['Category'] == 'INFORMATION-TECHNOLOGY']
tech_resumes.head()

Unnamed: 0,ID,Resume_str,Category
217,36856210,INFORMATION TECHNOLOGY Summar...,INFORMATION-TECHNOLOGY
218,21780877,INFORMATION TECHNOLOGY SPECIALIST\tGS...,INFORMATION-TECHNOLOGY
219,33241454,INFORMATION TECHNOLOGY SUPERVISOR ...,INFORMATION-TECHNOLOGY
220,25990239,INFORMATION TECHNOLOGY INSTRUCTOR ...,INFORMATION-TECHNOLOGY
221,16899268,INFORMATION TECHNOLOGY MANAGER/ANALYS...,INFORMATION-TECHNOLOGY


In [17]:
texts = tech_resumes[['ID', 'Resume_str']]
texts

Unnamed: 0,ID,Resume_str
217,36856210,INFORMATION TECHNOLOGY Summar...
218,21780877,INFORMATION TECHNOLOGY SPECIALIST\tGS...
219,33241454,INFORMATION TECHNOLOGY SUPERVISOR ...
220,25990239,INFORMATION TECHNOLOGY INSTRUCTOR ...
221,16899268,INFORMATION TECHNOLOGY MANAGER/ANALYS...
...,...,...
332,27485716,CORPORATE PROJECT MANAGER Caree...
333,26480367,IT TECHNOLOGY SPECIALIST Profes...
334,81761658,IT MANAGER Highlights ...
335,79541391,SUBJECT MATTER EXPERT (INFORMATION TE...


## Reading in Keywords

In [12]:
keyword_dict = pd.read_csv('Skills_Keywords.csv')
keyword_dict.head()

Unnamed: 0,Statistics,Machine Learning,Deep Learning,R Language,Python Language,NLP,Data Engineering
0,statistical models,linear regression,neural network,r,python,nlp,laws
1,statistical modeling,logistic regression,keras,ggplot,flask,natural language processing,ec2
2,probability,K means,theano,shiny,django,topic modeling,amazon redshift
3,normal distribution,random forest,face detection,cran,pandas,Ida,s3
4,poisson distribution,xgboost,neural networks,dplyr,numpy,named entity recognition,docker


### Creating lists of tokenized keywords for each category for pattern matching

In [13]:
stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis = 0)]
nlp_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis = 0)]
ml_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis = 0)]
dl_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis = 0)]
r_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis = 0)]
python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis = 0)]
data_eng_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis = 0)]

In [14]:
stats_words

[statistical models,
 statistical modeling,
 probability,
 normal distribution,
 poisson distribution,
 survival models,
 hypothesis testing,
 bayesian inference,
 factor analysis,
 forecasting,
 markow chain,
 monte carlo]

### Creating Pattern Matcher for Keyword Extraction

In [15]:
matcher = PhraseMatcher(nlp.vocab)

In [16]:
matcher.add('Stats', None, *stats_words)
matcher.add('NLP', None, *nlp_words)
matcher.add('ML', None, *ml_words)
matcher.add('DL', None, *dl_words)
matcher.add('R', None, *r_words)
matcher.add('Python', None, *python_words)
matcher.add('Data Engineering', None, *data_eng_words)

## Keyword Extraction

In [23]:
d = {}
for id_, text in texts.values:
    # Tokenize the resume
    doc = nlp(text)

    # Find matches in the doc
    matches = matcher(doc)

    # For each of the matches
    for match_id, start, end in matches:
        # Get the general word and the matched phrase
        gen_word = nlp.vocab.strings[match_id]
        match = doc[start:end]

        # Append all the keywords specific to a resume ID
        d.setdefault(id_, []).append((gen_word, match.text))

KeyboardInterrupt: 

In [26]:
d

{19796840: [('Data Engineering', 'laws')],
 91697974: [('Data Engineering', 'laws')],
 70089206: [('Stats', 'probability')],
 52618188: [('Stats', 'forecasting'),
  ('Stats', 'forecasting'),
  ('Stats', 'forecasting')],
 18159866: [('Stats', 'forecasting'), ('Data Engineering', 'laws')],
 33381211: [('Stats', 'forecasting'), ('Stats', 'forecasting')]}