# Resume Parsing

## 1. Importing the libraries


In [51]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import matplotlib.pyplot as plt

from PyPDF2 import PdfReader
from spacy import displacy

## 2. Loading the data

In [None]:
# Read the resume dataset from the CSV file into a DataFrame.
df_resume = pd.read_csv("data/resume.csv")

Let's find out which unique categories appear in the resume CSV file.

In [None]:
# List the distinct values of the Category column.
df_resume.Category.unique()

In [None]:
# (rows, columns) of the full dataset.
df_resume.shape

In [None]:
# Shuffle the rows, then keep at most the first 1000 of them.
# No random seed is set, so the selected subset differs from run to run.
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
# Slice first, copy second: the result is an independent DataFrame (columns
# are added to it later) and only the retained rows are copied, not the
# whole frame.
df_resume = df_resume.iloc[:1000].copy()
df_resume.shape

## 3. Load skill data

Defining a pattern by hand for every single skill would be far too tedious.

spaCy anticipates this: it lets you supply a list of words, and it will automatically create the patterns from them.

In [None]:
# Load the spaCy pipeline used by the cells below.
nlp = spacy.load('en_core_web_md')
# Path of the skill patterns file that is loaded into the entity ruler.
skill_path = 'data/skills.jsonl'

In [None]:
# Add an entity_ruler component to the pipeline and load its patterns from
# the skills file.
# NOTE(review): re-running this cell on the same `nlp` object will presumably
# fail because a component named "entity_ruler" already exists — confirm.
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path)
# Show the names of the pipeline components.
nlp.pipe_names

In [None]:
# Quick sanity check: run the pipeline on a short sentence and show the
# entities it finds.
doc = nlp("Chaky loves ajax.")
doc.ents

## 4. Let's try to extract skills from this resume.csv

In [None]:
# Preview the first rows of the sampled resumes.
df_resume.head()

Let's clean our data by removing stop words, punctuation and symbols.

In [None]:
# clean our data
from spacy.lang.en.stop_words import STOP_WORDS


def preprocessing(sentence):
    """Return `sentence` as a cleaned, lemmatized, lower-case string.

    The text is run through the module-level ``nlp`` pipeline. A token is
    dropped when it is a stop word (compared case-insensitively) or when its
    part-of-speech tag is PUNCT, SYM or SPACE. Every remaining token
    contributes its lemma, lower-cased and stripped of surrounding
    whitespace; lemmas that end up empty are skipped.

    Args:
        sentence: Raw text to clean.

    Returns:
        The kept lemmas joined by single spaces.
    """
    skipped_pos = {'PUNCT', 'SYM', 'SPACE'}
    clean_tokens = []

    for token in nlp(sentence):
        # The cleaned output is lower-cased, so the stop-word test must be
        # case-insensitive too; otherwise "The" or "And" slip through and
        # reappear in the result as "the" / "and".
        if token.text.lower() in STOP_WORDS or token.pos_ in skipped_pos:
            continue

        lemma = token.lemma_.lower().strip()
        if lemma:  # avoid empty entries that would produce double spaces
            clean_tokens.append(lemma)

    return " ".join(clean_tokens)

In [None]:
# Pick one resume (the row at position 5 of the shuffled sample) and show
# its first 300 characters.
random_resume = df_resume.Resume_str.iloc[5]
random_resume[:300]

In [None]:
# Try the cleaning function on the same 300-character excerpt.
preprocessing(random_resume[:300])

In [None]:
# Clean every resume and store the result in a new 'Clean_resume' column.
# Series.apply builds the whole column in a single assignment instead of
# growing it one cell at a time with iterrows() and .at.
df_resume['Clean_resume'] = df_resume.Resume_str.apply(preprocessing)

In [None]:
# Preview the data again, now including the Clean_resume column.
df_resume.head()

## 5. Let's really extract skills!!

In [None]:
def get_skills(text):
    """Collect the text of every entity labelled 'SKILL' found in `text`.

    Args:
        text: String to run through the module-level ``nlp`` pipeline.

    Returns:
        A list of entity texts in the order ``doc.ents`` yields them;
        duplicates are kept.
    """
    return [span.text for span in nlp(text).ents if span.label_ == 'SKILL']

def unique_skills(x):
    """Return the distinct items of `x` as a list, in first-seen order.

    Args:
        x: Iterable of hashable items (e.g. skill names).

    Returns:
        A new list with duplicates removed. Unlike ``list(set(x))``, the
        order is deterministic: each item keeps the position of its first
        occurrence, so results are reproducible between runs.
    """
    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(x))

In [None]:
def get_entities(resume):
    """Group the entities found in `resume` by their label.

    Args:
        resume: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        A dict mapping each entity label to one string: the distinct entity
        texts for that label (as returned by ``unique_entities``) joined
        with ', '.
    """
    grouped = {}
    for span in nlp(resume).ents:
        grouped.setdefault(span.label_, []).append(span.text)

    return {
        label: ', '.join(unique_entities(texts))
        for label, texts in grouped.items()
    }

def unique_entities(x):
    """Return the distinct items of `x` as a list, in first-seen order.

    Args:
        x: Iterable of hashable items (e.g. entity texts).

    Returns:
        A new list with duplicates removed. Unlike ``list(set(x))``, the
        order is deterministic: each item keeps the position of its first
        occurrence, so strings built from the result are reproducible.
    """
    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(x))

In [None]:
# Extract the SKILL entities from every cleaned resume, then remove the
# duplicates within each resume, and store the lists in the 'Skills' column.
df_resume['Skills'] = (
    df_resume.Clean_resume
    .apply(get_skills)
    .apply(unique_skills)
)

In [None]:
# Show the skills extracted for the first resume in the sample.
df_resume.Skills.iloc[0]

## 6. Visualization

Which skills are most important in information technology?

In [None]:
# Keep only the resumes whose Category equals the chosen one.
category = 'INFORMATION-TECHNOLOGY'
cond = df_resume.Category == category  # boolean mask, one entry per row

df_resume_it = df_resume[cond]
df_resume_it.shape

In [None]:
# Flatten the per-resume skill lists into one flat list of skill names.
# chain.from_iterable also works when no resume matched the category
# (np.concatenate raises ValueError when given no arrays) and keeps the
# items as plain Python strings rather than NumPy string scalars.
from itertools import chain

all_skills = list(chain.from_iterable(df_resume_it.Skills))

In [None]:
from collections import Counter, OrderedDict

# Count how often each skill occurs, then keep only the 10 most frequent
# ones (most_common returns them from most to least frequent).
counting = Counter(all_skills)
counting = OrderedDict(counting.most_common(10))

In [None]:
# Show the top-10 skills with their counts.
counting

In [None]:
# Bar chart of the top skills: one bar per skill, bar height = its count.
plt.figure(figsize=(15, 3))
plt.xticks(rotation=45)

plt.bar(counting.keys(), counting.values())

## 7. Let's load the PDF - add some realism

In [None]:



# Read the PDF resume and extract the text of its first page.
reader = PdfReader("data/chaklam_resume.pdf")
page = reader.pages[0]  # only the first page is processed
text = page.extract_text()
# Apply the same cleaning as for the CSV resumes, then run the pipeline.
text = preprocessing(text)
doc = nlp(text)
# Custom highlight colour for SKILL entities in the displaCy rendering.
colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"colors": colors}

displacy.render(doc, style='ent', options=options)