# spaCy Resume Analysis

In [1]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy
import pyLDAvis.gensim_models
from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np
import jsonlines

#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

#warning
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Fixed seed so the shuffle -- and therefore the 200-row sample below -- is the
# same on every Restart & Run All instead of changing with each run.
SEED = 42

df = pd.read_csv("data/Resume/Resume.csv")
# Shuffle every row; the original CSV row labels are kept as the index.
df = df.sample(frac=1, random_state=SEED)
# Independent copy of the first 200 shuffled rows, used as the working sample.
data = df.iloc[:200].copy()
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
1437,10588874,MANAGER AND EXECUTIVE CHEF ...,"<div class=""fontsize fontface vmargins hmargin...",CHEF
1900,24703009,STAFF ACCOUNTANT Virginia F...,"<div class=""fontsize fontface vmargins hmargin...",ACCOUNTANT
804,67032711,FITNESS INSTRUCTOR Summary I...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS
516,37333719,SEXUAL ASSAULT CRISIS COUNSELOR / VIC...,"<div class=""fontsize fontface vmargins hmargin...",ADVOCATE
685,96260484,HEALTHCARE CONSULTANT Execu...,"<div class=""fontsize fontface vmargins hmargin...",HEALTHCARE


In [3]:
# spacy.cli.download('en_core_web_lg')

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
# JSONL file with the skill patterns that the entity ruler loads from disk.
skill_pattern_path = "data/jz_skill_patterns.jsonl"

# Pipeline used for every nlp(...) call in this notebook.
nlp = spacy.load("en_core_web_lg")

In [4]:
# Make the cell safe to re-run: add_pipe refuses to add a component whose name
# is already in the pipeline, so drop any ruler left over from a previous run.
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# Placed before the statistical NER component in the pipeline.
ruler = nlp.add_pipe("entity_ruler", before="ner")
# Load the skill patterns from the JSONL file.
ruler.from_disk(skill_pattern_path)

# Token-level regex patterns for contact details.
# The regexes are raw strings so that `\s`, `\d`, `\.` and `\(` reach the regex
# engine as written instead of relying on Python passing invalid string escapes
# through unchanged (deprecated, and a warning on recent Python versions).
# The pattern text itself is unchanged.
# NOTE(review): inside `[...]` the `|` is a literal pipe, not alternation --
# confirm that excluding `|` from e-mail parts is intended.
# NOTE(review): a token REGEX is matched against one token's text, so the `\s`
# alternatives can only fire if the tokenizer keeps the number in a single
# token -- confirm against real resumes.
patterns = [
    {
        "label": "EMAIL",
        "pattern": [{"TEXT": {"REGEX": r"([^@|\s]+@[^@]+\.[^@|\s]+)"}}],
    },
    {
        "label": "MOBILE",
        "pattern": [{"TEXT": {"REGEX": r"\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}"}}],
    },
]
# (To start over with a fresh ruler: nlp.remove_pipe('entity_ruler'), then
# re-run this cell.)
# Register the EMAIL / MOBILE patterns, then display the pipeline order.
ruler.add_patterns(patterns)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler',
 'ner']

In [10]:
# Fetch resume 685 by row label from the full frame `df`: `data` holds only a
# random 200-row sample, so label 685 is usually missing there and the lookup
# raised KeyError. (`displacy` is already imported in the first cell.)
doc = nlp(df.loc[685, "Resume_str"])
displacy.render(doc, style="ent", jupyter=True)

In [31]:
# Dependency view of the first ten tokens of `doc`, with the arc distance set to 90.
first_tokens = doc[0:10]
dep_options = {"distance": 90}
displacy.render(first_tokens, style="dep", options=dep_options, jupyter=True)

In [12]:
# Short hand-written sample used to try the ruler on free text.
text = (
    "My name is Subhradeep Rang. "
    "I have a good knowledge on Data Science and Machine Learning. "
    "I also know Python, C++, Kotlin. "
    "My Email Address is srang992@gmail.com"
)
# Run the sample sentence through the pipeline and highlight its entities.
doc2 = nlp(text)
displacy.render(doc2, jupyter=True, style="ent")

In [13]:
# One "Job-Category" phrase pattern per distinct resume category.
# - Stored under its own name rather than rebinding `patterns`, which holds
#   the EMAIL / MOBILE pattern dicts.
# - Missing categories are dropped so NaN is never passed as a pattern.
# - All patterns are registered in a single add_patterns call instead of one
#   call per category.
job_categories = df["Category"].dropna().unique()
ruler.add_patterns(
    [{"label": "Job-Category", "pattern": category} for category in job_categories]
)

In [41]:
# Hand-written token patterns for the demo text and one multi-word job title.
# A LOWER attribute is compared against the lower-cased token text, so its
# value must itself be lower-case: "Hooghly", "MARKETING" and "DIRECTOR" under
# LOWER could never match. The place name is now lower-cased, and the job
# title uses TEXT for all three tokens, matching the upper-case form its first
# token already required.
pattern2 = [
    {
        "label": "PERSON",
        "pattern": [{"LOWER": "subhradeep"}, {"LOWER": "rang"}],
    },
    {
        "label": "PERSON",
        "pattern": [{"LOWER": "tanuja"}, {"LOWER": "rang"}],
    },
    {
        "label": "ORG",
        "pattern": [
            {"LOWER": "maulana"},
            {"LOWER": "abul"},
            {"LOWER": "kalam"},
            {"LOWER": "azad"},
            {"LOWER": "university"},
            {"LOWER": "of"},
            {"LOWER": "technology"},
        ],
    },
    {
        "label": "GPE",
        "pattern": [{"LOWER": "hooghly"}],
    },
    {
        "label": "Job-Category",
        "pattern": [{"TEXT": "DIGITAL"}, {"TEXT": "MARKETING"}, {"TEXT": "DIRECTOR"}],
    },
]

# Hand the `pattern2` entries to the entity ruler.
ruler.add_patterns(patterns=pattern2)

In [43]:
# Longer sample covering the person, place, organisation and e-mail patterns.
text2 = (
    "My name is Subhradeep Rang. "
    "I live in Konnagar, Hooghly. I am persuing M.Sc. in Artificial Intelligence from Maulana Abul Kalam Azad University "
    "of Technology. I know Python, Kotlin, C, C++, Java. I have a good  knowledge on Data Science and Machine Learning Domain. "
    "My email address is srang992@gmail.com."
)

# Parse the longer sample and show which entities are picked up.
doc3 = nlp(text2)
displacy.render(doc3, jupyter=True, style="ent")

In [44]:
# Background colour for each entity label that should be highlighted.
colors = dict(
    [
        ("Job-Category", "linear-gradient(90deg, #aa9cfc, #fc9ce7)"),
        ("SKILL", "linear-gradient(90deg, #9BE15D, #00E3AE)"),
        ("ORG", "#ffd966"),
        ("PERSON", "#e06666"),
        ("GPE", "#9fc5e8"),
        ("DATE", "#c27ba0"),
        ("ORDINAL", "#674ea7"),
        ("PRODUCT", "#f9cb9c"),
    ]
)

# displaCy options: show exactly the labels that have a colour, in the order
# they were declared (dicts keep insertion order), using the palette above.
options = {"ents": list(colors), "colors": colors}
# Resume at position 6 of the sample, rendered with the `options` settings.
resume_text = data["Resume_str"].iloc[6]
sent = nlp(resume_text)
displacy.render(sent, style="ent", options=options, jupyter=True)

In [55]:
# Re-parse resume 685. It is looked up by row label in the full frame `df`
# because `data` is only a random 200-row sample, where label 685 is usually
# absent (KeyError).
doc = sent = nlp(df.loc[685, "Resume_str"])
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# class ProcessSkills:
#
#     @staticmethod
#     def get_skills(text):
#         doc = nlp(text)
#         myset = []
#         subset = []
#         for ent in doc.ents:
#             if ent.label_ == "SKILL":
#                 subset.append(ent.text)
#         myset.append(subset)
#         return subset
#
#     @staticmethod
#     def unique_skills(x):
#         return list(set(x))

In [None]:
# nltk.download('omw-1.4')

In [None]:
# clean = []
# for i in range(data.shape[0]):
#     review = re.sub('(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"', " ",data["Resume_str"].iloc[i],)
#     review = review.lower()
#     review = review.split()
#     lm = WordNetLemmatizer()
#     review = [
#         lm.lemmatize(word)
#         for word in review
#         if not word in set(stopwords.words("english"))
#     ]
#     review = " ".join(review)
#     clean.append(review)

In [None]:
# data['clean_resume'] = clean
# data['skills'] = data['clean_resume'].str.lower().apply(ProcessSkills().get_skills)
# data['skills'] = data['skills'].apply(ProcessSkills().unique_skills)
# data.head()

In [45]:
!pip install -qq pyresparser

In [47]:
!pip install python-docx docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
     ---------------------------------------- 5.6/5.6 MB 3.2 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting docx
  Downloading docx-0.2.4.tar.gz (54 kB)
     ---------------------------------------- 54.9/54.9 KB 3.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting lxml>=2.3.2
  Using cached lxml-4.8.0-cp37-cp37m-win_amd64.whl (3.6 MB)
Building wheels for collected packages: python-docx, docx
  Building wheel for python-docx (setup.py): started
  Building wheel for python-docx (setup.py): finished with status 'done'
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184507 sha256=ddfe88404480bd62d26973abe1b3e1381f7122a97066ca21a90661840d548c03
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\f6\6f\b9\d79812

In [50]:
# Fetch the small English model via spaCy's download helper.
SMALL_MODEL_NAME = "en_core_web_sm"
spacy.cli.download(SMALL_MODEL_NAME)

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [52]:
from pyresparser import ResumeParser

# NOTE(review): machine-specific absolute path -- point this at your own file.
RESUME_PATH = "C:/Users/HP/Downloads/Subhradeep Rang's Resume Now.pdf"

# Parse the PDF directly. The previous version first opened the PDF in text
# mode and pasted its contents into a .docx, then hid any failure behind a
# bare `except:` that retried on the PDF anyway; a PDF is binary, so that
# detour could not produce a meaningful document and only masked errors.
# The result gets its own name so the `data` DataFrame sample (and the spaCy
# `doc`) from earlier cells are not overwritten.
# NOTE(review): the recorded run failed with OSError [E053] (config.cfg not
# found inside the pyresparser package) -- presumably its bundled model is not
# compatible with the installed spaCy version; confirm and pin accordingly.
parsed_resume = ResumeParser(RESUME_PATH).get_extracted_data()
print(parsed_resume['skills'])

OSError: [E053] Could not read config file from D:\spacy resume analysis\resume_venv\lib\site-packages\pyresparser\config.cfg