In [None]:
#spacy
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

#gensim
import gensim
from gensim import corpora

#Visualization
from spacy import displacy

from wordcloud import WordCloud
import plotly.express as px
import matplotlib.pyplot as plt

#Data loading/ Data manipulation
import pandas as pd
import numpy as np


#nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download(['stopwords','wordnet'])

#warning
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
pip install jsonlines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jsonlines
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.1.0


In [None]:
import jsonlines

In [None]:
pip install pyldavis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyldavis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyldavis
Successfully installed funcy-2.0 pyldavis-3.4.0


In [None]:
import pyLDAvis.gensim_models

In [None]:
# Shuffle with a fixed seed so the 200-resume sample (and every result derived
# from it) is identical on each Restart & Run All.
SEED = 42
df = pd.read_csv("/content/Resume.csv")
df = df.sample(frac=1, random_state=SEED)  # full shuffle, original index labels kept
# Explicit copy of the first 200 shuffled rows so later column assignments on
# `data` never touch `df`.
data = df.iloc[:200].copy()
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
1224,26167298,MARKETING CONSULTANT Summar...,"<div class=""fontsize fontface vmargins hmargin...",CONSULTANT
2465,98389424,GUNNERY SERGEANT Core Qua...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
1897,18569929,PAYROLL ACCOUNTANT Summary S...,"<div class=""fontsize fontface vmargins hmargin...",ACCOUNTANT
2254,14626780,GRANTS MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",BANKING
777,18129173,MARKETER / ADMINISTRATOR Prof...,"<div class=""fontsize fontface vmargins hmargin...",HEALTHCARE


In [None]:
import spacy

# Try to load the pipeline first and download it only when it is missing, so
# re-running this cell does not hit the network every time.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:  # spaCy raises OSError when the package is not installed
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:

# Path of the JSONL pattern file that is later loaded into the entity ruler.
# NOTE(review): presumably contains the "SKILL"-labelled patterns — confirm.
skill_pattern_path = "/content/jz_skill_patterns.jsonl"

In [None]:
# Reuse the ruler when this cell has already been run: add_pipe refuses to add
# a second component under a name that is already in the pipeline.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")
# Load the patterns stored at skill_pattern_path into the ruler.
ruler.from_disk(skill_pattern_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [None]:
def get_skills(text):
    """Return the text of every entity labelled ``SKILL`` found in ``text``.

    The string is processed with the notebook-level ``nlp`` pipeline. Matches
    are returned in the order they appear in the document and duplicates are
    kept.

    Args:
        text: String to analyse.

    Returns:
        list: The ``.text`` of each ``SKILL`` entity; empty when none is found.
    """
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "SKILL"]


def unique_skills(x):
    """Return the distinct items of ``x`` as a list, in first-seen order.

    A dict is used instead of a set so the result order is deterministic
    rather than depending on string hashing, which keeps displayed tables
    stable between runs.

    Args:
        x: Iterable of hashable items (e.g. a list of skill strings).

    Returns:
        list: Each distinct item once, ordered by first occurrence.
    """
    return list(dict.fromkeys(x))

In [None]:
# Build the expensive helpers once instead of once per word / per resume.
# Raw string: same pattern as before, without invalid-escape warnings.
NOISE_PATTERN = re.compile(
    r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"'
)
STOP_WORDS = frozenset(stopwords.words("english"))
lm = WordNetLemmatizer()


def clean_resume(text):
    """Normalise one resume string for skill extraction and topic modelling.

    Steps: replace every NOISE_PATTERN match with a space, lowercase, split
    on whitespace, drop English stop words, lemmatize the remaining words
    and join them back with single spaces.

    Args:
        text: Raw resume text.

    Returns:
        str: The cleaned, space-separated text.
    """
    words = NOISE_PATTERN.sub(" ", text).lower().split()
    return " ".join(lm.lemmatize(word) for word in words if word not in STOP_WORDS)


# One cleaned string per row of `data`, in row order.
clean = [clean_resume(text) for text in data["Resume_str"]]

In [None]:
# Attach the cleaned text, extract the SKILL entities from it, then keep each
# skill once per resume.
data["Clean_Resume"] = clean
extracted_skills = data["Clean_Resume"].str.lower().apply(get_skills)
data["skills"] = extracted_skills.apply(unique_skills)
data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_Resume,skills
1224,26167298,MARKETING CONSULTANT Summar...,"<div class=""fontsize fontface vmargins hmargin...",CONSULTANT,marketing consultant summary value creator mar...,"[medium, business, advertising, design, hubspo..."
2465,98389424,GUNNERY SERGEANT Core Qua...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,gunnery sergeant core qualification detail ori...,"[data management, computer network, linux, des..."
1897,18569929,PAYROLL ACCOUNTANT Summary S...,"<div class=""fontsize fontface vmargins hmargin...",ACCOUNTANT,payroll accountant summary sixteen year experi...,"[accounting, stemming, computer programming, m..."
2254,14626780,GRANTS MANAGER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",BANKING,grant manager summary finance professional suc...,"[accounting, business, finance, monitoring, co..."
777,18129173,MARKETER / ADMINISTRATOR Prof...,"<div class=""fontsize fontface vmargins hmargin...",HEALTHCARE,marketer administrator professional summary ad...,"[support, marketing, business, ruby]"


In [None]:
# Number of resumes per job category, bars ordered from most to least frequent.
fig = px.histogram(data, x="Category", title="Distribution of Jobs Categories")
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [None]:
# Distinct category values present in the sample ...
Job_Category = data.loc[:, "Category"].unique()
# ... and the same array with an extra "ALL" entry appended at the end.
Job_Cat = np.append(arr=Job_Category, values="ALL")


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Category whose skills are plotted. Must be one of the entries of Job_Cat;
# "ALL" means every resume in `data`.
# The previous condition compared `Job_Cat.all()` with "ALL"; Job_Cat is the
# whole array of options, not a single choice, so it could not select a
# category and the title showed the entire array.
selected_category = "ALL"
if selected_category not in Job_Cat:
    raise ValueError(f"Unknown job category: {selected_category!r}")

if selected_category == "ALL":
    fltr = data["skills"]
else:
    fltr = data.loc[data["Category"] == selected_category, "skills"]

# Flatten the per-resume skill lists into one list for the histogram.
Total_skills = [skill for skills in fltr for skill in skills]

fig = px.histogram(
    x=Total_skills,
    labels={"x": "Skills"},
    title=f"{selected_category} Distribution of Skills",
).update_xaxes(categoryorder="total descending")
fig.show()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Run the pipeline on the raw text of the first sampled resume and show the
# recognised entities inline.
first_resume_text = data["Resume_str"].iloc[0]
sent = nlp(first_resume_text)
displacy.render(sent, style="ent", jupyter=True)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Dependency parse of the first ten tokens of `sent`.
dep_options = {"distance": 90}
displacy.render(sent[:10], style="dep", jupyter=True, options=dep_options)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Register every category value of the full dataset as a "Job-Category"
# phrase pattern. All patterns are passed in a single add_patterns call
# instead of one call per pattern.
patterns = df.Category.unique()
ruler.add_patterns(
    [{"label": "Job-Category", "pattern": category} for category in patterns]
)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Background colour per entity label; the labels listed here are also the
# ones passed to displaCy as the "ents" option.
colors = dict(
    [
        ("Job-Category", "linear-gradient(90deg, #aa9cfc, #fc9ce7)"),
        ("SKILL", "linear-gradient(90deg, #9BE15D, #00E3AE)"),
        ("ORG", "#ffd966"),
        ("PERSON", "#e06666"),
        ("GPE", "#9fc5e8"),
        ("DATE", "#c27ba0"),
        ("ORDINAL", "#674ea7"),
        ("PRODUCT", "#f9cb9c"),
    ]
)
# The label list is the colour mapping's keys, in insertion order.
options = {"ents": list(colors), "colors": colors}
resume_text = data["Resume_str"].iloc[7]
sent = nlp(resume_text)
displacy.render(sent, style="ent", jupyter=True, options=options)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [None]:
# Topic model over the cleaned resumes.
docs = data["Clean_Resume"].values
# Tokenise once and reuse for both the dictionary and the bag-of-words corpus.
tokenized_docs = [d.split() for d in docs]
dictionary = corpora.Dictionary(tokenized_docs)
bow = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
lda = gensim.models.ldamodel.LdaModel
num_topics = 4
ldamodel = lda(
    bow,
    num_topics=num_topics,
    id2word=dictionary,
    passes=50,
    minimum_probability=0,
    random_state=42,  # fixed seed so the fitted topics are reproducible
)
ldamodel.print_topics(num_topics=num_topics)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



[(0,
  '0.011*"marketing" + 0.009*"state" + 0.009*"company" + 0.009*"city" + 0.008*"management" + 0.006*"name" + 0.006*"project" + 0.005*"business" + 0.005*"development" + 0.005*"new"'),
 (1,
  '0.009*"state" + 0.009*"company" + 0.008*"management" + 0.008*"city" + 0.008*"customer" + 0.008*"service" + 0.007*"name" + 0.006*"sale" + 0.006*"skill" + 0.006*"team"'),
 (2,
  '0.011*"state" + 0.010*"company" + 0.010*"city" + 0.008*"customer" + 0.007*"name" + 0.007*"management" + 0.007*"account" + 0.006*"employee" + 0.006*"financial" + 0.005*"service"'),
 (3,
  '0.013*"food" + 0.011*"customer" + 0.008*"equipment" + 0.007*"state" + 0.007*"city" + 0.006*"service" + 0.006*"work" + 0.006*"company" + 0.005*"product" + 0.005*"name"')]

In [None]:
# Turn on notebook rendering, then build the interactive topic view; leaving
# the prepared object as the last expression displays it.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(ldamodel, bow, dictionary)
lda_vis


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

