In [1]:
import pandas as pd 
import spacy
import re

from spacy.pipeline import EntityRuler

In [2]:
# Load the medium English spaCy pipeline used for all entity tagging below.
nlp = spacy.load("en_core_web_md")

# Entity overrides
# The stock model tags domain terms such as "Sklearn" and "IoT" as named entities.
# Each term listed here is given the label "O" (capital letter O, not the digit zero),
# which is not one of the labels removeEnts filters on, so the term stays in the text.
patterns = [
    {"label": "O", "pattern": term}
    for term in (
        "Sklearn",
        "IoT",
        "Databricks",
        "Bioanalyzer",
        "Biotech",
        "BioHive",
        "MLOps",
    )
]

# Register the rule-based matcher so it runs ahead of the statistical 'ner' component.
ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(patterns)

#nlp.pipe_names

In [3]:
# Read the job-posting table from metadataCleaned.csv
# (presumably written by an earlier cleaning step; confirm the provenance).
df = pd.read_csv('metadataCleaned.csv')

In [4]:
# Remove Sections of Text based on keyphrases. 

def removeEnts(text):
    """Return ``text`` with place, date and number entity tokens removed.

    The text is parsed with the module-level ``nlp`` pipeline.  Every token
    whose ``ent_type_`` is GPE, LOC, DATE, CARDINAL or ORDINAL is dropped;
    all other tokens are re-joined together with their original trailing
    whitespace.
    """
    dropped_labels = ("GPE", "LOC", "DATE", "CARDINAL", "ORDINAL")
    kept_pieces = []
    for tok in nlp(text):
        if tok.ent_type_ in dropped_labels:
            continue
        # Keep the token's own trailing whitespace so spacing is reproduced.
        kept_pieces.append(tok.text + tok.whitespace_)
    return "".join(kept_pieces)

def removeSection(text, phrase):
    """Remove every line of ``text`` that contains ``phrase``.

    Matching is case-insensitive and ``phrase`` is taken literally (regex
    metacharacters are escaped).  Only lines preceded by a newline are
    candidates, so the first line of ``text`` is never removed here.

    Args:
        text: The string to clean.
        phrase: Literal keyphrase that marks a line for removal.

    Returns:
        ``text`` without the matching lines.  The newline in front of each
        removed line is deleted with it; the newline after it is kept, so the
        neighbouring lines stay on separate lines.
    """
    # '.' never matches a newline, so the match runs from the newline that
    # opens the line to the last character of that line and stops there.
    # Leaving the closing newline unconsumed (a) keeps the previous and next
    # lines from being glued together with no separator and (b) lets that
    # newline open the next candidate line, so consecutive matching lines are
    # all removed instead of every second one being skipped.
    pattern = rf'\n.*?{re.escape(phrase)}.*'
    return re.sub(pattern, '', text, flags=re.IGNORECASE)

def removePhraseToSectionEnd(text, phrase):
    """Cut ``text`` from each occurrence of ``phrase`` through the end of its line.

    Matching ignores case and ``phrase`` is treated literally.  When the line
    ends with a newline, that newline is removed along with the matched text.
    """
    matcher = re.compile(rf'(?i)({re.escape(phrase)}.*?)(\n|$)', flags=re.IGNORECASE)
    return matcher.sub('', text)


# Boilerplate company statements (equal opportunity, benefit packages, compensation) are
# stripped to enhance the data summary and model accuracy.
# Sections where the removed text was longer than what remained were checked individually;
# the remaining sections were spot checked.
phrases = [
    "equal opportunity",
    "Equal Employment",
    "discriminate",
    "race, color, religion",
    "sexual orientation",
    "PTO ",
    "Benefits",
    "Compensation",
    "competitive",
    "COVID",
    "salary",
    "Health insurance",
    "applicants",
    "applicant",
    "dental",
]

# Start from a copy so the raw 'description' column is left untouched.
descriptionShort = df['description'].copy()

# For each keyphrase: removeSection runs first over whole lines, then
# removePhraseToSectionEnd trims any occurrence that is still present.
for p in phrases:
    descriptionShort = (
        descriptionShort
        .apply(removeSection, phrase=p)
        .apply(removePhraseToSectionEnd, phrase=p)
    )

In [5]:
# Strip GPE/LOC/DATE/CARDINAL/ORDINAL tokens from the job titles (overwrites the column).
df['jobTitle'] = df['jobTitle'].apply(removeEnts)
# Run the same entity removal on the phrase-filtered descriptions and store the
# result in df['descriptionShort']; the original 'description' column is not modified.
df['descriptionShort'] = descriptionShort.apply(removeEnts)

In [6]:
# Text modifications
# The steps run in the order written; "A/B" is rewritten before the generic "/" rule
# so that it survives as "A-B".
df['descriptionShort'] = (
    df['descriptionShort']
    .str.replace(r"R&D", "Research and Development", case=False, regex=True)  # spell out, any casing
    .str.replace(r"&", "", regex=True)          # delete every remaining ampersand
    .str.replace(r"A/B", "A-B", regex=True)     # keep A/B recognisable
    .str.replace(r"/", " ", regex=True)         # all other slashes become a space
    .str.replace(r"\s{2,}", " ", regex=True)    # collapse runs of two or more whitespace characters
    .str.strip()                                # trim leading/trailing whitespace
    .str.replace(r"\n", " ", regex=True)        # any newline still present becomes a space
)

In [7]:
# Write the frame, including the new descriptionShort column, to dfCleaned.csv
# without the row index.
df.to_csv('dfCleaned.csv', index=False)

In [8]:
# Preview the first five rows to compare 'description' with 'descriptionShort'.
df.head()

Unnamed: 0,jobTitle,description,jobLocation,minSalary,maxSalary,hiringOrganization,employmentType,url,descriptionShort
0,Senior Scientist,"At Element Biosciences, we are passionate abou...","San Diego, CA",126000.0,165000.0,Element Biosciences,Full-Time,https://builtin.com/job/sr-scientist/2179388,"At Element Biosciences, we are passionate abou..."
1,Data Scientist Finance,About Us:\n\nLive experiences help people cros...,Remote,173000.0,205000.0,Gametime United,,https://builtin.com/job/data-scientist-finance...,About Us: Live experiences help people cross t...
2,Air Quality Modeling Scientist,PSE Healthy Energy is seeking a full-time air ...,Remote,75000.0,97500.0,PSE Healthy Energy,Remote,https://builtin.com/job/air-quality-modeling-s...,PSE Healthy Energy is seeking a full-time air ...
3,Data Scientist,We are interested in every qualified candidate...,"Chicago, IL",,,Enova,,https://builtin.com/job/data-scientist-hybrid/...,We are interested in every qualified candidate...
4,Marketing Data Research Specialist,Company Overview\n\nAdCellerant is an award-wi...,Greater Denver Area,55000.0,60000.0,AdCellerant,,https://builtin.com/job/data/marketing-data-re...,Company Overview AdCellerant is an award-winni...


In [9]:
# Distinct values per column. jobTitle still has 319 distinct values even though
# many of these postings are for roles that should share a similar title.
df.nunique()

jobTitle              319
description           507
jobLocation           145
minSalary             134
maxSalary             127
hiringOrganization    234
employmentType          3
url                   507
descriptionShort      506
dtype: int64

In [10]:
# Column dtypes and non-null counts for the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 507 entries, 0 to 506
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   jobTitle            507 non-null    object 
 1   description         507 non-null    object 
 2   jobLocation         497 non-null    object 
 3   minSalary           225 non-null    float64
 4   maxSalary           225 non-null    float64
 5   hiringOrganization  244 non-null    object 
 6   employmentType      54 non-null     object 
 7   url                 507 non-null    object 
 8   descriptionShort    507 non-null    object 
dtypes: float64(2), object(7)
memory usage: 35.8+ KB


# End DeepCleaning
# Finding Sections, Testing, and Notes

In [None]:
# Disabled scratch helpers: everything between the triple quotes is a plain string
# literal, so none of it is defined or executed when this cell runs.
# findEnts / findSection / findPhrase mirror the remove* helpers but return the
# matched text (via a token filter or re.findall) instead of deleting it.
'''
def findEnts(text):
    doc = nlp(text)
    newText = [token.text for token in doc if token.ent_type_ in ["GPE", "LOC", "DATE", "CARDINAL", "ORDINAL"]]
    #if newText != '':
    return " ".join(newText)

def findSection(text, phrase):
    pattern = rf'(?i)(\n.*?)({re.escape(phrase)}.*?)(\n|$)'
    newText = re.findall(pattern, text, flags=re.IGNORECASE)
    newTextFormatted = ' '.join([''.join(match) for match in newText]) # Must do a double join to get the text as a string instead of a list
    return newTextFormatted

def findPhrase(text, phrase):
    pattern = rf'(?i)({re.escape(phrase)}.*?)(\n|$)'
    newText = re.findall(pattern, text, flags=re.IGNORECASE)
    newTextFormatted = ' '.join([''.join(match) for match in newText]) # Must do a double join to get the text as a string instead of a list
    return newTextFormatted

test = pd.DataFrame()
'''

In [None]:
#test['description find Ents'] = df['descriptionShort'].apply(findEnts)

In [None]:
#df['description'].apply(lambda x: re.findall("(\w+\/\w+)+", x, flags=re.IGNORECASE))#.value_counts()
#df['description'].apply(lambda x: re.findall("(\w+&\w+)+", x, flags=re.IGNORECASE)).value_counts()
#df['description'].apply(lambda x: re.findall("(\w+\-\w+)+", x, flags=re.IGNORECASE)).value_counts()

In [None]:
#test['find section']= df['descriptionShort'].apply(findSection, phrase='dental')

# These words were cleared. 
# 


#test.to_csv('test')

#test['find section'].value_counts()

In [None]:
#test['find PTO section'] = df['descriptionShort'].apply(findSection, phrase='PTO ')
#test['find Benefits section'] = df['descriptionShort'].apply(findSection, phrase='Benefits')
#test['find Compensation section'] = df['descriptionShort'].apply(findSection, phrase='Compensation')
