<a href="https://colab.research.google.com/github/schmcklr/skill_extractor/blob/main/skill_extractor_preprocessing_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install dependencies, load packages & define functions
*   nltk (for visualization)
*   gensim (for topic modeling)
*   spacy language model "de_core_news_sm" (for tokenization)

In [33]:
# install and import needed dependencies
!pip install --upgrade googletrans
!pip install --upgrade translatepy
!pip install langdetect
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re
# download the NLTK resources used by later cells:
# 'punkt' (tokenizer models needed by nltk.word_tokenize) and 'stopwords' (word lists read via nltk.corpus.stopwords)
nltk.download('punkt')
nltk.download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
# function for data translation
from translatepy import Translator
from langdetect import detect

# module-level statistics, updated by translate_job_description through `global`
all_job_adds = 0          # number of counted job advertisements
translated_job_ads = 0    # number of counted job advertisements that were translated

# one shared translatepy client, reused for every translation request
translator = Translator()


# function for translation of job description
def translate_job_description(text, count):
    """Return ``text`` in English, translating it only if another language is detected.

    Parameters
    ----------
    text : str
        Text to check and, if necessary, translate.
    count : str
        Pass ``'y'`` to include this call in the module-level statistics
        ``all_job_adds`` / ``translated_job_ads``; any other value leaves the
        counters untouched.

    Returns
    -------
    str
        ``text`` unchanged when it is detected as English or contains no
        letters at all, otherwise the ``result`` of ``translator.translate``.
    """
    # global keyword to access global variables
    global all_job_adds, translated_job_ads
    if count == 'y':
        all_job_adds += 1

    # Input without a single letter (empty string, whitespace, digits only) has
    # no features for language detection; langdetect raises
    # LangDetectException("No features in text.") for it, which would abort the
    # surrounding DataFrame.apply. There is nothing to translate in that case.
    if not isinstance(text, str) or not any(char.isalpha() for char in text):
        return text

    if detect(text) == 'en':
        return text

    if count == 'y':
        translated_job_ads += 1
    return translator.translate(text, "English").result

In [35]:
# import and preprocessing of job advertisements
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import re

# user info

print('Loading job advertisement data...')

# TODO: just for developing context, remove afterwards
# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# fetching raw data
workbook = 'https://github.com/schmcklr/skill_analyser/blob/main/job_data/jobAdvertisements.xlsx?raw=true'

# import of tabs
job_data = pd.read_excel(workbook, sheet_name="data")

# drop rows without a description BEFORE the string conversion below:
# astype(str) turns missing values into the text 'nan', after which dropna() can no longer find them
job_data = job_data.dropna(subset=["description"], axis=0)

# remember the unformatted job description (re-attached after lower-casing)
raw_description = job_data['description'].copy()

# convert text to lower case (every column is cast to str in the process)
job_data = job_data.apply(lambda column: column.astype(str).str.lower())

# copying unformatted job description; added after the lower-casing so it really stays unformatted
job_data['rawDescription'] = raw_description

# elimination of duplicates
job_data = job_data.drop_duplicates(subset=["title"])
job_data = job_data.drop_duplicates(subset=["description"])

# convert 'created_at' to datetime
job_data['created_at'] = pd.to_datetime(job_data['created_at'])

# removing html tags
job_data['description'] = job_data['description'].apply(lambda x: BeautifulSoup(x.replace('\n', ' ').replace('\t', ' '), 'html.parser').get_text(separator=' '))
job_data['description_without_html_tags'] = job_data['description']

# removing punctuation (raw string, so \w and \s reach the regex engine without invalid-escape warnings)
job_data['description'] = job_data['description'].str.replace(r'[^\w\s+#]', ' ', regex=True)

# user info
print('Translation of job advertisements...')

# reset the statistics so that re-running this cell does not add to the counts of a previous run
translated_job_ads = 0
all_job_adds = 0

# translation of job description (only the title pass is counted, i.e. one count per advertisement)
job_data['title'] = job_data['title'].apply(lambda x: translate_job_description(x, 'y'))
job_data['description'] = job_data['description'].apply(lambda x: translate_job_description(x, 'n'))
print(f'Translation successful! {translated_job_ads}/{all_job_adds} job advertisements were translated')

Loading job advertisement data...
Translation of job advertisements...
Translation successful! 833/2694 job advertisements were translated


Filtering Job description

In [37]:
# keep only the part of each job description that follows the first occurrence of a skill section keyword

# keywords / key phrases that mark the beginning of a skill section
skill_section_keywords = [
    "qualification", "qualifications",
    "competence", "competencies",
    "skill", "skills",
    "requirement", "required", "requirements",
    "knowledge",
    "expected", "expectations",
    "we are looking for",
    "profile",
    "what you'll need",
    "responsibilities", "responsibility",
]

# function for filtering dataframe by keywords in column 'description' (ignores case)
def extract_text(text, keywords):
    """Return ``text`` from the first word in which a keyword begins.

    Whitespace-separated words are scanned from left to right. As soon as one
    of ``keywords`` (single words or multi-word phrases, matched as substrings
    and without regard to case) starts inside the current word, the remaining
    words from that word on are returned joined by single spaces.

    Parameters
    ----------
    text : str
        Text to search.
    keywords : iterable of str
        Keywords / key phrases that mark the start of the section to keep.

    Returns
    -------
    str
        The tail of ``text`` starting at the first keyword, or ``text``
        unchanged if no keyword occurs (or ``text`` / ``keywords`` is empty).
    """
    text_words = text.split()
    lowered_keywords = [keyword.lower() for keyword in keywords if keyword.strip()]
    if not text_words or not lowered_keywords:
        return text

    # the longest key phrase decides how many consecutive words have to be inspected at once
    window_size = max(len(keyword.split()) for keyword in lowered_keywords)
    lowered_words = [word.lower() for word in text_words]

    for i, word in enumerate(lowered_words):
        window = ' '.join(lowered_words[i:i + window_size])
        for keyword in lowered_keywords:
            position = window.find(keyword)
            # only accept a hit that begins inside the current word; a hit further
            # right in the window belongs to a later word and is found there, so
            # the result starts at the keyword and not a few words before it
            if 0 <= position < len(word):
                return ' '.join(text_words[i:])  # here you can adjust the number of words which will be kept after the keyphrase
    return text


# run extract_text on every job description, passing the skill section keywords as its second argument
job_data['description'] = job_data['description'].apply(extract_text, args=(skill_section_keywords,))

In [38]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# from here on `stopwords` is the list of English stop words, no longer the imported corpus reader
stopwords = stopwords.words('english')

# additional, domain-specific stop words
other_stop_words = ['good', 'drive', 'part', 'time', 'develop', 'one', 'well', 'help', 'opportunities', 'execution', 'requirements', 'service',
                    'people', 'within', 'ability',  'projects', 'us', 'strong', 'environment', 'product', 'customer', 'project',  'company',
                    'services', 'solutions', 'knowledge', 'celonis', 'customers', 'new', 'working', 'support', 'skills', 'experience', 'work']

# removing stopwords: one pass against a single set gives the same result as filtering
# with both lists one after the other, but with constant-time lookups
# NOTE(review): the comparison is case-sensitive; translated text may contain capitalised words - confirm this is intended
all_stop_words = set(stopwords).union(other_stop_words)
job_data['description'] = job_data['description'].apply(
    lambda text: " ".join(word for word in text.split() if word not in all_stop_words)
)


# tokenize
job_data["descriptionTokenized"] = job_data["description"].apply(nltk.word_tokenize)

# function for removing tokens that only contain numbers
def remove_numeric_tokens(tokens):
    """Return a new list with every token except those made up solely of the digits 0-9."""
    kept = []
    for token in tokens:
        if re.match(r'^[0-9]+$', token) is None:
            kept.append(token)
    return kept

# drop purely numeric tokens, then store the remaining tokens as one space-separated string per row
job_data["descriptionTokenized"] = (
    job_data["descriptionTokenized"]
    .apply(remove_numeric_tokens)
    .apply(" ".join)
)

# write the preprocessed dataframe to an Excel file (without the index column)
job_data.to_excel('job_data_preprocessed_description_filtered.xlsx', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
