<a href="https://colab.research.google.com/github/schmcklr/skill_extractor/blob/main/skill_extractor_preprocessing_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Load raw data
*   Import of job advertisements


In [4]:
import pandas as pd
# Source of the raw data: Excel workbook hosted in the project's GitHub repository
workbook = 'https://github.com/schmcklr/skill_extractor/blob/main/job_data/job_advertisements.xlsx?raw=true'

# Read the "data" tab and keep a copy of the description column under
# 'rawDescription' before any cleaning step touches 'description'
job_data = pd.read_excel(workbook, sheet_name="data").assign(
    rawDescription=lambda frame: frame['description']
)

# 2. Preprocessing (1/2)

*   Convert to lower case
*   Elimination of duplicates
*   Convert dates to datetime
*   Removing HTML-tags
*   Removing most of the punctuation (+, # will not be removed)


In [5]:
# Import and preprocessing of job advertisements
import re
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Drop rows without a description first, while missing values are still real
# NaNs: the string cast below turns NaN into the literal text 'nan', which
# dropna can no longer detect.
job_data = job_data.dropna(subset=["description"], axis=0)

# Convert text to lower case (every column is cast to str, including 'rawDescription')
job_data = job_data.apply(lambda column: column.astype(str).str.lower())

# Elimination of duplicates (first by title, then by description)
job_data = job_data.drop_duplicates(subset=["title"])
job_data = job_data.drop_duplicates(subset=["description"])

# Convert 'created_at' to datetime
job_data['created_at'] = pd.to_datetime(job_data['created_at'])

# Removing html tags: newlines/tabs become spaces, then only the text nodes are kept
job_data['description'] = job_data['description'].apply(
    lambda html: BeautifulSoup(html.replace('\n', ' ').replace('\t', ' '), 'html.parser').get_text(separator=' ')
)
# Snapshot of the tag-free text before punctuation is stripped
job_data['description_without_html_tags'] = job_data['description']

# Removing punctuation (+, # will not be removed); raw string so that \w and \s
# reach the regex engine without Python treating them as string escapes
job_data['description'] = job_data['description'].str.replace(r'[^\w\s+#]', ' ', regex=True)

# 3. Translation

3.1 Define function for data translation

In [6]:
!pip install --upgrade googletrans
!pip install --upgrade translatepy
!pip install langdetect

# Function for data translation
from translatepy import Translator
from langdetect import detect

# Counters maintained by translate_job_description:
# texts seen and texts actually sent to the translator
all_job_adds = 0
translated_job_ads = 0

# Single translatepy client shared by all translation calls
translator = Translator()


# Function for translation of job description
def translate_job_description(text, count):
    """Translate ``text`` to English unless it is already detected as English.

    Args:
        text: The text to inspect and, if necessary, translate.
        count: Pass 'y' to include this call in the global counters
            ``all_job_adds`` and ``translated_job_ads``; any other value
            leaves the counters untouched.

    Returns:
        The English translation, or ``text`` unchanged when it is detected as
        English or when no language can be detected at all.
    """
    # Local import keeps the exception type in scope wherever this function is defined
    from langdetect import LangDetectException

    # Both counters live at module level and are updated in place
    global all_job_adds, translated_job_ads

    if count == 'y':
        all_job_adds += 1

    try:
        language = detect(text)
    except LangDetectException:
        # langdetect raises when the text offers nothing to classify (e.g. empty
        # or digits only); keep such text as-is instead of aborting the whole run
        return text

    if language == 'en':
        return text

    if count == 'y':
        translated_job_ads += 1
    return translator.translate(text, "English").result

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 KB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sniffio
  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
Collecting hstspreload
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.*
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 KB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.*
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting translatepy
  Downloading translatepy-2.3-py3-none-any.whl (814 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/814.4 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.9/814.4 KB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m814.4/814.4 KB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safeIO>=1.2
  Downloading safeIO-1.2.tar.gz (8.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyuseragents
  Downloading pyuseragents-1.0.5-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 KB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting inquirer>=2.8.0
  Downloading inquirer-3.1.3-py3-none-any.whl (18 kB)
Collecting blessed>=1.19.0
  Downl

3.2 Translation of job advertisements

In [7]:
# Reset the counters so that re-running this cell reports the numbers of this
# run only instead of adding to the totals of earlier runs
translated_job_ads = 0
all_job_adds = 0

# Translation of job title and description; only the title pass ('y') feeds
# the counters, so every advertisement is counted exactly once
job_data['title'] = job_data['title'].apply(translate_job_description, args=('y',))
job_data['description'] = job_data['description'].apply(translate_job_description, args=('n',))

# User info (number of ads that have been translated)
print(f'Translation successful! {translated_job_ads}/{all_job_adds} job advertisements were translated')

Translation successful! 849/2694 job advertisements were translated


# 4. Extract skill section from job description
* keep only text after the first occurrence of a skill section keyword

In [8]:
# Identifying skill section of job description (keep only text after the first occurrence of a skill section keyword)

# Specifies skill section keywords.
# "what you ll need" is the form "what you'll need" takes once the punctuation
# removal has replaced the apostrophe with a space; the original spelling is
# kept as well in case a later step reintroduces the apostrophe.
skill_section_keywords = [
    "qualification", "qualifications", "competence", "competencies",
    "skill", "skills", "requirement", "required", "requirements", "knowledge",
    "expected", "expectations", "we are looking for", "profile",
    "what you'll need", "what you ll need",
    "responsibilities", "responsibility",
]

# Function for cutting a description down to its skill section (ignores case)
def extract_text(text, keywords):
    """Return ``text`` from the first keyword occurrence onward.

    The text is split on whitespace and re-joined with single spaces. The
    earliest position at which any keyword occurs (case-insensitively) is
    located, and everything from the start of the word containing that
    position is returned, so the keyword itself is kept.

    Args:
        text: The description to search.
        keywords: Iterable of keyword strings; multi-word keywords are matched
            against the single-space-normalised text.

    Returns:
        The whitespace-normalised remainder starting at the first keyword, or
        ``text`` unchanged when no keyword occurs or ``keywords`` is empty.
    """
    keywords = list(keywords)
    if not keywords:
        return text

    # Collapse all whitespace runs so multi-word keywords can match across line breaks
    normalized = ' '.join(text.split())

    # One alternation finds the leftmost occurrence of any keyword in a single
    # scan; re.escape keeps characters such as '+' or '#' literal
    pattern = re.compile('|'.join(re.escape(keyword) for keyword in keywords), re.IGNORECASE)
    hit = pattern.search(normalized)
    if hit is None:
        return text

    # Back up to the beginning of the word in which the keyword starts, so the
    # result never begins in the middle of a word
    section_start = normalized.rfind(' ', 0, hit.start()) + 1
    return normalized[section_start:]


# Reduce every description to its skill section
job_data['description'] = job_data['description'].apply(extract_text, args=(skill_section_keywords,))

# 5. Preprocessing (2/2)

*   Removing stopwords
*   Tokenization
*   Remove tokens that only contain numbers
*   Export dataframe to excel

In [9]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources used below (stop word corpus and punkt tokenizer data)
nltk.download('stopwords')
nltk.download('punkt')

# Define stopwords (the name now refers to the English word list, no longer to the corpus reader)
stopwords = stopwords.words('english')
# Set copy of the list so each membership test is a hash lookup rather than a list scan
stopword_set = set(stopwords)

# Removing stopwords; words are compared in lower case because the translation
# step runs after the lower-casing and may hand back capitalised text
job_data['description'] = job_data['description'].apply(
    lambda description: " ".join(
        word for word in description.split() if word.lower() not in stopword_set
    )
)

# Tokenize job description
job_data["descriptionTokenized"] = job_data["description"].apply(nltk.word_tokenize)

# Function for removing tokens that only contain numbers
def remove_numeric_tokens(tokens):
    """Return the tokens that do not consist solely of the digits 0-9, in their original order."""
    digits_only = re.compile(r'^[0-9]+$')
    kept = []
    for token in tokens:
        if digits_only.match(token) is None:
            kept.append(token)
    return kept

# Drop purely numeric tokens and store what remains as one space-separated string
job_data["descriptionTokenized"] = job_data["descriptionTokenized"].apply(
    lambda tokens: " ".join(remove_numeric_tokens(tokens))
)

# Write the preprocessed dataframe to an Excel file (row index omitted)
job_data.to_excel('job_data_preprocessed_description_filtered.xlsx', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
