<a href="https://colab.research.google.com/github/schmcklr/skill_extractor/blob/main/skill_extractor_preprocessing_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Part 1:** General Preprocessing and Translation

# 1. Load raw data
*   Import of job advertisements


In [22]:
import pandas as pd
# Fetching raw data
workbook = 'https://github.com/schmcklr/skill_extractor/blob/main/job_data/job_advertisements.xlsx?raw=true'

# Import of tabs
job_data = pd.read_excel(workbook, sheet_name="data")

# Copying unformatted job description
job_data['rawDescription'] = job_data['description']

# 2. Preprocessing (1/2)

*   Convert to lower case
*   Elimination of duplicates
*   Convert dates to datetime
*   Removing HTML-tags
*   Removing most of the punctuation (+, # will not be removed)


In [23]:
# Import and preprocessing of job advertisements
import re
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Convert text to lower case
job_data = job_data.apply(lambda x: x.astype(str).str.lower())

# Elimination of duplicates
job_data = job_data.drop_duplicates(subset=["title"])
job_data = job_data.drop_duplicates(subset=["description"])

# Convert 'created_at' to datetime
job_data['created_at'] = pd.to_datetime(job_data['created_at'])

# Removing html tags
job_data['description'] = job_data['description'].apply(lambda x: BeautifulSoup(x.replace('\n', ' ').replace('\t', ' '), 'html.parser').get_text(separator=' '))
job_data['description_without_html_tags'] = job_data['description']
job_data = job_data.dropna(subset=["description"], axis=0)

# Removing punctuation (+, # will not be removed)
job_data['description'] = job_data['description'].str.replace('[^\w\s+#]', ' ', regex=True)

#3. Translation

3.1 Define function for data translation (any language supported by google translator to english)

In [24]:
!pip install --upgrade googletrans --quiet
!pip install --upgrade translatepy --quiet
!pip install langdetect --quiet

from translatepy import Translator
from langdetect import detect
import langdetect

# Function for language detection
def detect_language(text):
    try:
        return detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        return 'unknown'

# Initialization of global variables
translated_job_ads = 0
all_job_adds = 0

# Initialization of translator
translator = Translator()

# Function for translation of job description
def translate_job_description(text, count):
    # Global keyword to access variables global
    global all_job_adds
    global translated_job_des

    if count == 'y':
        all_job_adds += 1
    # Translate if text not in English
    if detect_language(text) != 'en':
        # Global keyword to access global variables
        global translated_job_ads
        # Variable to count number of translated job ads
        if count == 'y':
            translated_job_ads += 1
        try:
          translation = translator.translate(text, "English")
          translated = translation.result
        except Exception as e:
          translated = text

        # Store translated job description (for development purposes only)
        if count == 'n':
          translated_job_des.append([text, translated])
    else:
        translated = text
    return translated

3.2 Define a function to translate the content within HTML tags when there are no spaces between the tags and the enclosed text

In [25]:
from bs4 import BeautifulSoup
import pandas as pd

def process_html(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')

    # Iterate over the relevant tags and apply the function to the element string
    for element in soup.find_all():
        if element.string is not None:
            modified_text = translate_job_description(element.string, 'n')
            element.string.replace_with(modified_text)

    # Format the HTML text to improve readability and structure
    formatted_html_text = soup.prettify()
    return formatted_html_text

3.3 Translation of job advertisements

In [26]:
# Initialize list to store translated job descriptions
translated_job_des = []

# Translation of job ads (columns: title, description, describtion with html tags)
job_data['title'] = job_data['title'].apply(lambda x: translate_job_description(x, 'y'))
job_data['description'] = job_data['description'].apply(lambda x: translate_job_description(x, 'n'))
job_data['rawDescriptionTranslated'] = job_data['rawDescription'].apply(lambda x: translate_job_description(x, 'n'))

# Needed to also translate the headlines and tags without spaces
job_data['rawDescriptionTranslatedWithTags'] = job_data['rawDescriptionTranslated'].apply(lambda x: process_html(x))

# User info (number of ads that have been translated )
print('Translation successful! ' + str(translated_job_ads) + '/' + str(all_job_adds) + ' job advertisements were translated.')

# create a DataFrame with two columns using the translated_job_des list and export (for development purposes only)
#translated_descriptions = pd.DataFrame(translated_job_des, columns=['Original Text', 'Translated Text'])
#translated_descriptions.to_excel('translated_job_adx.xlsx', index=False)

# Export translated text (for development purposes only)
#translated_descriptions.to_excel('translated_job_adx.xlsx', index=False)

Translation successful! 841/2694 job advertisements were translated


# 4. Preprocessing (2/2)

*   Removing stopwords
*   Tokenization
*   Remove tokens that only contain numbers
*   Export dataframe to excel

In [27]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

# Function for removing tokens that only contain numbers
def remove_numeric_tokens(tokens):
    return [token for token in tokens if not bool(re.match(r'^[0-9]+$', token))]

# Define stopwords
stopwords = stopwords.words('english')

# Removing stopwords
job_data['description'] = job_data['description'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))

# Tokenize job description
job_data["descriptionTokenized"] = job_data["description"].apply(nltk.word_tokenize)

# Remove tokens that only contain numbers
job_data["descriptionTokenized"] = job_data["descriptionTokenized"].apply(lambda x: remove_numeric_tokens(x))
job_data["descriptionTokenized"] = job_data["descriptionTokenized"].apply(lambda x: " ".join(x))

# Export dataframe to excel
job_data.to_excel('job_data_preprocessed_description_filtered.xlsx', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
