<a href="https://colab.research.google.com/github/schmcklr/skill_extractor/blob/main/(I)_general_preprocessing_and_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Part 1:** General Preprocessing and Translation

This notebook filters out duplicate job postings, translates non-English job postings into English, and performs initial general preprocessing. The results are written to an Excel file, which can be used for further analysis.




# 1. Load raw data
*   Import of job advertisements


In [1]:
import pandas as pd
# Location of the raw job advertisement workbook (GitHub raw-download link)
workbook = (
    'https://github.com/schmcklr/skill_extractor/blob/main/'
    'job_data/job_advertisements.xlsx?raw=true'
)

# Read the "data" worksheet into a DataFrame
job_data = pd.read_excel(io=workbook, sheet_name="data")

# 2. General Preprocessing

*   Convert to lower case
*   Convert dates to datetime
*   Elimination of duplicates


In [2]:
# Initial preprocessing of job advertisements
import pandas as pd
from nltk.corpus import stopwords

# Lower-case every column; each column is cast to str first, so all values
# (including non-text ones) end up as lower-case strings
job_data = job_data.apply(lambda column: column.astype(str).str.lower())

# Parse 'created_at' into datetimes and derive the posting year from it
job_data['created_at'] = pd.to_datetime(job_data['created_at'])
job_data['year'] = job_data['created_at'].dt.year

# Remove duplicates within the same year, keeping the first occurrence:
# first ads sharing a title, then ads sharing a description
job_data = (
    job_data
    .drop_duplicates(subset=['title', 'year'], keep='first')
    .drop_duplicates(subset=['description', 'year'], keep='first')
)

# 3. Translation

3.1 Function for data translation

In [3]:
#!pip install --upgrade googletrans --quiet
!pip install --upgrade translatepy --quiet
!pip install langdetect --quiet
from bs4 import BeautifulSoup
from translatepy import Translator
from langdetect import detect
import langdetect

# Function for language detection
def detect_language(text):
    """Return the language code that langdetect's `detect` reports for `text`.

    Returns the string 'unknown' when detection raises
    `LangDetectException`.
    """
    try:
        language_code = detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        language_code = 'unknown'
    return language_code

# Counters updated by translate_job_description: number of translated ads
# and number of processed ads
translated_job_ads = all_job_adds = 0

# Translator instance used by translate_job_description
translator = Translator()

# Function for translation of job description
def translate_job_description(text, count, html=False):
    """Translate `text` into English unless it is already detected as English.

    Parameters
    ----------
    text : str
        The text to translate (a title, a description, or an HTML fragment).
    count : str
        'y' -> update the global counters `all_job_adds` (every call) and
               `translated_job_ads` (successful translations only).
        'n' -> do not count; instead append `[text, translated]` to the global
               list `translated_job_des` when a translation was attempted
               (for development purposes only).
    html : bool, optional
        When True, HTML tags are stripped from `text` before language
        detection. The translation itself is still run on the original
        `text`, so the markup is kept in the result. Defaults to False so
        that two-argument calls keep working.

    Returns
    -------
    str
        The translated text, or the unchanged `text` when it is blank,
        detected as English, or the translation failed.
    """
    import warnings  # local import so the function does not depend on another cell

    # Module-level state shared with the cells that report/export the results
    global all_job_adds
    global translated_job_ads
    global translated_job_des

    if count == 'y':
        all_job_adds += 1

    if html:
        # Strip the HTML tags (replacing them with spaces) so that the markup
        # does not disturb the language detection
        soup = BeautifulSoup(text, 'html.parser')
        cleaned_text = soup.get_text(separator=' ')
    else:
        cleaned_text = text

    # Nothing to detect or translate in blank text
    if not cleaned_text.strip():
        return text

    # Text that is already English is returned untouched
    if detect_language(cleaned_text) == 'en':
        return text

    try:
        translation = translator.translate(text, "English")
        translated = translation.result
    except Exception as e:
        # Best effort: keep the original text, but make the failure visible
        warnings.warn(
            'Translation failed (' + type(e).__name__ + '); keeping original text.'
        )
        translated = text
    else:
        # Count only translations that actually succeeded
        if count == 'y':
            translated_job_ads += 1

    # Store original and translated text (for development purposes only)
    if count == 'n':
        translated_job_des.append([text, translated])

    return translated

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/814.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/814.4 kB[0m [31m972.8 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/814.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m645.1/814.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m814.4/814.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for safeIO (setup.py) ...

3.2 Function to translate the content within HTML tags when there are no spaces between the tags and the enclosed text (disabled by default, because of higher runtime)

In [4]:
from bs4 import BeautifulSoup
import pandas as pd

def process_html(html_text):
    """Translate the text nodes of an HTML fragment tag by tag.

    Every tag whose content is a single text node has that node passed to
    `translate_job_description` (without counting, html=False) and replaced
    by the result. Tags with mixed content are left unchanged.

    Parameters
    ----------
    html_text : str
        HTML markup to process.

    Returns
    -------
    str
        The processed markup as formatted by BeautifulSoup's `prettify()`.
    """
    soup = BeautifulSoup(html_text, 'html.parser')

    # Iterate over all tags and apply the translation to the element string
    for element in soup.find_all():
        text_node = element.string
        # `.string` also resolves through a single nested child tag, so the
        # same node is reachable from every wrapper; handle it only on its
        # direct parent so each text is processed exactly once
        if text_node is None or text_node.parent is not element:
            continue
        # The `html` argument is passed explicitly: the text node is plain text
        modified_text = translate_job_description(text_node, 'n', False)
        text_node.replace_with(modified_text)

    # Format the HTML text to improve readability and structure
    formatted_html_text = soup.prettify()
    return formatted_html_text

3.3 Translation of job advertisements

In [5]:
# Reset the counters so that re-running this cell reports the numbers of this
# run instead of adding to the totals of a previous run
translated_job_ads = 0
all_job_adds = 0

# Initialize list to store translated job descriptions (for development purposes only)
translated_job_des = []

# Translation of job ads (columns: title, description with HTML tags).
# Only the title pass uses 'y', so each ad is counted once in the summary below.
job_data['title'] = job_data['title'].apply(lambda x: translate_job_description(x, 'y', False))
job_data['rawDescriptionTranslated'] = job_data['description'].apply(lambda x: translate_job_description(x, 'n', True))

# Needed to also translate the headlines and tags without spaces (disabled by default)
#job_data['rawDescriptionTranslatedWithTags'] = job_data['rawDescriptionTranslated'].apply(lambda x: process_html(x))

# User info (number of ads that have been translated)
print('Translation successful! ' + str(translated_job_ads) + '/' + str(all_job_adds) + ' job advertisements were translated.')

# Create and export a dataframe with the text before and after translation
# (for development purposes only)
#translated_descriptions = pd.DataFrame(translated_job_des, columns=['Original Text', 'Translated Text'])
#translated_descriptions.to_excel('translated_job_adx.xlsx', index=False)

Translation successful! 942/3062 job advertisements were translated.


# 4. Export

*   Export dataframe to excel

In [6]:
# Write the preprocessed and translated job advertisements to an Excel
# workbook; the DataFrame index is not written to the sheet
job_data.to_excel(
    excel_writer='job_data_general_preprocessed_and_translated.xlsx',
    index=False,
)