In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below /kaggle/input, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

From May 8, 2019, to June 22, 2019, Kaggle hosted an analytics competition named "Data Science for Good: City of Los Angeles". The goal of this competition was to "Help the City of Los Angeles to structure and analyze its job descriptions".

The City of Los Angeles faces a big hiring challenge: 1/3 of its 50,000 workers are eligible to retire by July of 2020. The city has partnered with Kaggle to create a competition to improve the job bulletins that will fill all those open positions.

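The City of LA provided some important datasets for this competition: 600+ job bulletins as plain-text files, plus additional supporting data. The goal is to convert a folder full of plain-text job postings into a single structured CSV file and then to use this data to identify language that can negatively bias the pool of applicants, improve the diversity and quality of the applicant pool, and make it easier to determine which promotions are available to employees in each job class.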

Problem Description: The City of LA provided a folder full of plain-text job postings. There are 683 job postings in the folder. Each job posting is a text file containing a raw job description. The goal is to extract particular pieces of important information from each raw text file and store them in a CSV file (each row of the CSV holds the information for a single job). For example, we can extract a job's salary, open date, and requirements, and store each of them in a separate column of the CSV. The City of LA also provides a data dictionary describing which information should be extracted from the job postings.

Let's see an example of job posting first for better understanding.

In [None]:
import re
import pandas as pd
import numpy as np
import re
import glob
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import seaborn as sns
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
%matplotlib inline

**Let's go**
1. Upload the data and convert it to csv

In [None]:
# Show the top-level entries of the competition's input directory.
# `os` is imported in the first cell of the notebook.
print(os.listdir("../input/data-science-for-good-city-of-los-angeles"))


In [None]:
# Compiled regular expressions keyed by the bulletin field / section they target.
# Most patterns capture their payload in a named group (e.g. `title`, `Duties`).
# Section bodies are matched with `(.|\n)*?` so that they can span several lines.
# NOTE(review): none of the functions visible in this part of the notebook
# reference `rx_dict` — confirm whether it is still needed.
rx_dict = {
    # Text on the line(s) preceding "Class Code:".
    'TITLE': re.compile(r'(?P<title>.*)(\n+.*Class Code:)'),
    # Rest of the line after "Class Code:" / "Open Date:".
    'Class Code': re.compile(r'(Class Code:)(?P<classcode>.*)(\n)'),
    'Open Date': re.compile(r'(Open Date:)(?P<open_date>.*)(\n)'),
    # First dollar amount after the heading, and optionally the "to $..." upper end.
    'Annual Salary': re.compile(r'ANNUAL\s*SALARY\s*\n*\s*(?P<Lowerend>\$\d*,\d*)(\s*to\s*(?P<Upperend>\$\d*,\d*)|.*)'),
    # Salary range that follows a "Department of Water and Power" mention.
    'Salary DWP': re.compile(r'ANNUAL\s*SALARY(.|\n)*?.*Department of Water and Power.*(?P<Lowerend>\$\d*,\d*)(\s*to\s*(?P<Upperend>\$\d*,\d*))'),
    # Each section pattern below runs lazily until the next known heading or end of text.
    'DUTIES': re.compile(r'(DUTIES(?P<Duties>(.|\n)*?))(SELECTION PROCESS|APPLICATION DEADLINE|WHERE TO APPLY|MINIMUM\s*QUALIFICATIONS|QUALIFICATIONS|REQUIREMENTS|QUALIFICATION|REQUIREMENT|\Z)'),
    # Alternatives are tried left to right, so the bare 'REQUIREMENT' alternative
    # always wins before 'REQUIREMENT/MINIMUM QUALIFICATION' can be tried.
    'REQUIREMENTS': re.compile(r'((REQUIREMENT|REQUIREMENT/MINIMUM QUALIFICATION|QUALIFICATION|QUALIFICATIONS)(?P<Requirement>(.|\n)*?))(SELECTION PROCESS|APPLICATION DEADLINE|WHERE TO APPLY|\Z)'),
    'WHERE TO APPLY': re.compile(r'(WHERE TO APPLY(?P<WheretoApply>(.|\n)*?))(SELECTION PROCESS|APPLICATION DEADLINE|\Z)'),
    'APPLICATION DEADLINE': re.compile(r'(APPLICATION DEADLINE(?P<ApplicationDeadline>(.|\n)*?))(SELECTION PROCESS|\Z)'),
    'SELECTION PROCESS': re.compile(r'(SELECTION PROCESS(?P<SelectionProcess>(.|\n)*?))(\Z)'),
    # All five sections in one greedy pattern; it only matches when every heading
    # is present in this order.
    # NOTE(review): inside this raw string `\\` matches one literal backslash, so the
    # second alternative only matches "REQUIREMENTS\MINIMUM QUALIFICATIONS" written
    # with a backslash — a forward slash was presumably intended; confirm.
    'ALL':re.compile(r'(DUTIES(?P<Duties>(.|\n)*))((REQUIREMENTS|REQUIREMENTS\\MINIMUM QUALIFICATIONS|QUALIFICATIONS|REQUIREMENT|QUALIFICATION)(?P<Requirements>(.|\n)*))(WHERE TO APPLY(?P<Wheretoapply>(.|\n)*))(APPLICATION DEADLINE(?P<ApplicationDeadline>(.|\n)*))(SELECTION PROCESS(?P<SelectionProcess>(.|\n)*))')
}

In [None]:
def convert_jobs_to_df(
    path='../input/data-science-for-good-city-of-los-angeles/cityofla/CityofLA/Job Bulletins/*.txt',
    raw_text_col_name='raw_job_text'):
    """
    Read every job bulletin matched by ``path`` into a one-column DataFrame.

    Parameters
    ----------
    path : str
        Glob pattern selecting the bulletin text files.
    raw_text_col_name : str
        Name of the column that receives the raw text of each file.

    Returns
    -------
    pandas.DataFrame
        One row per matched file, in sorted path order, with a single column
        ``raw_text_col_name`` holding the file's full text. The frame has
        zero rows when nothing matches ``path``.
    """
    job_list = []

    # glob.glob returns paths in arbitrary, OS-dependent order; sorting keeps
    # the row order (and therefore head()/tail() output) reproducible.
    for file in sorted(glob.glob(path)):
        # errors='replace' substitutes undecodable bytes instead of raising.
        with open(file, 'r', errors='replace') as f:
            job_list.append(f.read())

    return pd.DataFrame({raw_text_col_name: job_list})

In [None]:
def _whole_clean_text(text):
    return text.replace("\n","").replace("\t","").strip()

def pre_processing(dataframe):
    """Strip leading whitespace from every entry of ``raw_job_text``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.
    """
    raw_text = dataframe['raw_job_text']
    dataframe['raw_job_text'] = raw_text.apply(lambda posting: posting.lstrip())
    return dataframe

def extract_job_title(dataframe):
    """Add a ``JOB_CLASS_TITLE`` column holding the first line of each posting.

    The first line is everything before the first newline of ``raw_job_text``,
    cleaned with ``_whole_clean_text``. The frame is modified and returned.
    """
    first_lines = dataframe['raw_job_text'].apply(
        lambda posting: posting.split('\n', 1)[0])
    dataframe['JOB_CLASS_TITLE'] = first_lines.apply(_whole_clean_text)
    return dataframe

def extract_class_code(dataframe):
    """Add a ``JOB_CLASS_NO`` column produced by ``_class_code_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['JOB_CLASS_NO'] = collapsed.apply(_class_code_apply)
    return dataframe

def extract_open_date(dataframe):
    """Add an ``OPEN_DATE`` column produced by ``_open_date_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['OPEN_DATE'] = collapsed.apply(_open_date_apply)
    return dataframe

def extract_exam_type(dataframe):
    """Add a ``TEMP_EXAM_TYPE`` column produced by ``_exam_type_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['TEMP_EXAM_TYPE'] = collapsed.apply(_exam_type_apply)
    return dataframe

def extract_salary(dataframe):
    """Add a ``TEMP_SALARY`` column produced by ``_salary_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['TEMP_SALARY'] = collapsed.apply(_salary_apply)
    return dataframe


def extract_duties(dataframe):
    """Add a ``JOB_DUTIES`` column produced by ``_duties_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['JOB_DUTIES'] = collapsed.apply(_duties_apply)
    return dataframe

def extract_requirements(dataframe):
    """Add a ``TEMP_REQUIREMENTS`` column produced by ``_requirements_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['TEMP_REQUIREMENTS'] = collapsed.apply(_requirements_apply)
    return dataframe

def extract_where_to_apply(dataframe):
    """Add a ``WHERE_TO_APPLY`` column produced by ``_where_to_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['WHERE_TO_APPLY'] = collapsed.apply(_where_to_apply)
    return dataframe

def extract_deadline(dataframe):
    """Add a ``DEADLINE`` column produced by ``_deadline_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['DEADLINE'] = collapsed.apply(_deadline_apply)
    return dataframe

def extract_selection_process(dataframe):
    """Add a ``SELECTION_PROCESS`` column produced by ``_selection_process_apply``.

    Each posting is first collapsed to single-space-separated words. The
    frame is modified and returned.
    """
    collapsed = dataframe['raw_job_text'].apply(
        lambda posting: ' '.join(posting.split()))
    dataframe['SELECTION_PROCESS'] = collapsed.apply(_selection_process_apply)
    return dataframe

In [None]:
def _class_code_apply(text):
    """
    This class extract job class code
    """
    match = re.search('Class Code: (\d+)', text)
    class_code = None
    try:
        class_code = match.group(1)
    except:
        class_code = None
    return class_code
        

def _open_date_apply(text):
    
    """
    Extract entire job open date section
    """
    
    open_date = ''
    result= re.search(
        "(Class Code:|Class  Code:)(.*)(ANNUAL SALARY|ANNUALSALARY)",
        text)
    
    shortContent=''
    if result:
        shortContent=result.group(2).strip()
        result= re.search(
            "Open Date:(.*)REVISED",
            shortContent,flags=re.IGNORECASE)
        if result:
            open_date=result.group(1).strip()
        if open_date=='':
            result= re.search(
                "Open Date:(.*)\(Exam",
                shortContent,flags=re.IGNORECASE)
            if result:
                open_date=result.group(1).strip()
        if open_date=='':
            result= re.search(
                "Open Date:(.*)",
                shortContent,flags=re.IGNORECASE)
            if result:
                open_date=result.group(1).strip()
    return open_date


def _exam_type_apply(text):
    
    """
    Extract entire exam type section
    """
    
    exam_type = ""
    result= re.search(
        "(Class Code:|Class  Code:)(.*)(ANNUAL SALARY|ANNUALSALARY)",
        text)
    
    shortContent=''
    if result:
        shortContent=result.group(2).strip()
        result= re.search(
            "\(+(.*?)\)", shortContent,flags=re.IGNORECASE)
        if result:
            exam_type=result.group(1).strip()
    return exam_type


def _salary_apply(text):
    """
    Extract entire salary section
    """
    salary = ''
    salary_notes = ''
    result=re.search(
        "(ANNUAL SALARY|ANNUALSALARY)(.*?)DUTIES", text)
    if result:
        salContent= result.group(2).strip()
        if "NOTE:" in salContent or "NOTES:" in salContent:
            result=re.search(
                "(.*?)(NOTE:|NOTES:)",
                salContent,flags=re.IGNORECASE)
            if result:
                salary=result.group(1).strip()  
            result= re.search(
                "(NOTE:|NOTES:)(.*)",
                salContent,flags=re.IGNORECASE)
            if result:
                salary_notes= result.group(2).strip()
        else:
            salary = salContent
    else:
        result=re.search(
            "(ANNUAL SALARY|ANNUALSALARY)(.*?)REQUIREMENT",
            text,flags=re.IGNORECASE)
        if result:
            salContent= result.group(2).strip()
            if "NOTE:" in salContent or "NOTES:" in salContent:
                result=re.search(
                    "(.*?)(NOTE:|NOTES:)",
                    salContent,flags=re.IGNORECASE)
                if result:
                    salary=result.group(1).strip()  
                result= re.search(
                    "(NOTE:|NOTES:)(.*)",
                    salContent,flags=re.IGNORECASE)
                if result:
                    salary_notes= result.group(2).strip()
            else:
                salary= salContent
    salary_text = "|||||||||||||||".join([salary, salary_notes])
    return salary_text


def _duties_apply(text):
    """
    Extract job duties section
    """
    duties=''
    result=duties= re.search("DUTIES(.*?)REQUIREMENT", text)
    if result:
        duties= result.group(1).strip()
    return duties

def _requirements_apply(text):
    """
    Extract entire job requirements section
    """
    req='|'.join(["REQUIREMENT/MIMINUMUM QUALIFICATION",
                  "REQUIREMENT/MINUMUM QUALIFICATION",
                  "REQUIREMENT/MINIMUM QUALIFICATION",
                  "REQUIREMENT/MINIMUM QUALIFICATIONS",
                  "REQUIREMENT/ MINIMUM QUALIFICATION",
                  "REQUIREMENTS/MINUMUM QUALIFICATIONS",
                  "REQUIREMENTS/ MINIMUM QUALIFICATIONS",
                  "REQUIREMENTS/MINIMUM QUALIFICATIONS",
                  "REQUIREMENTS/MINIMUM REQUIREMENTS",
                  "REQUIREMENTS/MINIMUM QUALIFCATIONS",
                  "MINIMUM REQUIREMENTS:",
                  "REQUIREMENTS",
                  "REQUIREMENT"])
    
    result= re.search(f"({req})(.*)(WHERE TO APPLY|HOW TO APPLY)", text)
    requirements=''
    if result:
        requirements = result.group(2).strip()
    return requirements


def _where_to_apply(text):
    
    """
    Extract entire 'WHERE TO APPLY' section
    """
    
    where_to_apply = ''
    result= re.search(
        "(HOW TO APPLY|WHERE TO APPLY)(.*)(APPLICATION DEADLINE|APPLICATION PROCESS)",
        text)
    if result:
        where_to_apply= result.group(2).strip()
    else:
        result= re.search(
            "(HOW TO APPLY|WHERE TO APPLY)(.*)(SELECTION PROCESS|SELELCTION PROCESS)",
            text)
        if result:
            where_to_apply= result.group(2).strip()
    return where_to_apply

def _deadline_apply(text):
    """
    Extract entire deadline section
    """
    
    deadline=''
    result= re.search(
        "(APPLICATION DEADLINE|APPLICATION PROCESS)(.*?)(SELECTION PROCESS|SELELCTION PROCESS)",
        text)
    if result:
        deadline= result.group(2).strip()
    else:
        result= re.search(
            "(APPLICATION DEADLINE|APPLICATION PROCESS)(.*?)(Examination Weight:)",
            text)
        if result:
            deadline= result.group(2).strip()
            
    return deadline

def _selection_process_apply(text):
    
    """
    Extract selectioin process section
    """
    
    selection_process=''
    result=selection_process= re.search(
        "(SELECTION PROCESS|Examination Weight:)(.*)(APPOINTMENT|APPOINTMENT IS SUBJECT TO:)",
        text)
    if result:
        selection_process= result.group(2).strip()
    else:
        result=selection_process= re.search(
            "(SELECTION PROCESS|Examination Weight:)(.*)",
            text)
        if result:
            selection_process= result.group(2).strip()
            
    return selection_process

In [None]:
# Read the folder of raw job bulletins into a DataFrame, apply the initial
# text clean-up, then run every section extractor in turn. Each step receives
# the frame, adds its column(s) and hands the same frame on.
data = (
    convert_jobs_to_df()
    .pipe(pre_processing)               # strip leading whitespace
    .pipe(extract_job_title)            # JOB_CLASS_TITLE
    .pipe(extract_class_code)           # JOB_CLASS_NO
    .pipe(extract_open_date)            # OPEN_DATE
    .pipe(extract_exam_type)            # TEMP_EXAM_TYPE
    .pipe(extract_salary)               # TEMP_SALARY
    .pipe(extract_duties)               # JOB_DUTIES
    .pipe(extract_requirements)         # TEMP_REQUIREMENTS
    .pipe(extract_where_to_apply)       # WHERE_TO_APPLY
    .pipe(extract_deadline)             # DEADLINE
    .pipe(extract_selection_process)    # SELECTION_PROCESS
)

# Keep a copy of the whole posting with newlines and tabs removed.
data['raw_clean_job_text'] = data['raw_job_text'].apply(_whole_clean_text)

# Preview the result.
data.head()

**Analytics**

In [None]:
# Preview the last rows of the extracted table.
data.tail()

In [None]:
# (number of postings, number of columns) after extraction.
data.shape

Let's divide the words by gender

In [None]:
# Word stems treated as feminine-coded.
# NOTE(review): most entries are truncated stems (e.g. "cooperat"), so they are
# presumably meant for substring matching against posting text — confirm.
# Some entries are substrings of others (e.g. "interpersona" / "interpersonal").
female = [
    "agree","affectionate","child","cheer","collab","commit","communal",
    "compassion","connect","considerate","cooperat","co-operat",
    "depend","emotiona","empath","feel","flatterable","gentle",
    "honest","interpersonal","interdependen","interpersona","inter-personal",
    "inter-dependen","inter-persona","kind","kinship","loyal","modesty",
    "nag","nurtur","pleasant","polite","quiet","respon","sensitiv",
    "submissive","support","sympath","tender","together","trust","understand",
    "warm","whin","enthusias","inclusive","yield","share","sharin"
]

# Word stems treated as masculine-coded (same conventions as `female`).
masculine = [
    "active","adventurous","aggress","ambitio",
    "analy","assert","athlet","autonom","battle","boast","challeng",
    "champion","compet","confident","courag","decid","decision","decisive",
    "defend","determin","domina","dominant","driven","fearless","fight",
    "force","greedy","head-strong","headstrong","hierarch","hostil",
    "impulsive","independen","individual","intellect","lead","logic",
    "objective","opinion","outspoken","persist","principle","reckless",
    "self-confiden","self-relian","self-sufficien","selfconfiden",
    "selfrelian","selfsufficien","stubborn","superior","unreasonab"
]

# Hyphenated stems taken from the two lists above ("head-strong" is not included).
hyphenated_coded_words = [
    "co-operat","inter-personal","inter-dependen","inter-persona",
    "self-confiden","self-relian","self-sufficien"
]

# Labels for an overall coding verdict, ordered from most feminine to most masculine.
# NOTE(review): not referenced in the visible part of the notebook.
possible_codings = (
    "strongly feminine-coded","feminine-coded","neutral",
    "masculine-coded","strongly masculine-coded"
)

In [None]:
def base_info(data):
    """
    Print a quick per-column health summary of a DataFrame.

    Three blocks are written to stdout: the number of values equal to zero,
    the number of missing values, and the dtype of each column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    # `isnull` is an alias of `isna`, so the "Zero" line used to repeat the
    # NaN counts; count values equal to 0 instead.
    print(f'Zero :\n {(data == 0).sum()}\n')
    print(f'NaN :\n {data.isna().sum()}\n')
    print(f"type of data:\n {data.dtypes}\n")

In [None]:
# Print the base_info summary for the extracted table.
base_info(data)

Delete nan

In [None]:
# Call the method: a bare `data.info` only displays the bound-method repr
# instead of the column / non-null / dtype summary.
data.info()

JOB_CLASS_NO must be int!

In [None]:
# Postings without a class code get 0. Assign the result back to the frame:
# an in-place fillna on a column selection is a chained update, which pandas
# warns about and which does not modify `data` under Copy-on-Write.
data['JOB_CLASS_NO'] = data['JOB_CLASS_NO'].fillna(0)

In [None]:
# Convert the class codes to integers, going through str first.
# NOTE(review): at this point the column presumably mixes digit strings (from the
# regex capture) with the integer 0 used as filler — confirm no value is left missing.
data['JOB_CLASS_NO'] = data['JOB_CLASS_NO'].astype(str).astype(int)

In [None]:
# Summary statistics of the numeric class codes.
data['JOB_CLASS_NO'].describe()

large std

In [None]:
# Count the job titles that contain at least one gender-coded stem.
# The previous pattern was the literal text 'gender_words', which never refers
# to the word lists. Build an alternation from both lists instead and ignore case.
gender_words = female + masculine
gender_pattern = '|'.join(re.escape(word) for word in gender_words)
data.JOB_CLASS_TITLE.str.contains(gender_pattern, case=False, na=False).sum()

In [None]:
# For every masculine-coded stem, count the postings that mention it in each of
# the four text columns and add the per-column counts together.
# The stem itself is searched for (the old pattern r'i' was the literal letter
# "i", giving the same total for every word), case-insensitively and as plain
# text. The running total is kept in `matches` so the builtin `sum` is neither
# shadowed nor relied upon.
text_columns = ['JOB_CLASS_TITLE', 'TEMP_EXAM_TYPE', 'TEMP_REQUIREMENTS', 'SELECTION_PROCESS']
for word in masculine:
    matches = 0
    for column in text_columns:
        matches += data[column].str.contains(word, case=False, regex=False, na=False).sum()
    print(f'{word} : {matches}')

In [None]:
# For every feminine-coded stem, count the postings that mention it in each of
# the four text columns and add the per-column counts together.
# This cell used to repeat the masculine loop verbatim, so the feminine list
# was never counted. It also searched for the literal letter "i" rather than
# the loop variable. The running total is kept in `matches` so the builtin
# `sum` is neither shadowed nor relied upon.
text_columns = ['JOB_CLASS_TITLE', 'TEMP_EXAM_TYPE', 'TEMP_REQUIREMENTS', 'SELECTION_PROCESS']
for word in female:
    matches = 0
    for column in text_columns:
        matches += data[column].str.contains(word, case=False, regex=False, na=False).sum()
    print(f'{word} : {matches}')

the same number of featured words is everywhere