# Data Science for Good: City of Los Angeles


### Introduction

This kernel produces a structured data file ([here](#sdf)) in the format required by the competition.

I am happy for people who want to focus on the recommendation part of the task to use the file.

I have then used the data to produce some diagrams which explore the [explicit links](#el)  between job classes.

#### Explicit links 

The first set of plots shows the [subordinate](#el) positions to a role, i.e. who could be promoted.

The second set of plots shows the subordinate positions to one of the [difficult](#difficult) to fill roles reported by the hosts.

The third set of plots shows what [promotions](#promotional) are available to a role. There are some amazing routes...



## Data

#### 683 bulletins were provided. Of these 2 were duplicate job classes and only the latest bulletin has been included:

CHIEF CLERK POLICE 1249 083118.txt

SENIOR UTILITY SERVICES SPECIALIST 3573 113018.txt

#### In 3 cases the body text does not match the file name and so these bulletins have also been excluded:

ANIMAL CARE TECHNICIAN SUPERVISOR 4313 122118.txt

WASTEWATER COLLECTION SUPERVISOR 4113 121616.txt

SENIOR EXAMINER OF QUESTIONED DOCUMENTS 3231 072216 REVISED 072716.txt

#### In one case a class code is not provided:

Vocational Worker DEPARTMENT OF PUBLIC WORKS.txt

The structured data file therefore contains 678 jobs with 677 class codes.



In [None]:

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import networkx as nx
import nltk, string
import matplotlib.pyplot as plt
import re,glob
import time
import os
import random
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

%matplotlib inline

import os
print(os.listdir("../input"))

# Shared text-processing helpers: a whitespace tokenizer and a WordNet lemmatizer.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# NLTK's English stop words, extended with the extra word 'gives'.
stop_words = {*stopwords.words('english'), 'gives'}


# Translation table mapping every character of `punct` (only the hyphen) to a space.
punct = '-'
remove_punctuation_map = {ord(symbol): ' ' for symbol in punct}


def normalize(text):
    """Lowercase *text*, turn hyphens into spaces and tokenize it.

    Returns the token list produced by ``nltk.word_tokenize``.
    """
    lowered = text.lower()
    spaced = lowered.translate(remove_punctuation_map)
    return nltk.word_tokenize(spaced)

def clean_text(text):
    """Lowercase *text*, replace hyphens with spaces and lemmatize each word.

    Words are split on whitespace with ``w_tokenizer``, lemmatized with
    ``lemmatizer`` and re-joined with single spaces.
    """
    spaced = text.lower().translate(remove_punctuation_map)
    lemmas = [lemmatizer.lemmatize(token) for token in w_tokenizer.tokenize(spaced)]
    return ' '.join(lemmas)

# Number words "zero" .. "eighteen"; each word's list position is its numeric value.
numbers = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight","nine", "ten","eleven","twelve","thirteen","fourteen","fifteen","sixteen","seventeen","eighteen"]
# Lookup used by convert_words_to_number: number word -> int.
numwords = {word: idx for idx, word in enumerate(numbers)}



## Getting the headings from the Sample Template

In [None]:
# Read only the header line of the sample export template; its comma-separated
# names define the columns of the structured data file.
path = '../input/cityofla/CityofLA/Additional data/'
filename = 'sample job class export template.csv'
with open(path + "/" + filename, 'r', errors='ignore') as f:
    c_h = f.readline().replace("\n", "")

# column_heads keeps the names in file order; col_heads maps each name to its position.
column_heads = c_h.split(',')
col_heads = {word: i for i, word in enumerate(column_heads)}
print (column_heads)

# Loading in the Job Classes and adding the missing ones

In [None]:
# Load the known job titles, one per line, from job_titles.csv.
path = '../input/cityofla/CityofLA/Additional data/'
filename = 'job_titles.csv'
with open(path + "/" + filename, 'r', errors='ignore') as f:
    raw_titles = f.readlines()
len_j_t = len(raw_titles)

j_t = []
for raw_title in raw_titles:
    title = raw_title.replace("\n", "")
    # This entry carries department text instead of a clean title.
    if title == 'Vocational Worker  DEPARTMENT OF PUBLIC':
        title = 'VOCATIONAL WORKER'
    j_t.append(title)

# Titles added by hand on top of the ones read from job_titles.csv.
j_t.extend([
    'SEASONAL POOL MANAGER',
    'OPEN WATER LIFEGUARD',
    'ELECTRICAL ENGINEER',
    'WASTEWATER TREATMENT MECHANIC',
    'TELECOMMUNICATIONS PLANNER',
    'CONSTRUCTION EQUIPMENT SERVICE SUPERVISOR',
    'COMPUTER OPERATOR',
    'SPECIAL PROGRAM ASSISTANT',
    'FIRE PROTECTION ENGINEER',
    'PRINT SHOP TRAINEE',
    'IMPROVEMENT ASSESSOR',
    'ENGINEERING ASSOCIATE',
    'SAFETY ENGINEER PRESSURE VESSELS',
    'SENIOR ROOFER',
    'SENIOR CLERK TYPIST',
    'SENIOR STREET SERVICES INVESTIGATOR',
    'LIBRARY CLERICAL ASSISTANT',
    'WASTEWATER TREATMENT ELECTRICIAN',
    'PERFORMING ARTS PROGRAM COORDINATOR',
    'PROCUREMENT AIDE',
    'LOAD DISPATCHER',
    'ASSOCIATE ZOO CURATOR',
])

#Executive, Senior, Coordinating or Web Content Producer.?

# Longest titles (by word count) first: find_experience removes each match
# from the line it scans, so a shorter title contained in a longer one is
# not reported a second time.
j_t.sort(key=lambda title: len(title.split()), reverse=True)

#print (j_t)

# Displaying the Sample Template

In [None]:
# path = '../input/cityofla/CityofLA/Additional data/'
# sample_job_class = pd.read_csv(path + 'sample job class export template.csv')
# df_job_class = sample_job_class.copy()
# sample_job_class.head()

# Functions called by the cell that produces the structured data file

In [None]:
def find_experience(line,job):
    """Find the job-class titles a requirement line names as prior experience.

    The line and ``job`` are upper-cased, the line is cut back to the text in
    front of the markers ASSISTING, SUBSTITUT, YEARS OF WHICH and
    INCLUDING ... YEARS, and every title in the module-level list ``j_t`` is
    then searched for in what is left.  Each hit is removed from the working
    line before the next title is tried.

    Titles are matched literally (``re.escape``), so a title containing a
    regex metacharacter can neither raise nor match unrelated text.

    Args:
        line: one requirement line from a bulletin.
        job: title of the bulletin being parsed; the job's own title and
            'ASSISTANT ' + title are never reported.

    Returns:
        (exp, last_exp): ``exp`` is the comma-separated list of titles found,
        each with any " I"/" II"/" III" grade suffix that followed it;
        ``last_exp`` is the found title whose first occurrence lies furthest
        into the full upper-cased line ('' if no hit starts after position 0).
    """
    max_pos = 0
    exp = ''
    last_exp = ''
    line = line.upper()
    full_line = line
    job = job.upper()

    # Keep only the text in front of each marker; the cuts are applied in turn.
    for cut_pattern in (r'(.*)ASSISTING(.*)',
                        r'(.*)SUBSTITUT(.*)',
                        r'(.*)YEARS OF WHICH(.*)',
                        r'(.*)INCLUDING.*?YEARS(.*)'):
        cut = re.search(cut_pattern, line)
        if cut:
            line = cut.group(1)

    # Titles that are skipped when the line contains the given phrase.
    not_experience = {
        'FIREFIGHTER': 'FIREFIGHTER ENDORSEMENT',
        'ARCHITECT': 'ARCHITECT LICENSE',
    }

    for title in j_t:
        if not title:
            # An empty title would match at every separator in the line.
            continue
        escaped = re.escape(title)
        graded_pattern = escaped + r'( I{0,3}[ |\.|;|,])'
        plain_pattern = escaped + r'( |\.|;|,)'

        if job == title:
            # Remove mentions of the bulletin's own title from the requirement text.
            own = re.search(plain_pattern, line)
            if own:
                line = line.replace(own.group(0), '')
            continue
        if job == 'ASSISTANT ' + title:
            continue

        found = re.search(plain_pattern, line)
        if not found:
            continue
        excluded_phrase = not_experience.get(title)
        if excluded_phrase and re.search(excluded_phrase, line):
            continue

        graded = re.findall(graded_pattern, line)
        if graded:
            for suffix in graded:
                exp = exp + title + suffix + ', '
        else:
            exp = exp + title + ', '
        line = line.replace(found.group(0), '')

        # Removing earlier hits can join text that was not adjacent in the
        # original line, so the title may be absent from full_line.
        first = re.search(escaped, full_line)
        if first is not None and first.start(0) > max_pos:
            max_pos = first.start(0)
            last_exp = title

    exp = exp.rstrip(', ')
    return exp, last_exp

# TEST CODE
# line = "1. Two years of full-time paid experience as a Safety Engineer Pressure Vessels with the City of Los Angeles; and"
# job = 'Senior Safety Engineer Pressure Vessels'

# find_experience(line,job)

In [None]:
def examination_type(line, level):
    """Collect the examination-basis sentence into the global row's EXAM_TYPE.

    At ``level`` 0 the line is checked for the start of the sentence
    ("THIS EXAM... IS TO BE GIVEN ..." or "FOR EXEMPT EMPLOYEES SEEKING TO
    BECOME ..."); on a hit the text is stored and 1 is returned.  At any other
    level the line is treated as the continuation of that sentence: unless it
    contains 'The City of' it is appended and the wording is normalised with a
    fixed list of replacements.  The continuation always returns 0.

    Returns:
        The level to pass in with the next line.
    """
    if level == 0:
        given = re.search('(THIS EXAM)(.*?)( IS TO BE GIVEN)(.*)', line)
        if given:
            row[col_heads['EXAM_TYPE']] = given.group(4)
            level = 1
        if re.search('FOR EXEMPT EMPLOYEES SEEKING TO BECOME(.*)', line):
            row[col_heads['EXAM_TYPE']] = line
            level = 1
        return level

    exam_col = col_heads['EXAM_TYPE']
    if re.search('The City of', line):
        # Not a continuation of the sentence: only tidy what is already stored.
        row[exam_col] = row[exam_col].replace('ONLY ON A', 'ON A')
        return 0

    # Applied strictly in this order; later entries rely on earlier ones.
    wording_fixes = (
        ('AN AN', 'AN'),
        ('AND OPEN', 'AND AN OPEN'),
        ('AND OPEN', 'AND AN INTERDEPARTMENTAL'),
        ('BOTH ON', 'ON'),
        ('ON BOTH', 'ON'),
        ('ON A ', 'ON AN '),
        ('AND ON AN', 'AND AN'),
        ('AND ON AN', 'AND AN'),
        ('TO ON', 'ON'),
        ('ONLY ON A', 'ON A'),
        ('BASIS ONLY', 'BASIS'),
        ('INTERDEPARMENTAL', 'INTERDEPARTMENTAL'),
        ('BASIS   NVVC', 'BASIS'),
        ('AND INTERDEPARTMENTAL', 'AND AN INTERDEPARTMENTAL'),
        ('COMPETITVE', 'COMPETITIVE'),
        ('ON AN OPEN COMPETITIVE AND AN INTERDEPARTMENTAL PROMOTIONAL BASIS',
         'ON AN INTERDEPARTMENTAL PROMOTIONAL AND AN OPEN COMPETITIVE BASIS'),
    )
    exam_type = row[exam_col] + ' ' + line
    for old, new in wording_fixes:
        exam_type = exam_type.replace(old, new)
    row[exam_col] = exam_type
    return 0

In [None]:
def driver_licence(line):
    """Record driver's-licence requirements found in one bulletin line.

    Works on the lower-cased line and writes into the module-level ``row``:
      * DRIVERS_LICENSE_REQ - 'P' when the wording after "driver" is
        "may ... required/approved", 'R' when it is "... required" or when a
        licence class is named without "may" in front of it;
      * DRIV_LIC_TYPE - the upper-cased class text between "class" and
        "driver" followed by a space (e.g. 'B '), appended for each hit;
      * ADDTL_LIC - upper-cased text of a "license with ... endorsement" clause.

    Wordings the patterns are meant to cover:
      a valid california driver's license is required.
      a valid california driver's license may be required.
      for positions requiring a valid class b driver's license
      a valid unrestricted california commercial class a or class b driver's
        license and valid medical certificate approved
      a valid california commercial class b driver's license
      Some positions may require a valid California Class C and/or Class B
        driver's license.

    Returns None.
    """
    line = line.lower()

    driver_lic_pattern = r'(driver)(.*)(required)'
    driver_lic_may_pattern = r'(driver)(.*)(may)(.*)(required|approved)'
    # group(1) = text before "class", group(3) = class text up to "driver".
    # The class text may not cross a full stop.  The previous form used a
    # zero-width lookahead here and so only matched the literal "classdriver".
    driver_lic_class_pattern = r'(.*?)(class)([^.]*?)(driver)(.*?)(;|\.|:|,|\r|\n)'
    driver_lic_may_class_pattern = r'positions(.*) (requir)(.*?)(class)(.*)(driver)'
    driver_lic_endorse_pattern = r'(license with )(.*)(endorsement.)'

    if re.search(driver_lic_may_pattern, line):
        row[col_heads['DRIVERS_LICENSE_REQ']] = 'P'
    elif re.search(driver_lic_pattern, line):
        row[col_heads['DRIVERS_LICENSE_REQ']] = 'R'

    class_match = re.search(driver_lic_class_pattern, line)
    if class_match and re.search('may', class_match.group(1)) is None:
        lclass = class_match.group(3).strip(" ").strip("'").upper()
        type_col = col_heads['DRIV_LIC_TYPE']
        row[type_col] = row[type_col] + lclass + " "
        # A class named without "may" in front of it counts as required.
        row[col_heads['DRIVERS_LICENSE_REQ']] = 'R'

    may_class_match = re.search(driver_lic_may_class_pattern, line)
    if may_class_match:
        # Only used when nothing firmer has been recorded yet.
        if row[col_heads['DRIVERS_LICENSE_REQ']] == '':
            row[col_heads['DRIV_LIC_TYPE']] = may_class_match.group(5).upper()
            row[col_heads['DRIVERS_LICENSE_REQ']] = 'P'

    endorse_match = re.search(driver_lic_endorse_pattern, line)
    if endorse_match:
        lclass = endorse_match.group(2) + endorse_match.group(3)
        row[col_heads['ADDTL_LIC']] = lclass.upper()

    return

In [None]:

def _salary_text(part):
    """Return the salary quoted in *part* ('' if none); a range wins over a flat rate."""
    found = ''
    flat = re.search(r'(\$|\$ )(\d{4,8})(.*)(flat)(.*?)(rated)', part)
    if flat:
        found = flat.group(1) + flat.group(2) + '(flat-rated)'
    span = re.search(r'(\$|\$ )(\d{4,8})(-)(\$|)(\d{4,8})', part)
    if span:
        found = span.group(0)
    return found


def entry_salaries (body,row):
    """Fill ENTRY_SALARY_GEN and ENTRY_SALARY_DWP from the salary section text.

    Commas and asterisks are removed, the text is lower-cased and ' to ' is
    turned into '-'.  Text before "department of water and power" is treated
    as the general salary, text after it as the DWP salary.  In each part the
    first range ($NNNN-$NNNN) is used, otherwise a flat-rated amount.  Stored
    values are prefixed with a backslash.  A part containing "scale pending"
    is stored verbatim in its own column.

    Args:
        body: accumulated text of the ANNUAL SALARY section.
        row: output row (list indexed through ``col_heads``); modified in place.

    Returns:
        The same ``row`` object.
    """
    # Chain the clean-ups: restarting from `body` would drop the comma removal.
    line = body.replace(',', '').replace('*', '').lower().replace(' to ', '-')

    dwp_split = re.search('(.*)(department of water and power)(.*)', line)
    if dwp_split:
        part1 = dwp_split.group(1)
        part2 = dwp_split.group(3)
    else:
        part1 = line
        part2 = ''

    part1s = _salary_text(part1)
    part2s = _salary_text(part2)

    if part1s != '':
        row[col_heads['ENTRY_SALARY_GEN']] = "\\" + part1s
    if part2s != '':
        row[col_heads['ENTRY_SALARY_DWP']] = "\\" + part2s
    if re.search('scale pending', part1):
        row[col_heads['ENTRY_SALARY_GEN']] = part1
    if re.search('scale pending', part2):
        # The DWP text belongs in the DWP column, not the general one.
        row[col_heads['ENTRY_SALARY_DWP']] = part2

    return row


In [None]:
def req_clr (row):
    """Blank every per-requirement column of *row* and return the same list.

    These are the columns written once per requirement line; clearing them
    stops values from one requirement carrying over into the next.
    """
    requirement_columns = (
        'EDUCATION_YEARS',
        'SCHOOL_TYPE',
        'EDUCATION_MAJOR',
        'EXPERIENCE_LENGTH',
        'FULL_TIME_PART_TIME',
        'COURSE_LENGTH',
        'COURSE_COUNT',
        'COURSE_SUBJECT',
        'MISC_COURSE_DETAILS',
        'EXP_JOB_CLASS_FUNCTION',
        'EXP_JOB_CLASS_ALT_RESP',
        'EXP_JOB_CLASS_TITLE',
        'REQUIREMENT_SET_ID',
        'REQUIREMENT_SUBSET_ID',
    )
    for column in requirement_columns:
        row[col_heads[column]] = ''
    return row

In [None]:
def fill_row(title,content):
    """Store *content* in the *title* column of the module-level ``row``.

    Always returns True so callers can use the result as a "something was
    found" flag.
    """
    position = col_heads[title]
    row[position] = content
    return True

def clr_and_fill_row(title,content):
    """Overwrite the *title* column of the module-level ``row`` with *content*.

    Unlike fill_row this returns None.
    """
    position = col_heads[title]
    row[position] = content
    return
 

In [None]:
#modify some text to make searches work

def clean_txt (line):
    """Rewrite a few phrasings so the later pattern searches work.

    The substitutions are applied in order to the whole line.
    """
    substitutions = (
        ('health and safety', ' health & safety'),
        ('Sr.', 'Senior'),
        ('Pre-', 'Pre'),
        ('Construction Maintenance Superintendent', 'Construction and Maintenance Superintendent'),
    )
    for old, new in substitutions:
        line = line.replace(old, new)
    return line

def  convert_words_to_number (line):
    """Replace number words ("two", "Three", ...) in *line* with digit strings.

    Hyphens are first turned into spaces and the line is split on whitespace.
    Every word is written back preceded by a single space, so a non-empty
    result starts with a space and runs of whitespace collapse to one space.
    """
    pieces = []
    for token in line.replace ('-',' ').split():
        key = token.lower()
        pieces.append(str(numwords[key]) if key in numwords else token)
    return "".join(" " + piece for piece in pieces)

# Requirement numbering: an optional single leading character, one digit, a full stop.
# group(2) is the digit, group(4) the rest of the line.
num_index_pattern = r'^(.|)(\d)(\.)(.*)'
# Sub-requirement lettering: optional leading character (or character + "("),
# one letter, then "." or ")".  group(2) is the letter, group(4) the rest.
char_index_pattern = r'(?i)^(.||.\()([a-z])(\.|\))(.*)'
# School wordings recognised in a requirement; longer alternatives come first.
edpattern = '(' + '|'.join([
    'high school, university, college, trade or technical school',
    'trade school or college',
    'college or trade school',
    'college or university or trade school',
    'college or university',
    'college',
    'university',
    'American Bar Association accredited law school',
]) + ')'


def requirement_and_sub(line, row):
    """Set the requirement set / subset id of *row* from the start of *line*.

    A leading "1." style index sets REQUIREMENT_SET_ID (subset untouched);
    otherwise a leading "a." / "a)" style index sets REQUIREMENT_SUBSET_ID
    (upper-cased, set id untouched); otherwise both ids are blanked.

    Returns the same ``row``.
    """
    numbered = re.search(num_index_pattern, line)
    if numbered:
        row[col_heads['REQUIREMENT_SET_ID']] = numbered.group(2)
        return row

    lettered = re.search(char_index_pattern, line)
    if lettered:
        row[col_heads['REQUIREMENT_SUBSET_ID']] = lettered.group(2).upper()
        return row

    row[col_heads['REQUIREMENT_SET_ID']] = ''
    row[col_heads['REQUIREMENT_SUBSET_ID']] = ''
    return row

def education(line, row):
    """Pull education details out of one requirement line.

    Depending on the wording found, fills EDUCATION_MAJOR, SCHOOL_TYPE,
    EDUCATION_YEARS, COURSE_SUBJECT, COURSE_LENGTH and COURSE_COUNT through
    fill_row / clr_and_fill_row (which write to the module-level ``row``).

    Returns:
        (row, exp_found): the ``row`` argument, and the result of the last
        fill_row call made here (False when none was made).
    """
    exp_found = False

    # Major / degree subject; lists are re-joined with '|'.
    major = re.search(r'(?i)(major in|degree in|college in)(.*?)(;|including|and |from|\.)', line)
    if major and not re.search('may be substituted', line):
        subjects = major.group(2).replace(", ", "|").replace(" or ", "|")
        exp_found = fill_row('EDUCATION_MAJOR', subjects)

    # Kind of school, and the number of years when "<n> year... college/university" is present.
    school = re.search(edpattern, line)
    if school:
        clr_and_fill_row('SCHOOL_TYPE', school.group(0))
        years = re.search(r'(\d{1,3})(.)(year)(.{1,5})(college or university or trade school|college or university|college|university)', line)
        if years:
            exp_found = fill_row('EDUCATION_YEARS', years.group(1))

    # Course length: "<n>S" for semester units plus "<n>Q" for quarter units.
    # It is only stored when quarter units are mentioned.
    course_len = ''
    semester = re.search(r'(\d{1,3})(.)(semester)', line)
    if semester:
        course_len = semester.group(1) + 'S'
    quarter = re.search(r'(\d{1,3})(.)(quarter units)(.*?)( from|in|with|of|at)(.*)(;|\.)', line)
    if quarter:
        course_len = course_len + quarter.group(1) + 'Q'
        exp_found = fill_row('COURSE_SUBJECT', quarter.group(6))
        exp_found = fill_row('COURSE_LENGTH', course_len)

    courses = re.search(r'(?i)(completion of )(\d{1,3})(.)(course)', line)
    if courses:
        exp_found = fill_row('COURSE_COUNT', courses.group(2))

    # High-school wording overrides any school type set above.
    high_school = 'high school or G.E.D. equivalent'
    if re.search(high_school, line):
        exp_found = fill_row('SCHOOL_TYPE', high_school)
    return row,exp_found

def cert_and_completion(line, row):
    """Store certificate / course-completion wording in MISC_COURSE_DETAILS.

    Several patterns are tried in turn and each hit overwrites the previous
    one, so the last matching pattern decides the stored text.

    Returns:
        (row, exp_found): the ``row`` argument, and the result of the last
        fill_row call made here (False when none was made).
    """
    exp_found = False

    # Certification wording, from the most specific phrasing to the most general.
    possession = re.search(r'(?i)(possession)(.*?)(certificate)(.*)(;|\.)', line)
    if possession:
        exp_found = fill_row('MISC_COURSE_DETAILS', possession.group(0))

    certification = re.search(r'(?i)(certification)(.*?)(;|\.)', line)
    if certification:
        exp_found = fill_row('MISC_COURSE_DETAILS', certification.group(0))

    certificate = re.search(r'(?i)(.*?)(certificate)(.*?)(;|\.)', line)
    if certificate:
        # Keep the text from "certificate" up to and including the terminator.
        detail = certificate.group(2) + certificate.group(3) + certificate.group(4)
        exp_found = fill_row('MISC_COURSE_DETAILS', detail)

    # Completion of a miscellaneous course requirement.
    completion = re.search(r'(?i)(completion|attainment)(.*?)(in |of )(.*?)(;|\.)', line)
    if completion:
        if not re.search(edpattern, line):
            exp_found = fill_row('MISC_COURSE_DETAILS', completion.group(0))
        if re.search('high school, university, college, trade or technical school|community college or trade school', line):
            exp_found = fill_row('MISC_COURSE_DETAILS', completion.group(0))

    return row,exp_found

def experience_len(line, row):
    """Extract the length and kind of experience from one requirement line.

    Sets FULL_TIME_PART_TIME directly on *row* and fills EXPERIENCE_LENGTH and
    EXP_JOB_CLASS_FUNCTION through fill_row.  Months are converted to a
    fraction of a year; a year count overrides a month count, and an
    "<n> years of experience" phrase overrides both.

    Returns:
        (row, exp_found): the ``row`` argument, and the result of the last
        fill_row call made here (False when none was made).
    """
    exp_found = False

    kind = re.search(r'(full.time|part.time|years as a|\d{2,6} hours)(.*)(|;|\.)', line)
    if kind:
        if kind.group(1) != 'years as a':
            row[col_heads['FULL_TIME_PART_TIME']] = kind.group(1).upper()

        months = re.search(r'(\d{1,3})(.)(month)', line)
        if months:
            fract_yr = str(int(months.group(1)) / 12)
            exp_found = fill_row('EXPERIENCE_LENGTH', fract_yr)
            exp_found = fill_row('EXP_JOB_CLASS_FUNCTION', kind.group(2))

        years = re.search(r'(\d{1,3})(.)(year)(.{1,5})(full.time|part.time|as a)', line)
        if years:
            exp_found = fill_row('EXPERIENCE_LENGTH', str(years.group(1)))
            exp_found = fill_row('EXP_JOB_CLASS_FUNCTION', kind.group(2))

    years_of_exp = re.search(r'(\d{1,3})(.)(years of experience)(.*)(|;|\.|and|,)', line)
    if years_of_exp:
        exp_found = fill_row('EXPERIENCE_LENGTH', str(years_of_exp.group(1)))
        exp_found = fill_row('EXP_JOB_CLASS_FUNCTION', years_of_exp.group(3) + years_of_exp.group(4))

    return row,exp_found

def catch_all (line,row):
    """Store the requirement text, minus its leading index, as the job function.

    A leading "1." index is stripped first, then a leading "a." / "a)" index
    from what remains; the rest goes unmodified into EXP_JOB_CLASS_FUNCTION.

    Returns the ``row`` argument.
    """
    for index_pattern in (num_index_pattern, char_index_pattern):
        indexed = re.search(index_pattern, line)
        if indexed:
            line = indexed.group(4)
    fill_row('EXP_JOB_CLASS_FUNCTION', line)
    return row

def experience(line, row,exp,last_exp):
    """Record the job classes found in a requirement and what follows them.

    ``exp`` is stored in EXP_JOB_CLASS_TITLE.  The lower-cased text after
    ``last_exp`` (up to the last terminator on the line) then decides the rest:
      * it starts, within two characters, with "or"  -> it becomes
        EXP_JOB_CLASS_ALT_RESP and EXP_JOB_CLASS_FUNCTION is blanked;
      * the line speaks of "in a class", "at the level", "performing the
        duties" or "paid experience as" -> that phrase onwards, prefixed with
        "or ", becomes EXP_JOB_CLASS_ALT_RESP and the function is blanked;
      * otherwise EXP_JOB_CLASS_FUNCTION gets the text from "experience"
        onwards if present, else the text after the title.

    Returns the ``row`` argument.
    """
    row[col_heads['EXP_JOB_CLASS_TITLE']] = exp

    lowered = line.lower()
    title = last_exp.lower()
    terminator = r'(;|\.|:|,|\r|\n)'

    after_title = re.search(title + '(.*)' + terminator, lowered)
    if not after_title:
        return row
    alt = after_title.group(1)

    if re.match('(.{1,2})(or.)', alt):
        fill_row('EXP_JOB_CLASS_ALT_RESP', alt)
        clr_and_fill_row('EXP_JOB_CLASS_FUNCTION', '')
        return row

    in_class = re.search(r'(.*)(in a class|at the level|performing the duties|paid experience as)(.*)(;|\.|:|,|\r|\n)', lowered)
    if in_class:
        clr_and_fill_row('EXP_JOB_CLASS_FUNCTION', '')
        alt_class = "or " + in_class.group(2) + in_class.group(3) + in_class.group(4)
        fill_row('EXP_JOB_CLASS_ALT_RESP', alt_class)
        return row

    #see water t sup
    from_experience = re.search('(experience)(.*)' + title + '(.*)' + terminator, lowered)
    if from_experience:
        fill_row('EXP_JOB_CLASS_FUNCTION', from_experience.group(0))
    else:
        fill_row('EXP_JOB_CLASS_FUNCTION', alt)
    return row



In [None]:
def process_state (state,body,line,row,job,data_list):
    """Handle one bulletin line according to the section (`state`) it belongs to.

    'duties'       - append the line to `body` and store it in JOB_DUTIES.
    'annualsalary' - append the line (commas removed, ' to ' -> '-') to `body`
                     and let entry_salaries fill the salary columns.
    'requirements' - parse the line into the per-requirement columns, append a
                     copy of `row` to `data_list`, merge the row into the
                     global `eda_row`, then clear the per-requirement columns.

    Returns:
        (row, body, data_list) as updated.
    """
    global eda_row
    if state == 'duties':
        body += line
        row[col_heads['JOB_DUTIES']] = body.replace('\n','')
    if state == 'annualsalary':
        line = line.replace (',','').replace(' to ','-')
        body += line
        entry_salaries (body,row)
    if state == 'requirements':
        exp_found = False
        line = clean_txt (line)       
        if (line !=''):
            sub_pattern = 'substitut'
            if (re.search(sub_pattern, line)):
                    # Substitution clauses are stored verbatim, not parsed.
                    exp_found = fill_row('EXP_JOB_CLASS_FUNCTION', line)
            else:
                line = convert_words_to_number (line)
                row = requirement_and_sub(line, row)
                # NOTE(review): each call below rebinds exp_found, so only the
                # value from experience_len reaches the catch-all test further
                # down - confirm that education/certificate hits are meant to
                # be ignored there.
                row,exp_found =  education(line, row)
                row,exp_found = cert_and_completion(line, row)
                row,exp_found = experience_len(line, row)
                exp,last_exp = find_experience (line,job)
                if exp == '' and not exp_found:
                    # No job class and no experience length: keep the raw text.
                    catch_all (line,row)
                else:
                    if (exp):
                        row = experience(line, row,exp,last_exp)
                    #Specials where job = class mentioned
                    #print('zoo?',line)
                    if re.search('Zoo Registrar',line):
                        exp_found = fill_row('EXP_JOB_CLASS_ALT_RESP','performing the duties of a Zoo Registrar')
                        exp_found = clr_and_fill_row('EXP_JOB_CLASS_FUNCTION', '')
   
                        
            # One output record per non-empty requirement line.
            save_row = row.copy()
            data_list.append(save_row)

            # Merge this row into the per-bulletin summary: a value is added
            # (space separated) unless it is already contained in what is there.
            len_row = len(row)
            for i in range(len_row):
                if eda_row[i] == '':
                     eda_row[i] = row[i]
                elif  row[i] not in eda_row[i]:
                    eda_row[i] = eda_row[i] + ' ' + row[i]
            row = req_clr (row)
    return row, body,data_list

In [None]:

def backfill (data_list, start_job_index, row):
    """Copy the bulletin-wide columns of *row* onto the bulletin's records.

    EXAM_TYPE and the three licence columns are copied from *row* into every
    record of ``data_list`` from ``start_job_index`` to the end.  A copy of the
    global ``eda_row`` is also appended to the global ``eda_data_list``.

    Returns:
        len(data_list), i.e. the start index for the next bulletin.
    """
    eda_data_list.append(eda_row.copy())

    last_job_index = len(data_list)
    bulletin_wide = ('EXAM_TYPE', 'DRIVERS_LICENSE_REQ', 'DRIV_LIC_TYPE', 'ADDTL_LIC')
    for record in data_list[start_job_index:last_job_index]:
        for name in bulletin_wide:
            position = col_heads[name]
            record[position] = row[position]
    return last_job_index

In [None]:
 def line_interp (line, title_line, row, state, body,level,job,data_list):
    """Interpret one bulletin line and advance the parsing state.

    Picks up the job title (first usable line), the class code and the open
    date, tracks which section the line belongs to ('duties', 'requirements',
    'annualsalary' or ''), hands section content to process_state, and runs
    the licence and exam-type scanners on every line.

    Returns:
        (line, title_line, row, state, body, level, job, data_list) as updated.

    NOTE(review): the `def` line is indented by one space; outside IPython a
    module-level statement with leading indentation is rejected by Python.
    """
    #print ('newline',line)
    #print ('data_list',data_list)
    class_pattern ='(Code:.*)(\d{4})(.*)'
    sub_pattern = 'substitut'
    if line != '' and title_line:
        job= line.replace('\n','').lower().title()
        #in some cases the first line of the file is not the job title
        #better to check against filename tha this test
        if job.upper() != 'CAMPUS INTERVIEWS ONLY':
            row[col_heads['JOB_CLASS_TITLE']] = job
            title_line = False
    if (re.search(class_pattern, line)):
        # Only the first class code seen in a bulletin is kept.
        if row[col_heads['JOB_CLASS_NO']]  =='':
            #SENIOR ELECTRIC SERVICE REPRESENTATIVE has wrong code at btm of file
            row[col_heads['JOB_CLASS_NO']] = re.search(class_pattern, line).group(2)
    elif "Open Date:" in line:
        row[col_heads['OPEN_DATE']]  = line.split("Open Date:")[1].split("(")[0].strip().replace ('-','/')
    elif (line.isupper()and not "$" in line):
        # An all-capitals line without a dollar sign is treated as a section
        # heading: leave the current section and discard the collected body.
        state = ''
        body =''
    if state != '':
        row, body,data_list = process_state (state,body,line,row,job,data_list)
    elif re.search('DUTIES',line):
        state = 'duties'
    elif re.search('REQUIREMENT',line):
        state = 'requirements'
    elif re.search('(ANNUAL SALARY)|(ANNUALSALARY)',line):
        state = 'annualsalary'
    # These two write to the module-level `row`, whatever the current section.
    driver_licence(line)
    level = examination_type(line,level)
    return line, title_line, row, state, body,level, job,data_list

In [None]:
def chk_file_valid(filename):
    """Return False for bulletin files that must be left out, True otherwise.

    A rejected file name is reported on stdout between two blank lines.
    """
    #this function could be refactored so that further invalid bulletin are excluded automatically

    # Excluded because the bulletin text is wrong.
    wrong_body_text = (
        'ANIMAL CARE TECHNICIAN SUPERVISOR 4313 122118.txt',
        'WASTEWATER COLLECTION SUPERVISOR 4113 121616.txt',
        'SENIOR EXAMINER OF QUESTIONED DOCUMENTS 3231 072216 REVISED 072716.txt',
    )
    # Excluded because a newer bulletin exists.
    superseded = (
        'SENIOR UTILITY SERVICES SPECIALIST 3753 121815 (1).txt',
        'CHIEF CLERK POLICE 1219 061215.txt',
    )

    file_valid = filename not in wrong_body_text and filename not in superseded
    if not file_valid:
        print ()
        print ('invalid', filename)
        print()
    return file_valid
    

# Cell that produces the structured data file

In [None]:
# Parse every bulletin file into rows of the structured data file.
# data_list collects one row per requirement line; eda_data_list collects one
# merged row per valid bulletin (appended by backfill).
bulletin_dir = "../input/cityofla/CityofLA/Job Bulletins"
data_list = []
eda_data_list = []
body = ''
state = ''
job = ''
level = 0
cnt = 0
start_job_index = 0
global eda_row

# NOTE(review): state, body, level and job are not reset per file; they carry
# over from the previous bulletin until a heading line resets them - confirm.
for filename in os.listdir(bulletin_dir):
 #    if cnt >280 and cnt <299:
        # NOTE(review): 25 presumably equals len(column_heads) - confirm
        # against the template header.
        row = [''] * 25
        eda_row= [''] * 25
        with open(bulletin_dir + "/" + filename, 'r', errors='ignore') as f:
            row[col_heads['FILE_NAME']] = filename
            #print (filename)
            print ('. ',end="")
            title_line = True
            file_valid = chk_file_valid(filename)
            if (file_valid):
                for index,line in enumerate(f.readlines()):
                    line = line.rstrip().lstrip()
                    # Blank lines and bare "OR" separator lines are skipped.
                    if (line !='' and line != "OR"):
                        #                    pattern = '(.*?)(; or|; and)(.*)' 
                        pattern = '(.*?)(; or)(.*)' 
                        #sometimes a significant alternative is included within a requirement
                        # If more than 40 characters follow the first "; or",
                        # the line is handled as two separate requirement lines.
                        if re.search(pattern, line) and(len(re.search(pattern, line).group(3))) > 40:
                            _, title_line, row, state, body,level,job,data_list = \
                                    line_interp (re.search(pattern, line).group(1) + re.search(pattern, line).group(2), title_line, row, state, body,level,job,data_list)
                            _, title_line, row, state, body,level,job,data_list = \
                                    line_interp (re.search(pattern, line).group(3), title_line, row, state, body,level,job,data_list)
                        else:
                            line, title_line, row, state, body,level,job,data_list = \
                                line_interp (line, title_line, row, state, body,level,job,data_list)             
                # Copy the bulletin-wide columns onto every row of this bulletin.
                start_job_index = backfill(data_list,start_job_index,row)
        #    cnt += 1

# One DataFrame per list, both labelled with the template's column names.
df_job_class = pd.DataFrame(data_list)
df_job_class.columns = column_heads
df_eda_exam = pd.DataFrame(eda_data_list)
df_eda_exam.columns = column_heads


In [None]:
# Summary statistics of the per-bulletin EDA frame.
df_eda_exam.describe()

In [None]:
#types not right yet
# The two casts below are left disabled; only OPEN_DATE is converted for now.
#df_job_class["REQUIREMENT_SET_ID"] = df_job_class["REQUIREMENT_SET_ID"].astype('int16')
#df_job_class["EXPERIENCE_LENGTH"] = df_job_class["EXPERIENCE_LENGTH"].astype('float')

# Convert the OPEN_DATE column to datetime64[ns], then preview the first 20 rows.
df_job_class["OPEN_DATE"] = df_job_class["OPEN_DATE"].astype('datetime64[ns]')
df_job_class.head(20)

# Structured Data File<a id='sdf'></a>

In [None]:
# Show the whole structured file: widen text columns (this setting persists
# for later cells) and raise the row limit just for this display call.
pd.options.display.max_colwidth = 200
with pd.option_context("display.max_rows", 2000):
    display(df_job_class)

# Output for competition


In [None]:
# Write the structured data file to the current directory, then summarise it.
df_job_class.to_csv("competition_output.csv")
df_job_class.describe()

## Saving and reloading the structured data file locally

In [None]:
df_job_class.head()

In [None]:
# The file name is appended with plain string concatenation (here and in the
# reload cell), so the directory must end with a separator; without it the
# file was written as '../workingjob_class.csv'.
workingpath = '../working/'

df_job_class.to_csv(workingpath + 'job_class.csv')

In [None]:

# Reload the CSV written by the previous cell from the same workingpath.
# to_csv wrote the index as a column, so the reloaded frame carries one
# extra unnamed column compared with df_job_class.
df_saved_job_class = pd.read_csv(workingpath + 'job_class.csv')
df_saved_job_class.head()
df_saved_job_class.describe()

In [None]:
df_saved_job_class.head()

In [None]:
with pd.option_context("display.max_rows", 2000): display (df_job_class)

# EDA

The df_eda_exam dataframe has one row for each job class and is useful for analysis


In [None]:
# Show every row of the per-job-class frame: widen text columns (persists
# for later cells) and raise the row limit just for this display call.
pd.options.display.max_colwidth = 200
with pd.option_context("display.max_rows", 2000):
    display(df_eda_exam)

In [None]:
# For each EXAM_TYPE value, count the non-null entries in every other column.
df_exam_group = df_eda_exam.groupby('EXAM_TYPE').count()

df_exam_group.head(20)

In [None]:
# Sorted copy ordered by EXAM_TYPE; df_eda_exam itself is left unchanged.
df_eda_type = df_eda_exam.sort_values('EXAM_TYPE')
df_eda_type.describe()

In [None]:
# Sorted copy ordered by class code, displayed in full.
df_eda_job = df_eda_exam.sort_values('JOB_CLASS_NO')
with pd.option_context("display.max_rows", 2000):
    display(df_eda_job)


In [None]:
#Used for cleaning data
#find duplicates class titles
#no duplicates now

# df_eda_exam_len = len(df_eda_exam)
# print (df_eda_exam_len)
# index = 0
# while index < df_eda_exam_len:
#     index2 = 0
#     #print (index)
#     var = df_eda_exam.iloc[index]['JOB_CLASS_TITLE']
#     while index2 < df_eda_exam_len:
#         if df_eda_exam.iloc[index]['FILE_NAME'] != df_eda_exam.iloc[index2]['FILE_NAME']:
     
#             if var == df_eda_exam.iloc[index2]['JOB_CLASS_TITLE']:
#                 print (df_eda_exam.iloc[index]['FILE_NAME'], df_eda_exam.iloc[index2]['FILE_NAME'])
#         index2 += 1
#     index += 1



In [None]:
# #Used for cleaning data
# #find duplicates class codes

# df_eda_exam_len = len(df_eda_exam)
# print (df_eda_exam_len)
# index = 0
# while index < df_eda_exam_len:
#     index2 = 0
#     #print (index)
#     var = df_eda_exam.iloc[index]['JOB_CLASS_NO']
#     while index2 < df_eda_exam_len:
#         if df_eda_exam.iloc[index]['FILE_NAME'] != df_eda_exam.iloc[index2]['FILE_NAME']:
     
#             if var == df_eda_exam.iloc[index2]['JOB_CLASS_NO']:
#                 print (df_eda_exam.iloc[index]['FILE_NAME'], df_eda_exam.iloc[index2]['FILE_NAME'])
#         index2 += 1
#     index += 1



In [None]:
# #find number of open jobs where internal explicit candidates has been identified

# df_job_class_len = len(df_job_class)
# print (df_job_class_len)
# index = 0
# jobs_found = 0
# jobs_not_found = 0
# job_no_sub = 0
# job_chk = 0
# while index < df_job_class_len:
#     if df_job_class.iloc[index]['EXP_JOB_CLASS_TITLE'] != '':
#         job_chk +=1
#         print (df_job_class.iloc[index]['EXAM_TYPE'])
#         if re.search('ON AN INTERDEPARTMENTAL PROMOTIONAL AND AN OPEN COMPETITIVE BASIS',df_job_class.iloc[index]['EXAM_TYPE'] ) or re.search('ON AN OPEN COMPETITIVE BASIS',df_job_class.iloc[index]['EXAM_TYPE']) :
#             print (df_job_class.iloc[index]['FILE_NAME'], df_job_class.iloc[index]['EXAM_TYPE'])
#             jobs_found += 1
            
            
#             print ("exp", df_job_class.iloc[index]['FILE_NAME'],df_job_class.iloc[index]['EXP_JOB_CLASS_TITLE'])
#         elif re.search('ON AN DEPARTMENTAL PROMOTIONAL BASIS',df_job_class.iloc[index]['EXAM_TYPE']) \
#                 or re.search('ON AN INTERDEPARTMENTAL PROMOTIONAL BASIS',df_job_class.iloc[index]['EXAM_TYPE']):
#             jobs_not_found += 1
#     else:
#         job_no_sub += 1
#     index += 1
# print ('jobs_found',jobs_found)
# print ('jobs_not_found',jobs_not_found)
# print ('jos_no_sub',job_no_sub)

# print ('job_chk',job_chk)



In [None]:

df_eda_exam = df_eda_exam.sort_values('ENTRY_SALARY_GEN')

# Create the derived salary columns with a placeholder of 99; the salary
# parsing cell overwrites them row by row.
for salary_col in ('entry_salary', 'final_salary', 'pc_range'):
    df_eda_exam[salary_col] = 99

#with pd.option_context("display.max_rows", 2000): display (df_eda_exam)


In [None]:
# Raw strings so that "\d" reaches the regex engine as written instead of
# being an invalid string escape.
# Flat-rated: one 4-6 digit amount followed later by the words "flat-rated".
salary_flat_pattern = r'(\d{4,6})(.*)(flat-rated)'
# Range: the first two 4-6 digit amounts in the text.
salary_range_pattern = r'(\d{4,6})(.*?)(\d{4,6})'

for i, row in df_eda_exam.iterrows():
    # Use the general salary text; fall back to the DWP text when it is blank.
    salary_range = row['ENTRY_SALARY_GEN']
    if salary_range == '':
        salary_range = row['ENTRY_SALARY_DWP']

    # -1 marks "no salary recognised".
    entry_salary = -1
    final_salary = -1

    flat_match = re.search(salary_flat_pattern, salary_range)
    if flat_match:
        entry_salary = int(flat_match.group(1))
        final_salary = 0  # 0 marks a flat rate (no range)

    # Checked independently of the flat-rate test: a range match overrides it.
    range_match = re.search(salary_range_pattern, salary_range)
    if range_match:
        entry_salary = int(range_match.group(1))
        final_salary = int(range_match.group(3))

    # Width of the range as a percentage of the entry salary.
    if final_salary != 0:
        pc_range = 100 * (final_salary - entry_salary) / entry_salary
    else:
        pc_range = 0

    df_eda_exam.loc[i, 'entry_salary'] = entry_salary
    df_eda_exam.loc[i, 'final_salary'] = final_salary
    df_eda_exam.loc[i, 'pc_range'] = pc_range
                                  


In [None]:
# Re-order by the parsed entry salary and display every row.
df_eda_exam = df_eda_exam.sort_values('entry_salary')
with pd.option_context("display.max_rows", 2000):
    display(df_eda_exam)


In [None]:
# Re-order alphabetically by class title and display every row.
df_eda_exam = df_eda_exam.sort_values('JOB_CLASS_TITLE')
with pd.option_context("display.max_rows", 2000):
    display(df_eda_exam)


# Finding Explicit Links

In [None]:
# Build df_explicit: one row per distinct (job class, required job class)
# pair, taken from the comma-separated EXP_JOB_CLASS_TITLE field.
# Duplicates are detected case-insensitively on both sides, and the first
# occurrence (with its EXPERIENCE_LENGTH) is the one kept.
data_list = []
seen_links = set()  # (JOB upper-cased, REQUIREMENT upper-cased) already stored

for job, reqs, exp_len in zip(df_job_class['JOB_CLASS_TITLE'],
                              df_job_class['EXP_JOB_CLASS_TITLE'],
                              df_job_class['EXPERIENCE_LENGTH']):
    job_key = job.upper()
    for word in reqs.split(','):
        word = word.strip(' ')
        if word == '':
            continue
        link_key = (job_key, word.upper())
        if link_key in seen_links:
            continue
        seen_links.add(link_key)
        # JOB is stored upper-cased; REQUIREMENT keeps its original spelling.
        data_list.append([job_key, word, exp_len])

# columns= keeps this working even when no explicit links were found.
df_explicit = pd.DataFrame(data_list,
                           columns=["JOB", "REQUIREMENT", "EXPERIENCE_LENGTH"])
df_explicit.head()


In [None]:
#with pd.option_context("display.max_rows", 2000): display (df_explicit)##


## Job classes with the most first level subordinates

In [None]:
# Number of df_explicit rows per JOB (count() reports non-null cells per column).
df_explicit_g = df_explicit.groupby('JOB').count()

# Jobs with the most REQUIREMENT rows first.
df_explicit_s_g = df_explicit_g.sort_values('REQUIREMENT',ascending = False)
df_explicit_s_g.head()

## Looking for subordinates, ie who could apply

In [None]:

# Plot the first-level links of one job class: an edge from the job to every
# REQUIREMENT listed for it in df_explicit.
G = nx.Graph()

job = 'WATER UTILITY SUPERINTENDENT'
edges = {}

direct_links = df_explicit[df_explicit['JOB'] == job]
for requirement, exp_len in zip(direct_links['REQUIREMENT'],
                                direct_links['EXPERIENCE_LENGTH']):
    G.add_edge(job, requirement)
    # Label with the row's own experience length, as the recursive plots do,
    # rather than a fixed '2yr'.
    edges[job, requirement] = str(exp_len) + 'yr'

plt.figure(figsize=(15, 15))
plt.axis('off')
pos = nx.circular_layout(G)

nx.draw_networkx_edge_labels(G, pos, edge_labels=edges, font_color='red')

nx.draw_networkx(G, pos, with_labels=True, node_color='red', font_size=12,
                 node_size=20000, arrows=True, width=2)
plt.show()
#print (edges)


Now find the total career paths all the way back to an entry job...

In [None]:

def findsubsrecurse (job, edges, depth ):
    """Add the feeder classes of *job* to the global graph G, recursively.

    Every df_explicit row whose JOB equals *job* (or *job* followed by
    ' I', ' II' or ' III') contributes an edge REQUIREMENT -> job, labelled
    with the row's EXPERIENCE_LENGTH plus 'yr'. The search then continues
    from that REQUIREMENT.

    *depth* is incremented once per edge added and the updated value is
    carried back out of each recursive call, so it acts as a running count
    of edges over the whole traversal. No further edges are added once it
    reaches 10.

    Returns the (edges, depth) pair after the traversal.
    """
    G.add_node(job)
    graded_titles = (job, job + ' I', job + " II", job + " III")
    links = zip(df_explicit['JOB'],
                df_explicit['REQUIREMENT'],
                df_explicit['EXPERIENCE_LENGTH'])
    for holder, feeder, years in links:
        if holder in graded_titles and depth < 10:
            G.add_edge(feeder, job)
            edges[feeder, job] = str(years) + 'yr'
            depth += 1
            edges, depth = findsubsrecurse(feeder, edges, depth)
    return edges, depth

def findsubs (job):
    """Plot the feeder classes of *job* found by findsubsrecurse.

    The title is upper-cased before the search. Nodes and edges are added to
    the global graph G, which is then drawn on a circular layout with the
    experience lengths as red edge labels. Returns None.
    """
    edge_labels, _ = findsubsrecurse(job.upper(), {}, 0)

    plt.figure(figsize=(15, 15))
    plt.axis('off')
    layout = nx.circular_layout(G)
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_labels,
                                 font_color='red')
    nx.draw_networkx(G, layout, with_labels=True, node_color='red',
                     font_size=12, node_size=2000, arrows=True, width=2)
    plt.show()

## Network diagrams to show explicit links between job classes<a id='el'></a>

In [None]:
# findsubs() draws whatever the global G holds, so start from an empty
# directed graph to keep this plot limited to one job class.
G = nx.DiGraph()
findsubs ('SENIOR SYSTEMS ANALYST')

In [None]:
G = nx.DiGraph()
findsubs ('WATER UTILITY SUPERINTENDENT')

In [None]:
G = nx.DiGraph()
findsubs ('CHIEF INSPECTOR')

In [None]:
G = nx.DiGraph()
findsubs ('ELECTRICAL SERVICES MANAGER')


In [None]:
G = nx.DiGraph()
findsubs ('UTILITY SERVICES MANAGER')

### Most of the difficult to fill roles are open to everyone, not just LA City employees.<a id='difficult'></a>



The following 17 job classes can be challenging to fill with qualified candidates:

    Accountant
    Accounting Clerk
    Applications Programmer
    Assistant Street Lighting Electrician
    Building Mechanical Inspector
    Detention Officer
    Electrical Mechanic
    Equipment Mechanic
    Field Engineering Aide
    Housing Inspector
    Housing Investigator
    Librarian
    Security Officer
    Senior Administrative Clerk
    Senior Custodian
    Senior Equipment Mechanic
    Tree Surgeon

In the future, our Personnel Department expects to find it challenging to fill the following classes:

    IT-related classes (e.g., Applications Programmer)
    Wastewater classes
    Inspector classes
    Journey-level classes


In [None]:
# One findsubs call per hard-to-fill class from the list above; only the
# Senior Equipment Mechanic call is enabled. Lines marked "no subs" are
# presumably classes for which no feeder class was found - the trailing
# text notes the requirement or the feeder class instead.
G = nx.DiGraph()
# no subs findsubs ('Applications Programmer') education plus paid experience performing systems or programming tasks in a professional IT environment
# no subs findsubs ('Accountant') graduation required  not work experience required
# no subs findsubs ('Accounting Clerk') but paid clerical accounting work is required
# no subs findsubs ('Assistant Street Lighting Electrician') experience working in the construction, maintenance, and repair of street lighting circuitry
# findsubs ('Building Mechanical Inspector')  #ASSISTANT INSPECTOR
#findsubs ('Detention Officer') #PARK RANGER
# no subs findsubs (' Equipment Mechanic')
# no subs findsubs (' Field Engineering Aide')
# no subs findsubs ('Housing Inspector')#ASSISTANT INSPECTOR
# no subs findsubs ('Housing Investigator')
# no subs findsubs ('Librarian')
#findsubs ('Security Officer')  #PARK RANGER
# no subs findsubs ('Senior Administrative Clerk')
# no subs findsubs ('Senior Custodian')
findsubs ('Senior Equipment Mechanic') #HEAVY DUTY EQUIPMENT MECHANIC Auto Electrician
#findsubs ('Tree Surgeon') # Tree Surgeon assistant



## Finding promotion routes<a id='promotional'></a>

In [None]:

def findsuprecurse (experience, edges, visited=None):
    """Add the classes reachable from *experience* to the global graph G.

    Every df_explicit row whose REQUIREMENT equals *experience* (or
    *experience* followed by ' I', ' II' or ' III') contributes an edge
    experience -> JOB, labelled with the row's EXPERIENCE_LENGTH plus 'yr'.
    The search then continues from that JOB.

    Parameters:
        experience: job class title to start from.
        edges: dict of edge labels, updated in place.
        visited: titles already expanded during this traversal. Callers
            normally leave it as None; it is shared across the recursion so
            each class is expanded only once. This stops the recursion from
            running forever when the requirements form a cycle, for example
            a class listing its own graded variant.

    Returns the same *edges* dict.
    """
    if visited is None:
        visited = set()
    visited.add(experience)

    G.add_node(experience)
    accepted = (experience, experience + ' I',
                experience + " II", experience + " III")
    links = zip(df_explicit['JOB'],
                df_explicit['REQUIREMENT'],
                df_explicit['EXPERIENCE_LENGTH'])
    for target, requirement, years in links:
        if requirement not in accepted:
            continue
        G.add_edge(experience, target)
        edges[experience, target] = str(years) + 'yr'
        # The edge is always recorded; only the expansion is skipped.
        if target not in visited:
            findsuprecurse(target, edges, visited)
    return edges
def findsup (job):
    """Plot the classes reachable from *job* as found by findsuprecurse.

    *job* is passed on exactly as given (no case change). Nodes and edges
    are added to the global graph G, which is then drawn on a circular
    layout with the experience lengths as red edge labels. Returns None.
    """
    edge_labels = findsuprecurse(job, {})

    plt.figure(figsize=(15, 15))
    plt.axis('off')
    layout = nx.circular_layout(G)
    nx.draw_networkx_edge_labels(G, layout, edge_labels=edge_labels,
                                 font_color='red')
    nx.draw_networkx(G, layout, with_labels=True, node_color='red',
                     font_size=12, node_size=2000, arrows=True, width=2)
    plt.show()


In [None]:
# findsup() draws whatever the global G holds, so start from an empty
# directed graph to keep this plot limited to one starting class.
G = nx.DiGraph()
findsup ('SYSTEMS ANALYST')

In [None]:
G = nx.DiGraph()
findsup('SECRETARY')

In [None]:
G = nx.DiGraph()
findsup('PUBLIC RELATIONS SPECIALIST')

In [None]:
G = nx.DiGraph()
findsup('WELDER')

In [None]:
G = nx.DiGraph()
findsup('ELECTRICAL CRAFT HELPER')