### Objective
This notebook is an initiative to make a structured csv file from the unstructured data (text files) using regex.<br><br>
<u><b>The task here is:</b><br></u>
    - To find relevant information from different text files using text processing methods (like regex).
    - And then arranging them in an efficient way into csv file.
    
<b>Following information has been included in the structured data:</b><br>
1) FILE_NAMES<br>
2) JOB_TITLE <br>
3) JOB_CLASS_NO <br>
4) JOB_DUTIES<br>
5) OPEN_DATES<br>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
import os
import re

In [2]:
OTHER_DATA_PATH = '../input/cityofla/CityofLA/Additional data'
sample_csv = pd.read_csv(OTHER_DATA_PATH+'/'+'sample job class export template.csv')
sample_csv.head()

Unnamed: 0,FILE_NAME,JOB_CLASS_TITLE,JOB_CLASS_NO,REQUIREMENT_SET_ID,REQUIREMENT_SUBSET_ID,JOB_DUTIES,EDUCATION_YEARS,SCHOOL_TYPE,EDUCATION_MAJOR,EXPERIENCE_LENGTH,FULL_TIME_PART_TIME,EXP_JOB_CLASS_TITLE,EXP_JOB_CLASS_ALT_RESP,EXP_JOB_CLASS_FUNCTION,COURSE_COUNT,COURSE_LENGTH,COURSE_SUBJECT,MISC_COURSE_DETAILS,DRIVERS_LICENSE_REQ,DRIV_LIC_TYPE,ADDTL_LIC,EXAM_TYPE,ENTRY_SALARY_GEN,ENTRY_SALARY_DWP,OPEN_DATE
0,SYSTEMS ANALYST 1596 102717.txt,Systems Analyst,1596,1,A,"A Systems Analyst analyzes procedures, methods...",4.0,COLLEGE OR UNIVERSITY,COMPUTER SCIENCE|INFORMATION SYSTEMS|GEOGRAPHI...,,,,,,,,,,,,,OPEN_INT_PROM,68611-100307,70908-88092,10/27/17
1,SYSTEMS ANALYST 1596 102717.txt,Systems Analyst,1596,2,A,"A Systems Analyst analyzes procedures, methods...",4.0,COLLEGE OR UNIVERSITY,,2.0,FULL-TIME,MANAGEMENT ASSISTANT,,"the development, analysis, implementation or m...",,,,,,,,OPEN_INT_PROM,68611-100307,70908-88092,10/27/17
2,SYSTEMS ANALYST 1596 102717.txt,Systems Analyst,1596,2,B,"A Systems Analyst analyzes procedures, methods...",4.0,COLLEGE OR UNIVERSITY,,2.0,FULL-TIME,MANAGEMENT ASSISTANT,,"performing cost benefit, feasibility and requi...",,,,,,,,OPEN_INT_PROM,68611-100307,70908-88092,10/27/17
3,SYSTEMS ANALYST 1596 102717.txt,Systems Analyst,1596,2,C,"A Systems Analyst analyzes procedures, methods...",4.0,COLLEGE OR UNIVERSITY,,2.0,FULL-TIME,MANAGEMENT ASSISTANT,,performing system implementation and support a...,,,,,,,,OPEN_INT_PROM,68611-100307,70908-88092,10/27/17
4,SYSTEMS ANALYST 1596 102717.txt,Systems Analyst,1596,3,A,"A Systems Analyst analyzes procedures, methods...",,,,2.0,FULL-TIME,SYSTEMS AIDE,,,4.0,"3M, 4Q",INFORMATION SYSTEMS|SYSTEMS ANALYSIS|closely r...,At least three of the courses must be from the...,M,,,OPEN_INT_PROM,68611-100307,70908-88092,10/27/17


This is the sample file containing the a sample of the structured data.

### Generating File

In [3]:
# Helpful Functions

def append_data(list_name, data_to_append):
    if data_to_append:
        list_name.append(data_to_append)
    else:
        list_name.append('')
    return None

def get_job_class_no(file_lines):
    job_class_no = re.search('Class Code (\d+)', file_lines)
    if job_class_no:
        job_class_nos.append(job_class_no.group(1))
    else:
        job_class_nos.append('')
    return None

def get_job_duty(file_lines):
    job_duty = re.search('DUTIES(\W+)(.*\n)', file_lines)
    if job_duty:
        try:
            job_duties.append(str(job_duty.group(0).split('\n')[2]))
        except Exception as e:
            job_duty = re.search('Duties include:[^.]*', file_lines)
            job_duties.append(job_duty.group(0).split(': ')[1])
    else:
        job_duties.append('')
    return None

def get_open_dates(file_lines):
    open_date = re.search('Open Date:[^.](.*\n)', file_lines)
    if open_date:
        open_dates.append(open_date.group(0).split(':')[1].lstrip().rstrip('\n'))
    else:
        open_dates.append('')
    return None


In [4]:
DATA_PATH = '../input/cityofla/CityofLA/Job Bulletins'

structured_data = pd.DataFrame()
col_names = ['FILE_NAMES', 'JOB_TITLE', 'JOB_CLASS_NO', 'JOB_DUTIES']

job_titles = []
file_names = []
job_class_nos = []
job_duties = []
open_dates = []

for index, file_name in enumerate(os.listdir(DATA_PATH)):
    with open(DATA_PATH+'/'+file_name, encoding = "ISO-8859-1") as f:
        file_lines = f.read()
        file_names.append(file_name)
        job_title = file_lines.split('\n')[0]
        append_data(job_titles, job_title)
        get_job_class_no(file_lines)       
        get_job_duty(file_lines)        
        get_open_dates(file_lines) 
#         get_annual_salary(file_lines)

In [5]:
def make_data():
    structured_data = pd.DataFrame()
    structured_data['FILE_NAMES'] = file_names
    structured_data['JOB_TITLE'] = job_titles
    structured_data['JOB_CLASS_NO'] = job_class_nos
    structured_data['JOB_DUTIES'] = job_duties
    structured_data['OPEN_DATES'] = open_dates
    return structured_data

In [6]:
structured_data = make_data()
structured_data.head()

Unnamed: 0,FILE_NAMES,JOB_TITLE,JOB_CLASS_NO,JOB_DUTIES,OPEN_DATES
0,PARK MAINTENANCE SUPERVISOR 3145 102618.txt,PARK MAINTENANCE SUPERVISOR,3145,A Park Maintenance Supervisor directly supervi...,10-26-18
1,ENVIRONMENTAL ENGINEER 7872 082616 REV 090116...,ENVIRONMENTAL ENGINEER,7872,An Environmental Engineer performs responsible...,08-26-16
2,MOTION PICTURE AND TELEVISION MANAGER 1789 111...,MOTION PICTURE AND TELEVISION MANAGER,1789,"A Motion Picture and Television Manager plans,...",11-17-17
3,HOUSING INVESTIGATOR 8516 062918.txt,HOUSING INVESTIGATOR,8516,A Housing Investigator conducts preliminary an...,06-29-18
4,DEPARTMENTAL CHIEF ACCOUNTANT 1593 111717 revi...,DEPARTMENTAL CHIEF ACCOUNTANT,1593,A Departmental Chief Accountant plans and dire...,11-17-17



### This is a work in progress and more will come here!!