# Splitting 20k jobs into sectors
Richard Kuzma, 25NOV2020

In [1]:
import pickle
import pandas
import snorkel
import matplotlib.pyplot as plt

#### Load data

In [2]:
DATA_PATH = '../data/cleaned/'
DF_NAME = 'monster_jobs_df_small.pkl'
with open(DATA_PATH + DF_NAME, 'rb') as f:
    df = pickle.load(f)

#### exploring the data

In [3]:
print(df.columns)
df.head()

Index(['id', 'job_title', 'job_description', 'job_type', 'location',
       'organization', 'sector'],
      dtype='object')


Unnamed: 0,id,job_title,job_description,job_type,location,organization,sector
0,0,IT Support Technician Job in Madison,TeamSoft is seeing an IT Support Specialist to...,Full Time Employee,"Madison, WI 53702",,IT/Software Development
1,1,Business Reporter/Editor Job in Madison,The Wisconsin State Journal is seeking a flexi...,Full Time,"Madison, WI 53708",Printing and Publishing,
2,2,Johnson & Johnson Family of Companies Job Appl...,Report this job About the Job DePuy Synthes Co...,"Full Time, Employee",DePuy Synthes Companies is a member of Johnson...,Personal and Household Services,
3,3,Engineer - Quality Job in Dixon,Why Join Altec? If you’re considering a career...,Full Time,"Dixon, CA",Altec Industries,Experienced (Non-Manager)
4,4,Shift Supervisor - Part-Time Job in Camphill,Position ID# 76162 # Positions 1 State CT C...,Full Time Employee,"Camphill, PA",Retail,Project/Program Management


In [4]:
print(df.shape)
# drop duplicates
df.drop_duplicates(subset = ['job_description'], inplace=True)
print(df.shape)

# remove the sectors that are actually full job descriptions
df = df[df['sector'].str.len() < 150]
print(df.shape)

(22000, 7)
(18744, 7)
(14074, 7)


In [5]:
# save the no-duplicates df
with open(DATA_PATH + 'monster_jobs_df_no_dups.pkl', 'wb') as f:
    pickle.dump(df, f)

### divide into sectors for later snorkel use

In [6]:
# see all sector names
sector_names = df['sector'].unique()
print(sector_names)

['IT/Software Development' 'Experienced (Non-Manager)'
 'Project/Program Management' 'Customer Support/Client Care' 'Entry Level'
 'Building Construction/Skilled Trades'
 'Civil & Structural EngineeringGeneral/Other: Engineering'
 'Installation/Maintenance/Repair' 'Business/Strategic Management'
 'Accounting/Finance/Insurance' 'General/Other: Engineering' 'Engineering'
 'Editorial/Writing' 'Medical/Health' 'Marketing/Product'
 'Manager (Manager/Supervisor of Staff)' 'Administrative/Clerical'
 'Student (Undergraduate/Graduate)' 'Biotech/R&D/Science'
 'Logistics/Transportation' 'General/Other: Customer Support/Client Care'
 'Sales/Retail/Business Development' 'Education/Training' 'Other'
 'General/Other: Installation/Maintenance/RepairVehicle Repair and Maintenance'
 'General/Other: IT/Software Development'
 'Brand/Product MarketingGeneral/Other: Marketing/ProductProduct Management'
 'General/Other: Editorial/Writing'
 'Executive (SVP, VP, Department Head, etc)'
 'Banking/Real Estate/Mor

In [7]:
# extract tech sectors
tech_sectors = []
for i in sector_names:
    for j in ['IT', 'Software', 'Computing', 'Data', 'Network']:
        if j in i:
            print(i)
            tech_sectors.append(i)

tech_df = df[df['sector'].isin(tech_sectors)]
print(tech_df.shape)

with open(DATA_PATH + 'tech_sector_jobs.pkl', 'wb') as f:
    pickle.dump(tech_df, f)

IT/Software Development
IT/Software Development
General/Other: IT/Software Development
General/Other: IT/Software Development
Computer/Network SecurityGeneral/Other: IT/Software Development
Computer/Network SecurityGeneral/Other: IT/Software Development
Computer/Network SecurityGeneral/Other: IT/Software Development
General/Other: IT/Software DevelopmentSoftware/Web Development
General/Other: IT/Software DevelopmentSoftware/Web Development
Network and Server AdministrationSoftware/System ArchitectureSystems Analysis - IT
Network and Server AdministrationSoftware/System ArchitectureSystems Analysis - IT
Network and Server AdministrationSoftware/System ArchitectureSystems Analysis - IT
Network and Server Administration
Administrative SupportData Entry/Order ProcessingGeneral/Other: Administrative/Clerical
Systems Analysis - ITWeb/UI/UX Design
IT Project ManagementGeneral/Other: Project/Program ManagementProject Management
Software/Web Development
Systems Analysis - IT
Computer/Network Se

In [8]:
# extract safety_compliance sectors
safety_compliance_sectors = []
for i in sector_names:
    for j in ['quality', 'control', 'safety', 'compliance', 'process', 'assurance', 'process', 'production']:
        if j in i.lower():
            print(i)
            safety_compliance_sectors.append(i)

safety_compliance_df = df[df['sector'].isin(safety_compliance_sectors)]
print(safety_compliance_df.shape)

with open(DATA_PATH + 'safety_compliance_sector.pkl', 'wb') as f:
    pickle.dump(safety_compliance_df, f)


Manufacturing/Production/Operations
Quality Assurance/Safety
Quality Assurance/Safety
Quality Assurance/Safety
General/Other: Quality Assurance/Safety
General/Other: Quality Assurance/Safety
General/Other: Quality Assurance/Safety
Administrative SupportData Entry/Order ProcessingGeneral/Other: Administrative/Clerical
Administrative SupportData Entry/Order ProcessingGeneral/Other: Administrative/Clerical
ISO CertificationProduction Quality Assurance
ISO CertificationProduction Quality Assurance
ISO CertificationProduction Quality Assurance
Production Quality Assurance
Production Quality Assurance
Production Quality Assurance
General/Other: Production/Operations
General/Other: Production/OperationsSewing and Tailoring
Production/Operations Planning
Food Safety and Inspection
Food Safety and InspectionGeneral/Other: Quality Assurance/SafetyProduction Quality Assurance
Food Safety and InspectionGeneral/Other: Quality Assurance/SafetyProduction Quality Assurance
Food Safety and InspectionGe

In [9]:
# extract safety_compliance sectors
# mechanical and electrical
mechanical_electrical_sectors = []
for i in sector_names:
    for j in ['mechanic', 'electric', 'auto' 'inspect' 'material', 'maint', 'engineering', 'install', 'equip']:
        if j in i.lower():
            print(i)
            mechanical_electrical_sectors.append(i)

mechanical_electrical_df = df[df['sector'].isin(mechanical_electrical_sectors)]
print(mechanical_electrical_df.shape)

with open(DATA_PATH + 'mechanical_electrical_sector.pkl', 'wb') as f:
    pickle.dump(mechanical_electrical_df, f)


Civil & Structural EngineeringGeneral/Other: Engineering
Installation/Maintenance/Repair
Installation/Maintenance/Repair
General/Other: Engineering
Engineering
General/Other: Installation/Maintenance/RepairVehicle Repair and Maintenance
General/Other: Installation/Maintenance/RepairVehicle Repair and Maintenance
General/Other: Installation/Maintenance/Repair
General/Other: Installation/Maintenance/Repair
Electrical/Electronics EngineeringIndustrial/Manufacturing EngineeringMechanical Engineering
Electrical/Electronics EngineeringIndustrial/Manufacturing EngineeringMechanical Engineering
Electrical/Electronics EngineeringIndustrial/Manufacturing EngineeringMechanical Engineering
Computer/Electronics/Telecomm Install/Maintain/Repair
Computer/Electronics/Telecomm Install/Maintain/Repair
Oil Rig & Pipeline Install/Maintain/Repair
Oil Rig & Pipeline Install/Maintain/Repair
Computer/Electronics/Telecomm Install/Maintain/RepairGeneral/Other: Installation/Maintenance/Repair
Computer/Electronic