# Data Cleaning Portion #


In [1]:
#import libraries
import numpy as np
import pandas as pd

# Location and Salary Needs Cleaning #

In [2]:
jobdata = pd.read_csv("fake_job_postings.csv")
jobdata.head()
locationdata = jobdata["location"].copy()
salaryRange = jobdata["salary_range"].copy()

In [3]:
print(jobdata.dtypes)


job_id                  int64
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object


Total Number of Nulls:

In [4]:
jobdata.isnull().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

Replace NaN with "Unspecified" string

In [5]:
jobdata.fillna("Unspecified", inplace=True)

In [6]:
jobdata.isnull().sum()

job_id                 0
title                  0
location               0
department             0
salary_range           0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
dtype: int64

Drop job_id, so we can find dupes

In [7]:
jobdata.drop(columns = ['job_id'])

Unnamed: 0,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,Marketing Intern,"US, NY, New York",Marketing,Unspecified,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Unspecified,0,1,0,Other,Internship,Unspecified,Unspecified,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Unspecified,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Unspecified,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Unspecified,Unspecified,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Unspecified,0,1,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,0
3,Account Executive - Washington DC,"US, DC, Washington",Sales,Unspecified,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,Bill Review Manager,"US, FL, Fort Worth",Unspecified,Unspecified,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,"CA, ON, Toronto",Sales,Unspecified,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,Unspecified,Computer Software,Sales,0
17876,Payroll Accountant,"US, PA, Philadelphia",Accounting,Unspecified,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",Unspecified,Unspecified,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,Unspecified,0,0,0,Full-time,Unspecified,Unspecified,Unspecified,Unspecified,0
17878,Graphic Designer,"NG, LA, Lagos",Unspecified,Unspecified,Unspecified,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [8]:
jobdata.duplicated().sum()

0

# Location Filtering and Seperating #

In [9]:
locationdata.head()

0      US, NY, New York
1        NZ, , Auckland
2         US, IA, Wever
3    US, DC, Washington
4    US, FL, Fort Worth
Name: location, dtype: object

In [10]:
locationList = list(locationdata.str.split(', ').values)
locationList

[['US', 'NY', 'New York'],
 ['NZ', '', 'Auckland'],
 ['US', 'IA', 'Wever'],
 ['US', 'DC', 'Washington'],
 ['US', 'FL', 'Fort Worth'],
 ['US', 'MD', ''],
 ['DE', 'BE', 'Berlin'],
 ['US', 'CA', 'San Francisco'],
 ['US', 'FL', 'Pensacola'],
 ['US', 'AZ', 'Phoenix'],
 ['US', 'NJ', 'Jersey City'],
 ['GB', 'LND', 'London'],
 ['US', 'CT', 'Stamford'],
 ['US', 'FL', 'Orlando'],
 ['AU', 'NSW', 'Sydney'],
 ['SG', '01', 'Singapore'],
 ['IL', '', 'Tel Aviv', 'Israel'],
 ['GB', 'SOS', 'Southend-on-Sea'],
 ['US', 'NY', 'New York'],
 ['US', 'PA', 'USA Northeast'],
 ['US', 'TX', 'Austin'],
 ['NZ', 'N', 'Auckland'],
 ['AE', '', ''],
 ['US', 'CA', 'Carlsbad'],
 ['GB', 'LND', 'London'],
 ['US', 'NY', 'New York '],
 ['SG', '', ''],
 ['AE', 'AZ', 'Abudhabi'],
 ['US', 'MO', 'St. Louis'],
 ['CA', 'ON', 'Toronto'],
 ['US', 'MA', 'Waltham'],
 ['US', 'KS', ''],
 ['US', 'WA', 'Everett'],
 ['US', 'CA', 'San Ramon'],
 ['GB', 'LND', ''],
 ['US', 'NY', 'Saint Bonaventure'],
 ['US', 'NY', 'Yonkers'],
 ['US', 'TX', 'H

In [11]:
for index, loc in enumerate(locationList):
    if loc is np.nan:
        locationList[index] = ['Unspecified'] * 3
    else:
        for il_ind, il in enumerate(loc):
            if il == '':
                loc[il_ind] = 'Unspecified'

In [12]:
locationList

[['US', 'NY', 'New York'],
 ['NZ', 'Unspecified', 'Auckland'],
 ['US', 'IA', 'Wever'],
 ['US', 'DC', 'Washington'],
 ['US', 'FL', 'Fort Worth'],
 ['US', 'MD', 'Unspecified'],
 ['DE', 'BE', 'Berlin'],
 ['US', 'CA', 'San Francisco'],
 ['US', 'FL', 'Pensacola'],
 ['US', 'AZ', 'Phoenix'],
 ['US', 'NJ', 'Jersey City'],
 ['GB', 'LND', 'London'],
 ['US', 'CT', 'Stamford'],
 ['US', 'FL', 'Orlando'],
 ['AU', 'NSW', 'Sydney'],
 ['SG', '01', 'Singapore'],
 ['IL', 'Unspecified', 'Tel Aviv', 'Israel'],
 ['GB', 'SOS', 'Southend-on-Sea'],
 ['US', 'NY', 'New York'],
 ['US', 'PA', 'USA Northeast'],
 ['US', 'TX', 'Austin'],
 ['NZ', 'N', 'Auckland'],
 ['AE', 'Unspecified', 'Unspecified'],
 ['US', 'CA', 'Carlsbad'],
 ['GB', 'LND', 'London'],
 ['US', 'NY', 'New York '],
 ['SG', 'Unspecified', 'Unspecified'],
 ['AE', 'AZ', 'Abudhabi'],
 ['US', 'MO', 'St. Louis'],
 ['CA', 'ON', 'Toronto'],
 ['US', 'MA', 'Waltham'],
 ['US', 'KS', 'Unspecified'],
 ['US', 'WA', 'Everett'],
 ['US', 'CA', 'San Ramon'],
 ['GB', 'L

In [13]:
for index, loc in enumerate(locationList):
    if len(loc) > 3:
        print(loc)

['IL', 'Unspecified', 'Tel Aviv', 'Israel']
['US', 'CA', 'Menlo Park', 'CA']
['US', 'Unspecified', 'Stocton', 'CA']
['US', 'CA', 'Bakersfield', 'CA / Mt. Poso']
['SA', '01', 'Riyadh', 'Olaya']
['EG', 'C', 'Cairo', 'Nasr City']
['GR', 'I', 'Neo Iraklio', 'Athens']
['GB', 'UKM', 'Stockholm', 'Sweden']
['GB', 'Unspecified', 'Angel', 'London']
['US', 'CO', 'Boulder', 'CO']
['US', 'TX', 'Austin', 'San Antonio', 'Houston']
['EG', 'C', 'Cairo', 'Nasr City']
['US', 'NJ', 'Hillside', 'NJ']
['GR', 'I', 'Vrilissia ', 'Greece']
['GR', 'E', 'Larisa', 'Volos']
['GR', 'I', 'Chalandri', 'Athens']
['US', 'NJ', 'Whitehouse station', 'Unspecified']
['US', 'DC', 'Washington', 'DC']
['US', 'CA', 'Salinas', 'CA']
['US', 'CA', 'Fresno', 'CA']
['SA', '01', 'Riyadh', 'Olaya']
['US', 'FL', 'Okeechobee', 'FL']
['GR', 'I', 'Athens', 'Rouf']
['CA', 'Unspecified', 'Rio Rancho', 'NM']
['GR', 'I', 'Kifisia', 'Athens']
['US', 'MI', 'Hartford', 'Paw Paw', 'Lawton']
['US', 'CA', 'San Jose', 'SFO', 'Austin', 'OH']
['US',

In [14]:
for index, loc in enumerate(locationList):
    if len(loc) < 3:
        print(loc)

['US']
['US']
['US']
['US']
['US']
['US']
['US']
['AU']
['US']
['GR']
['US']
['CA']
['HK']
['DE']
['GB']
['US']
['US']
['US']
['IN']
['US']
['US']
['US']
['US']
['US']
['US']
['US']
['US']
['US']
['US']
['GB']
['US']
['US']
['US']
['AU']
['US']
['US']
['GR']
['US']
['GB']
['GB']
['US']
['US']
['US']
['US']
['BE']
['CN']
['TH']
['US']
['US']
['QA']
['US']
['US']
['US']
['US']
['US']
['US']
['US']
['US']
['GR']
['NZ']
['GB']
['QA']
['GB']
['US']
['GB']
['US']
['US']
['US']
['GB']
['US']
['US']
['US']
['AU']
['IN']
['IL']
['US']
['US']
['DE']
['US']
['US']
['GB']
['US']
['GB']
['US']
['US']
['CA']
['US']
['US']
['US']
['US']
['US']
['US']
['BH']
['US']


In [15]:
locationList = list(map(lambda loc: list(loc), locationList))

In [16]:
locationList

[['US', 'NY', 'New York'],
 ['NZ', 'Unspecified', 'Auckland'],
 ['US', 'IA', 'Wever'],
 ['US', 'DC', 'Washington'],
 ['US', 'FL', 'Fort Worth'],
 ['US', 'MD', 'Unspecified'],
 ['DE', 'BE', 'Berlin'],
 ['US', 'CA', 'San Francisco'],
 ['US', 'FL', 'Pensacola'],
 ['US', 'AZ', 'Phoenix'],
 ['US', 'NJ', 'Jersey City'],
 ['GB', 'LND', 'London'],
 ['US', 'CT', 'Stamford'],
 ['US', 'FL', 'Orlando'],
 ['AU', 'NSW', 'Sydney'],
 ['SG', '01', 'Singapore'],
 ['IL', 'Unspecified', 'Tel Aviv', 'Israel'],
 ['GB', 'SOS', 'Southend-on-Sea'],
 ['US', 'NY', 'New York'],
 ['US', 'PA', 'USA Northeast'],
 ['US', 'TX', 'Austin'],
 ['NZ', 'N', 'Auckland'],
 ['AE', 'Unspecified', 'Unspecified'],
 ['US', 'CA', 'Carlsbad'],
 ['GB', 'LND', 'London'],
 ['US', 'NY', 'New York '],
 ['SG', 'Unspecified', 'Unspecified'],
 ['AE', 'AZ', 'Abudhabi'],
 ['US', 'MO', 'St. Louis'],
 ['CA', 'ON', 'Toronto'],
 ['US', 'MA', 'Waltham'],
 ['US', 'KS', 'Unspecified'],
 ['US', 'WA', 'Everett'],
 ['US', 'CA', 'San Ramon'],
 ['GB', 'L

In [17]:
for index, loc in enumerate (locationList):
    if len(loc) > 3:
        locationList[index] = loc[:2] + [', '.join(loc[2:])]
    if len(loc) < 3:
        locationList[index] += ['Unspecificed'] * 2

In [18]:
locationList

[['US', 'NY', 'New York'],
 ['NZ', 'Unspecified', 'Auckland'],
 ['US', 'IA', 'Wever'],
 ['US', 'DC', 'Washington'],
 ['US', 'FL', 'Fort Worth'],
 ['US', 'MD', 'Unspecified'],
 ['DE', 'BE', 'Berlin'],
 ['US', 'CA', 'San Francisco'],
 ['US', 'FL', 'Pensacola'],
 ['US', 'AZ', 'Phoenix'],
 ['US', 'NJ', 'Jersey City'],
 ['GB', 'LND', 'London'],
 ['US', 'CT', 'Stamford'],
 ['US', 'FL', 'Orlando'],
 ['AU', 'NSW', 'Sydney'],
 ['SG', '01', 'Singapore'],
 ['IL', 'Unspecified', 'Tel Aviv, Israel'],
 ['GB', 'SOS', 'Southend-on-Sea'],
 ['US', 'NY', 'New York'],
 ['US', 'PA', 'USA Northeast'],
 ['US', 'TX', 'Austin'],
 ['NZ', 'N', 'Auckland'],
 ['AE', 'Unspecified', 'Unspecified'],
 ['US', 'CA', 'Carlsbad'],
 ['GB', 'LND', 'London'],
 ['US', 'NY', 'New York '],
 ['SG', 'Unspecified', 'Unspecified'],
 ['AE', 'AZ', 'Abudhabi'],
 ['US', 'MO', 'St. Louis'],
 ['CA', 'ON', 'Toronto'],
 ['US', 'MA', 'Waltham'],
 ['US', 'KS', 'Unspecified'],
 ['US', 'WA', 'Everett'],
 ['US', 'CA', 'San Ramon'],
 ['GB', 'LND

In [19]:
locationDF = pd.DataFrame(locationList, columns=['country', 'state', 'city'])
locationDF

Unnamed: 0,country,state,city
0,US,NY,New York
1,NZ,Unspecified,Auckland
2,US,IA,Wever
3,US,DC,Washington
4,US,FL,Fort Worth
...,...,...,...
17875,CA,ON,Toronto
17876,US,PA,Philadelphia
17877,US,TX,Houston
17878,NG,LA,Lagos


In [20]:
jobdata = pd.concat([jobdata, locationDF], axis = 1)
jobdata.head(15)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state,city
0,1,Marketing Intern,"US, NY, New York",Marketing,Unspecified,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Unspecified,0,...,0,Other,Internship,Unspecified,Unspecified,Marketing,0,US,NY,New York
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,Unspecified,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,...,0,Full-time,Not Applicable,Unspecified,Marketing and Advertising,Customer Service,0,NZ,Unspecified,Auckland
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Unspecified,Unspecified,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Unspecified,0,...,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,0,US,IA,Wever
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,Unspecified,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,...,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,DC,Washington
4,5,Bill Review Manager,"US, FL, Fort Worth",Unspecified,Unspecified,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL,Fort Worth
5,6,Accounting Clerk,"US, MD,",Unspecified,Unspecified,Unspecified,Job OverviewApex is an environmental consultin...,Unspecified,Unspecified,0,...,0,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,0,US,MD,Unspecified
6,7,Head of Content (m/f),"DE, BE, Berlin",ANDROIDPIT,20000-28000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,Your Benefits: Being part of a fast-growing co...,0,...,1,Full-time,Mid-Senior level,Master's Degree,Online Media,Management,0,DE,BE,Berlin
7,8,Lead Guest Service Specialist,"US, CA, San Francisco",Unspecified,Unspecified,Airenvy’s mission is to provide lucrative yet ...,Who is Airenvy?Hey there! We are seasoned entr...,"Experience with CRM software, live chat, and p...",Competitive Pay. You'll be able to eat steak e...,0,...,1,Unspecified,Unspecified,Unspecified,Unspecified,Unspecified,0,US,CA,San Francisco
8,9,HP BSM SME,"US, FL, Pensacola",Unspecified,Unspecified,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,Unspecified,0,...,1,Full-time,Associate,Unspecified,Information Technology and Services,Unspecified,0,US,FL,Pensacola
9,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",Unspecified,Unspecified,"Novitex Enterprise Solutions, formerly Pitney ...",The Customer Service Associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,Unspecified,0,...,0,Part-time,Entry level,High School or equivalent,Financial Services,Customer Service,0,US,AZ,Phoenix


In [21]:
jobdata = jobdata.drop(columns = 'location')

# Salary Range Split #

In [22]:
salaryRange.head(15)

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
5               NaN
6       20000-28000
7               NaN
8               NaN
9               NaN
10    100000-120000
11              NaN
12              NaN
13              NaN
14              NaN
Name: salary_range, dtype: object

In [23]:
salaryRange.fillna('0-0', inplace = True)

In [24]:
sep_salary = list(salaryRange.str.split('-').values)
sep_salary

[['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['20000', '28000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['100000', '120000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['120000', '150000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['100000', '120000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['50000', '65000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['40000', '50000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['60', '80'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '

In [25]:
for index, sal_range in enumerate(sep_salary):
    if len(sal_range) < 2 or len(sal_range) > 2:
        print(index, sal_range)

5538 ['40000']


In [26]:
sep_salary[5538] = ['40000', '40000']

In [27]:
sep_salary

[['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['20000', '28000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['100000', '120000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['120000', '150000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['100000', '120000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['50000', '65000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['40000', '50000'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['60', '80'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '0'],
 ['0', '

In [28]:
errorSalary_index = []
for index, sal_range in enumerate (sep_salary):
    minSalary, maxSalary = sal_range
    if not minSalary.isdigit() or not maxSalary.isdigit():
        print(index, (minSalary, maxSalary))
        errorSalary_index += [index]

159 ('9', 'Dec')
1884 ('3', 'Apr')
1981 ('4', 'Apr')
2313 ('Oct', '15')
4299 ('8', 'Sep')
9124 ('4', 'Jun')
9902 ('10', 'Oct')
9911 ('Oct', '20')
10316 ('Jun', '18')
10785 ('10', 'Oct')
10788 ('11', 'Nov')
10860 ('10', 'Nov')
10883 ('10', 'Oct')
10889 ('10', 'Nov')
10896 ('10', 'Oct')
10905 ('10', 'Nov')
11361 ('11', 'Dec')
11495 ('2', 'Apr')
11606 ('10', 'Nov')
12421 ('10', 'Oct')
13449 ('11', 'Nov')
13482 ('2', 'Jun')
14196 ('Oct', '20')
15483 ('10', 'Oct')
17233 ('10', 'Nov')
17656 ('Dec', '25')


In [29]:
for index in errorSalary_index:
    sep_salary[index] = ['0','0']

In [30]:
salaryRangeDF = pd.DataFrame(np.array(sep_salary, dtype='int64'), columns = ['minSalary','maxSalary'])
salaryRangeDF['haveSalary'] = ((salaryRangeDF.minSalary != 0) | (salaryRangeDF.maxSalary != 0)).astype('int64')
salaryRangeDF.head(20)

Unnamed: 0,minSalary,maxSalary,haveSalary
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
5,0,0,0
6,20000,28000,1
7,0,0,0
8,0,0,0
9,0,0,0


In [31]:
jobdata = pd.concat([jobdata, salaryRangeDF], axis = 1)
jobdata.head(10)

Unnamed: 0,job_id,title,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,...,required_education,industry,function,fraudulent,country,state,city,minSalary,maxSalary,haveSalary
0,1,Marketing Intern,Marketing,Unspecified,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Unspecified,0,1,...,Unspecified,Unspecified,Marketing,0,US,NY,New York,0,0,0
1,2,Customer Service - Cloud Video Production,Success,Unspecified,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,...,Unspecified,Marketing and Advertising,Customer Service,0,NZ,Unspecified,Auckland,0,0,0
2,3,Commissioning Machinery Assistant (CMA),Unspecified,Unspecified,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Unspecified,0,1,...,Unspecified,Unspecified,Unspecified,0,US,IA,Wever,0,0,0
3,4,Account Executive - Washington DC,Sales,Unspecified,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,...,Bachelor's Degree,Computer Software,Sales,0,US,DC,Washington,0,0,0
4,5,Bill Review Manager,Unspecified,Unspecified,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,...,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL,Fort Worth,0,0,0
5,6,Accounting Clerk,Unspecified,Unspecified,Unspecified,Job OverviewApex is an environmental consultin...,Unspecified,Unspecified,0,0,...,Unspecified,Unspecified,Unspecified,0,US,MD,Unspecified,0,0,0
6,7,Head of Content (m/f),ANDROIDPIT,20000-28000,"Founded in 2009, the Fonpit AG rose with its i...",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,Your Benefits: Being part of a fast-growing co...,0,1,...,Master's Degree,Online Media,Management,0,DE,BE,Berlin,20000,28000,1
7,8,Lead Guest Service Specialist,Unspecified,Unspecified,Airenvy’s mission is to provide lucrative yet ...,Who is Airenvy?Hey there! We are seasoned entr...,"Experience with CRM software, live chat, and p...",Competitive Pay. You'll be able to eat steak e...,0,1,...,Unspecified,Unspecified,Unspecified,0,US,CA,San Francisco,0,0,0
8,9,HP BSM SME,Unspecified,Unspecified,Solutions3 is a woman-owned small business who...,Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,Unspecified,0,1,...,Unspecified,Information Technology and Services,Unspecified,0,US,FL,Pensacola,0,0,0
9,10,Customer Service Associate - Part Time,Unspecified,Unspecified,"Novitex Enterprise Solutions, formerly Pitney ...",The Customer Service Associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,Unspecified,0,1,...,High School or equivalent,Financial Services,Customer Service,0,US,AZ,Phoenix,0,0,0


In [32]:
jobdata = jobdata.drop(columns='salary_range')

In [33]:
jobdata.to_csv('reallyCleanData.csv', index=False)