In [60]:
'''This script is a part of Monash Sprint Challenge'''
   
__author__ = 'Sandeep Kumar Kola'
__email__ = 'sandeep.kola07@gmail.com' 

Date: 13/05/2018

Version: 1.0

Environment: Python 3.6 and Jupyter notebook

Libraries used:
* pandas (for dataframes, included in Anaconda Python 3.6)
* numpy (for arrays, included in Anaconda Python 3.6)
* re (for regex mathcing, included in Anaconda Python 3.6)
* fuzzywuzzy (for string matching, should be downloaded seperatley)
* Please use the command "pip install fuzzywuzzy" on your terminal to install fuzzywuzzy package on your machine.
* If you are using Anaconda or Jupyter notebook, please check if the installed fuzzywuzzy library is loaded properly.

## Monash Sprint Challenge Task 1 :
Auditing and Cleansing the Job dataset

#### 1) Import the libraries.

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import re

#### 2) Read the data using pandas. 

In [2]:
dataset1 = pd.read_csv("dataset1_with_error.csv")

# Let's make a copy of it and start using it.
dataset1_solution = dataset1

#### 3) Check columns

In [3]:
dataset1_solution.columns

Index(['Id', 'Title', 'Location', 'ContractType', 'ContractTime', 'Company',
       'Category', 'Salary per annum', 'SourceName', 'OpenDate', 'CloseDate'],
      dtype='object')

#### 4) Check first few rows

In [4]:
dataset1_solution.head(5)

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
0,12612628,Engineering Systems Analyst,Dorking,not available,permanent,Gregory Martin International,Engineering Jobs,24996,cv-library.co.uk,20121103T000000,20121203T000000
1,12612830,Stress Engineer Glasgow,Glasgow,not available,permanent,Gregory Martin International,Engineering Jobs,30000,cv-library.co.uk,20130108T150000,20130408T150000
2,12612844,Modelling and simulation analyst,Hampshire,not available,permanent,Gregory Martin International,Engineering Jobs,30000,cv-library.co.uk,20130726T150000,20130924T150000
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Surrey,not available,permanent,Gregory Martin International,Engineering Jobs,27504,cv-library.co.uk,20121214T000000,20130314T000000
4,12613647,"Pioneer, Miser Engineering Systems Analyst",Surrey,not available,permanent,Gregory Martin International,Engineering Jobs,24996,cv-library.co.uk,20131025T000000,20131224T000000


#### 5) Check for null values.

In [5]:
dataset1_solution.isnull().sum()

Id                     0
Title                  0
Location               0
ContractType           0
ContractTime           0
Company             3835
Category               0
Salary per annum       0
SourceName             0
OpenDate               0
CloseDate              0
dtype: int64

#### 6) Fix close date values.

In [6]:
# First repalce "T"
dataset1_solution["CloseDate"] = dataset1_solution["CloseDate"].replace("T","", regex=True)
# Covert close date to required format.
dataset1_solution["CloseDate"] = pd.to_datetime(dataset1_solution["CloseDate"], format="%Y%m%d%H%M%S", 
                 errors = "coerce")

In [7]:
# Check the values.
dataset1_solution["CloseDate"].head(5)

0   2012-12-03 00:00:00
1   2013-04-08 15:00:00
2   2013-09-24 15:00:00
3   2013-03-14 00:00:00
4   2013-12-24 00:00:00
Name: CloseDate, dtype: datetime64[ns]

#### 7) Fix open date values.

In [8]:
# First repalce "T"
dataset1_solution["OpenDate"] = dataset1_solution["OpenDate"].replace("T","", regex=True)
# Covert close date to required format.
dataset1_solution["OpenDate"] = pd.to_datetime(dataset1_solution["OpenDate"], format="%Y%m%d%H%M%S", 
                 errors = "coerce")
# Check the value.
dataset1_solution["CloseDate"].head(5)

0   2012-12-03 00:00:00
1   2013-04-08 15:00:00
2   2013-09-24 15:00:00
3   2013-03-14 00:00:00
4   2013-12-24 00:00:00
Name: CloseDate, dtype: datetime64[ns]

#### 8) Fix sourcename column.

In [9]:
# Take unique values into a seperate dataframe. 
SourceName = dataset1_solution["SourceName"].unique()
# Make this a dataframe.
SourceName = pd.DataFrame(SourceName)
# Add column name.
SourceName.columns = ["SourceName"]
# Sort the values.
SourceName = SourceName.sort_values(by="SourceName")

# Check the values for errors.
SourceName.head(5)

Unnamed: 0,SourceName
81,3desk.com
27,OilCareers.com
33,accountancyagejobs.com
77,actuaryjobs.co.uk
88,admin@caterer.com


In [10]:
# Fix admin@caterer.com.
error = dataset1_solution[dataset1_solution["SourceName"] == "admin@caterer.com"]
error

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
15379,68672352,Digital Account Manager **** plus a Great Bonus,South East London,not available,permanent,Blu Digital,Sales Jobs,27504,admin@caterer.com,2012-05-16,2012-07-15


In [11]:
# Let's look at other listings from this company.
Blu = dataset1_solution[dataset1_solution["Company"] == "Blu Digital"]
Blu

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
9047,68062012,Web Content / Email Manager,Bradford,not available,permanent,Blu Digital,"PR, Advertising & Marketing Jobs",28500,totaljobs.com,2013-12-08 15:00:00,2014-01-07 15:00:00
15379,68672352,Digital Account Manager **** plus a Great Bonus,South East London,not available,permanent,Blu Digital,Sales Jobs,27504,admin@caterer.com,2012-05-16 00:00:00,2012-07-15 00:00:00
16734,68686927,Senior Digital Account Manager Reading,Reading,not available,permanent,Blu Digital,Sales Jobs,37500,totaljobs.com,2012-05-09 15:00:00,2012-08-07 15:00:00
18850,68714304,Social Media Content Exec/Manager,Leeds,not available,permanent,Blu Digital,"PR, Advertising & Marketing Jobs",24000,totaljobs.com,2013-05-22 12:00:00,2013-08-20 12:00:00


In [12]:
# Fix up tha values.
dataset1_solution.SourceName.replace({"admin@caterer.com":"totaljobs.com"},inplace=True)

In [13]:
# Fix jobcareer value.
# Only one value, let's replace it straight away.
dataset1_solution[dataset1_solution["SourceName"] == "jobcareer"]

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
12871,68393935,Killer Javascript role Exclusive to Brightwater,Belfast,full_time,permanent,Brightwater Group,IT Jobs,39996,jobcareer,2012-02-05 15:00:00,2012-03-06 15:00:00


In [14]:
# Fixing it up.
dataset1_solution.SourceName.replace({"jobcareer":"jobcareer.com"},inplace=True)

In [15]:
# Fix monash student.
dataset1_solution[dataset1_solution["SourceName"] == "monashstudent"]

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
5663,66932999,Registered Midwives RM Lincolnshire Lincoln,Lincoln,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,40140,monashstudent,2013-01-24 15:00:00,2013-02-23 15:00:00


In [16]:
# Other job listings from this company.
A24 = dataset1_solution[dataset1_solution["Company"] == "The A24 Group"]
A24

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
5663,66932999,Registered Midwives RM Lincolnshire Lincoln,Lincoln,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,40140,monashstudent,2013-01-24 15:00:00,2013-02-23 15:00:00
11363,68300628,RSCN Maidstone,Maidstone,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,34500,staffnurse.com,2013-03-07 12:00:00,2013-04-06 12:00:00
11364,68300683,Registered General nurse Margate Margate,Margate,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,23004,staffnurse.com,2013-06-09 12:00:00,2013-08-08 12:00:00
11365,68300699,Registered General Nurse Hull,Hull,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,40320,staffnurse.com,2013-07-11 15:00:00,2013-08-10 15:00:00
11367,68300858,Registered Mental Nurse Hull,Hull,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,43200,staffnurse.com,2012-10-13 00:00:00,2012-12-12 00:00:00
11369,68300906,Registered General nurse Sittingbourne Sitti...,Sittingbourne,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,23004,staffnurse.com,2013-11-24 15:00:00,2013-12-08 15:00:00
11372,68302080,Registered General nurse Sevenoaks Sevenoaks,Sevenoaks,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,47040,staffnurse.com,2013-09-12 00:00:00,2013-11-11 00:00:00
11375,68302117,Registered General nurse Tonbridge Tonbridge,Tonbridge,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,47040,staffnurse.com,2012-12-10 12:00:00,2012-12-24 12:00:00
11377,68302180,Registered General nurse Tunbridge Wells Tun...,Tunbridge Wells,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,47040,staffnurse.com,2013-08-30 15:00:00,2013-09-29 15:00:00
11381,68302186,Registered General nurse Maidstone Maidstone,Maidstone,part_time,not available,The A24 Group,Healthcare & Nursing Jobs,47040,staffnurse.com,2012-10-21 15:00:00,2012-11-04 15:00:00


In [17]:
# Fix up the values.
dataset1_solution.SourceName.replace({"monashstudent":"staffnurse.com"},inplace=True)

#### 9) Fix salary per annum

In [18]:
# Upon inspeting the data we see that there are values which contain "K" 
# (such as 17k) and range values (such as: 1500 -1300)
salary_k = dataset1_solution[dataset1_solution["Salary per annum"].str.contains("K")]
salary_k.head()

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
133,46626860,Registered Home Manager Job Bournemouth,Bournemouth,full_time,not available,,Healthcare & Nursing Jobs,30K,careworx.co.uk,2013-03-29 15:00:00,2013-05-28 15:00:00
238,46627928,Care Home Manager Job North London ****K,London,full_time,not available,,Healthcare & Nursing Jobs,38K,careworx.co.uk,2012-11-07 15:00:00,2013-02-05 15:00:00
305,46628805,Home Care Workers Berkhampsted,UK,part_time,not available,,Healthcare & Nursing Jobs,14K,careworx.co.uk,2012-09-05 15:00:00,2012-11-04 15:00:00
596,46634306,Home Manager Mental Health 11 bed,Wales,not available,not available,,Healthcare & Nursing Jobs,24K,careworx.co.uk,2012-09-13 12:00:00,2012-11-12 12:00:00
830,46637596,Staff Nurse South Shields ****,South Shields,not available,not available,,Healthcare & Nursing Jobs,23K,careworx.co.uk,2012-04-29 15:00:00,2012-05-13 15:00:00


In [19]:
# Fix the values.
dataset1_solution["Salary per annum"].replace('K', '000', regex=True, inplace=True)

In [20]:
salary_range = dataset1_solution[dataset1_solution["Salary per annum"].str.contains("-")]
salary_range.head()

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
647,46634923,RGN Nurse Hull Days or Nights **** per hour,UK,full_time,not available,,Healthcare & Nursing Jobs,20896.2 - 23095.8,careworx.co.uk,2013-06-05 15:00:00,2013-09-03 15:00:00
896,48082563,Senior Chef de Partie One AA Rosette Hotel T...,Cumbria,not available,not available,Chef Results,Hospitality & Catering Jobs,16153.8 - 17854.2,caterer.com,2013-01-09 15:00:00,2013-01-23 15:00:00
980,49689021,Assessment Officer,Kent,not available,not available,,Healthcare & Nursing Jobs,23712.0 - 26208.0,careworx.co.uk,2012-02-20 00:00:00,2012-03-21 00:00:00
1062,51061201,Community Home Workers Melton Mowbray,Melton Mowbray,part_time,not available,,Healthcare & Nursing Jobs,16416.0 - 18144.0,careworx.co.uk,2013-10-23 12:00:00,2013-11-06 12:00:00
1141,52489008,RGN Care Home,Northwich,not available,not available,,Healthcare & Nursing Jobs,22321.2 - 24670.8,careworx.co.uk,2013-09-28 00:00:00,2013-11-27 00:00:00


In [21]:
# The startegy is to impute the mean of range values by taking these value into a new dataframe and adding the
# Mean values and then appending bakc to dataset1_solution.
# Reset index of new data frame.
salary_range.reset_index(drop=True, inplace=True)
# Split the dataset at "-".
salary_range_values = pd.DataFrame(salary_range["Salary per annum"].str.split("-",1).tolist(),
                                   columns = ['Min','Max'])
# Covert to numeric.
salary_range_values[['Min','Max']] = salary_range_values[['Min','Max']].apply(pd.to_numeric)
# Take mean and append as a new column.
salary_range_values["Mean"] = salary_range_values[['Min','Max']].mean(axis=1)

# Change column name.
salary_range["Salary per annum"] = salary_range_values["Mean"]
salary_range.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,Salary per annum,SourceName,OpenDate,CloseDate
0,46634923,RGN Nurse Hull Days or Nights **** per hour,UK,full_time,not available,,Healthcare & Nursing Jobs,21996.0,careworx.co.uk,2013-06-05 15:00:00,2013-09-03 15:00:00
1,48082563,Senior Chef de Partie One AA Rosette Hotel T...,Cumbria,not available,not available,Chef Results,Hospitality & Catering Jobs,17004.0,caterer.com,2013-01-09 15:00:00,2013-01-23 15:00:00
2,49689021,Assessment Officer,Kent,not available,not available,,Healthcare & Nursing Jobs,24960.0,careworx.co.uk,2012-02-20 00:00:00,2012-03-21 00:00:00
3,51061201,Community Home Workers Melton Mowbray,Melton Mowbray,part_time,not available,,Healthcare & Nursing Jobs,17280.0,careworx.co.uk,2013-10-23 12:00:00,2013-11-06 12:00:00
4,52489008,RGN Care Home,Northwich,not available,not available,,Healthcare & Nursing Jobs,23496.0,careworx.co.uk,2013-09-28 00:00:00,2013-11-27 00:00:00


In [22]:
# First remove the values containing "-".
dataset1_solution = dataset1_solution[dataset1_solution["Salary per annum"].str.contains("-") == False]
# Change datatypes to numeric.
dataset1_solution[['Salary per annum']] = dataset1_solution[['Salary per annum']].apply(pd.to_numeric)
# Append the values.
dataset1_solution = pd.concat([dataset1_solution, salary_range], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [23]:
# Change column name to a proper name.
dataset1_solution.rename(columns={'Salary per annum':'SalaryPerAnnum'}, inplace=True)
dataset1_solution.head(5)

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
0,12612628,Engineering Systems Analyst,Dorking,not available,permanent,Gregory Martin International,Engineering Jobs,24996.0,cv-library.co.uk,2012-11-03 00:00:00,2012-12-03 00:00:00
1,12612830,Stress Engineer Glasgow,Glasgow,not available,permanent,Gregory Martin International,Engineering Jobs,30000.0,cv-library.co.uk,2013-01-08 15:00:00,2013-04-08 15:00:00
2,12612844,Modelling and simulation analyst,Hampshire,not available,permanent,Gregory Martin International,Engineering Jobs,30000.0,cv-library.co.uk,2013-07-26 15:00:00,2013-09-24 15:00:00
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Surrey,not available,permanent,Gregory Martin International,Engineering Jobs,27504.0,cv-library.co.uk,2012-12-14 00:00:00,2013-03-14 00:00:00
4,12613647,"Pioneer, Miser Engineering Systems Analyst",Surrey,not available,permanent,Gregory Martin International,Engineering Jobs,24996.0,cv-library.co.uk,2013-10-25 00:00:00,2013-12-24 00:00:00


#### 10) Let's look at category values.

In [24]:
# Take unique into a new dataframe and naming columns and sorting.
Category = dataset1_solution["Category"].unique()
Category = pd.DataFrame(Category)
Category.columns = ["Category"]
Category = Category.sort_values(by="Category")
Category

Unnamed: 0,Category
1,Accounting & Finance Jobs
0,Engineering Jobs
2,Healthcare & Nursing Jobs
3,Hospitality & Catering Jobs
4,IT Jobs
7,"PR, Advertising & Marketing Jobs"
5,Sales Jobs
6,Teaching Jobs


* There seems to be no erros.

#### 11) Let's look at company column.


In [25]:
# Take unique into a new dataframe and naming columns and sorting.
Company = dataset1_solution["Company"].unique()
Company = pd.DataFrame(Company)
Company.columns = ["Company"]
Company = Company.sort_values(by="Company")
Company.head(5)

Unnamed: 0,Company
4283,.
475,1 1 Recruitment Limited
4470,100 percent
110,121 International Recruitment Limited
4122,16 West Street


In [26]:
# Look the values in dataframe.
dataset1_solution[dataset1_solution["Company"] == "."]

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
20391,68849372,Apply Today Start Tomorrow,UK,not available,permanent,.,"PR, Advertising & Marketing Jobs",18000.0,jobstoday.co.uk,2013-07-01 12:00:00,2013-08-30 12:00:00


In [27]:
# Fix values.
dataset1_solution.Company.replace({'.': np.NaN},inplace=True)

In [28]:
dataset1_solution[dataset1_solution["Company"] == "@ITS  Limited"]

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
20319,68844688,Network Engineer Cisco CCNA,London,not available,contract,@ITS Limited,IT Jobs,57000.0,londonjobs.co.uk,2012-08-07,2012-09-06


In [29]:
# From the title it seems to be Cisco CCNA as the company name.
dataset1_solution.Company.replace({"@ITS  Limited":"Cisco CCNA @ITS Limited"},inplace=True)
dataset1_solution.Title.replace({"Network Engineer Cisco CCNA":"Network Engineer"},inplace=True)

#### 12) Fix contract time.

In [30]:
# check unique values.
dataset1_solution["ContractTime"].unique()

array(['permanent', 'not available', 'contract'], dtype=object)

In [31]:
# 3 unique values 
# Change Not available to Non Specified.
dataset1_solution.ContractTime.replace({'not available':'non-specified'},inplace=True)

#### 13) Fix contract type column.

In [32]:
# Unique values
dataset1_solution["ContractType"].unique()

array(['not available', 'full_time', 'part_time'], dtype=object)

In [33]:
# 3 unique values 
# Change values as per the description given.
dataset1_solution.ContractType.replace({'not available':'non-specified'},inplace=True)
dataset1_solution.ContractType.replace({'full_time':'full-time'},inplace=True)
dataset1_solution.ContractType.replace({'part_time':'part-time'},inplace=True)

#### 14) Fix location values.

In [34]:
# Aim is to check if there are any misspelt words by string matching.
# Take unique location values into a new dataframe.
location = dataset1_solution["Location"].unique()
location = pd.DataFrame(location)
location.columns = ["location"]
location = location.sort_values(by="location")
location.head(5)

Unnamed: 0,location
72,Aberdeen
143,Aberdeenshire
361,Abingdon
313,Accrington
443,Addlestone


In [35]:
# Make a list of location values.
location_values = list(location["location"])

In [36]:
possible_matches  = []
for i in location_values :
    temp = process.extract(i, location["location"], limit=2, scorer=fuzz.token_sort_ratio)
    
    if temp[1][1] > 75:
        possible_matches.append({i : temp[1][0]})
# Look art few values.
possible_matches[1:10]

[{'Aberdeenshire': 'Aberdeen'},
 {'Accrington': 'Darlington'},
 {'Alcester': 'Leicester'},
 {'Alfreton': 'Alton'},
 {'Alton': 'Luton'},
 {'Andover': 'Dover'},
 {'Axbridge': 'Uxbridge'},
 {'Aylesbury': 'Salisbury'},
 {'Barnsley': 'Burnley'}]

* Upon carefully observing this dataset, the below are found to be potential mispelt or same but different names.
* It needs further investigation.
* Aberdeenshire Aberdeen
* Buckinghamshire Buckingham
* Byfleet Fleet
* Harrow Harlow
* Leeds Leads
* Lewes Leeds
* Oxfords Oxford
* Reading Reeding
* Sure Surrey

#### Let's check frequency of the location in each case to see if it is an error or not.

In [37]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Aberdeenshire"]
len(Check_data)

19

In [38]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Aberdeen"]
len(Check_data)

141

* Let's assume both are differert values as we don't have any extra information from the description.

In [39]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Buckingham"]
len(Check_data)

3

In [40]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Buckinghamshire"]
len(Check_data)

100

* Let's assume both are differert values as we don't have any extra information from the description and cannot confirm the same.

In [41]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Byfleet"]
len(Check_data)

14

In [42]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Fleet"]
len(Check_data)

35

* Both are different.

In [43]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Harlow"]
Check_data

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
153,46627118,Maths Teacher Job Harlow,Harlow,full-time,non-specified,,Healthcare & Nursing Jobs,26496.0,careworx.co.uk,2013-01-14 00:00:00,2013-03-15 00:00:00
537,46633676,Support Worker Job Harlow,Harlow,full-time,non-specified,,Healthcare & Nursing Jobs,16248.0,careworx.co.uk,2012-10-11 15:00:00,2012-12-10 15:00:00
2304,62120819,Science Teacher in Harlow,Harlow,full-time,permanent,Support Services Group,Teaching Jobs,36000.0,fish4.co.uk,2012-11-24 00:00:00,2013-01-23 00:00:00
21202,69006499,Key Stage 2 Teacher required in Harlow,Harlow,non-specified,contract,Simply Education,Teaching Jobs,27996.0,cv-library.co.uk,2012-04-18 15:00:00,2012-05-18 15:00:00


In [44]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Harrow"]
Check_data.head(3)

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
242,46627932,Home Care Assistant Jobs Harrow,Harrow,part-time,non-specified,,Healthcare & Nursing Jobs,16320.0,careworx.co.uk,2012-01-23 00:00:00,2012-02-06 00:00:00
633,46634804,Home Care Workers Harrow,Harrow,part-time,non-specified,,Healthcare & Nursing Jobs,13440.0,careworx.co.uk,2013-01-24 15:00:00,2013-02-23 15:00:00
635,46634811,LiveIn Care Worker North West London,Harrow,full-time,non-specified,,Healthcare & Nursing Jobs,20400.0,careworx.co.uk,2013-12-07 00:00:00,2014-02-05 00:00:00


* Both are different as seen from the title values.

In [45]:
# Check few rows
Check_data = dataset1_solution[dataset1_solution["Location"] == "Leads"]
Check_data


Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
8831,68056649,Home Carers / Home Care Assistants,Leads,non-specified,permanent,Health & Social Care Jobs Ltd,Healthcare & Nursing Jobs,15744.0,totaljobs.com,2012-12-31 12:00:00,2013-01-30 12:00:00
16755,68687721,Business Development Manager (Collections & Re...,Leads,non-specified,permanent,Mobilus Limited.,Accounting & Finance Jobs,35004.0,totaljobs.com,2012-09-12 12:00:00,2012-12-11 12:00:00
18195,68706955,Team LeadersCaffe RitazzaLeeds Bradford Airport,Leads,non-specified,non-specified,SSP,Hospitality & Catering Jobs,12504.0,jobs.catererandhotelkeeper.com,2013-12-29 15:00:00,2014-01-12 15:00:00
24223,69182139,"Sous Chef Sleek, Stylish, Contemporary Brande...",Leads,non-specified,non-specified,Bee Recruitment London Ltd,Hospitality & Catering Jobs,24000.0,jobs.catererandhotelkeeper.com,2012-09-02 00:00:00,2012-09-16 00:00:00


In [46]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Leeds"]
len(Check_data)

343

In [47]:
# This seems to be an error. Let's fix it.
dataset1_solution.Location.replace({'Leads':'Leeds'},inplace=True)

In [48]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Lewes"]
len(Check_data)

13

* Diiferent values

In [49]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Oxford"]
len(Check_data)

190

In [50]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Oxfords"]
Check_data

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
4864,66699137,Cluster Cafe Manager,Oxfords,non-specified,non-specified,Platinum Recruitment Consultancy,Hospitality & Catering Jobs,32004.0,caterer.com,2013-06-23 00:00:00,2013-07-23 00:00:00
11253,68299716,Deputy Home Manager (Nursing) Oxon ****k Oxford,Oxfords,full-time,non-specified,Kare Plus Agencies Limited,Healthcare & Nursing Jobs,36000.0,staffnurse.com,2012-11-27 00:00:00,2012-12-27 00:00:00
12150,68359143,Senior Java Developer Groovy / Grails Oxford...,Oxfords,non-specified,permanent,CV Screen Ltd,IT Jobs,21000.0,cwjobs.co.uk,2012-05-21 12:00:00,2012-08-19 12:00:00
22574,69043228,"Development Manager,Java, Enterprise",Oxfords,non-specified,permanent,Haybrook IT Resourcing Ltd,IT Jobs,67500.0,cwjobs.co.uk,2013-11-08 12:00:00,2014-01-07 12:00:00


In [51]:
# This seems to be an error. Let's fix it.
dataset1_solution.Location.replace({'Oxfords':'Oxford'},inplace=True)

In [52]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Reeding"]
Check_data

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
269,46628099,Health Care Assistant Jobs Reading,Reeding,part-time,non-specified,,Healthcare & Nursing Jobs,18240.0,careworx.co.uk,2012-01-16 15:00:00,2012-03-16 15:00:00
7693,67770741,VBNET Web Developer pound;**** pound;**** nda...,Reeding,non-specified,non-specified,,IT Jobs,36996.0,britishjobsonthe.net,2013-01-02 12:00:00,2013-01-16 12:00:00
9621,68096028,Excel Specialist (6 month contract),Reeding,non-specified,contract,Toner Graham,Accounting & Finance Jobs,39360.0,totaljobs.com,2012-06-09 00:00:00,2012-06-23 00:00:00
24157,69174119,Online User Experience Manager,Reeding,non-specified,permanent,Michael Page Marketing,"PR, Advertising & Marketing Jobs",47004.0,totaljobs.com,2013-08-30 15:00:00,2013-09-29 15:00:00


In [53]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Reading"]
len(Check_data)

300

In [54]:
# This seems to be an error. Let's fix it.
dataset1_solution.Location.replace({'Reeding':'Reading'},inplace=True)

In [55]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Surrey"]
len(Check_data)

317

In [56]:
Check_data = dataset1_solution[dataset1_solution["Location"] == "Surey"]
Check_data

Unnamed: 0,Id,Title,Location,ContractType,ContractTime,Company,Category,SalaryPerAnnum,SourceName,OpenDate,CloseDate
122,46626802,Nursing Home Staff Nurse Job Surrey,Surey,full-time,non-specified,,Healthcare & Nursing Jobs,23040.0,careworx.co.uk,2013-05-12 12:00:00,2013-08-10 12:00:00
10289,68197473,"Brand Manager, Luxury Travel",Surey,non-specified,non-specified,,"PR, Advertising & Marketing Jobs",42504.0,onlymarketingjobs.com,2012-09-02 12:00:00,2012-10-02 12:00:00
14294,68591031,Assistant Rest ManagerBranded RestSurrey ****k,Surey,non-specified,non-specified,"Berkeley Scott Pubs, Bars & Restaurants",Hospitality & Catering Jobs,23496.0,caterer.com,2013-06-01 00:00:00,2013-07-01 00:00:00
19824,68822692,Field Application Engineer RFID,Surey,non-specified,non-specified,Redline Group,Sales Jobs,54996.0,gojobsearch.co.uk,2013-04-30 00:00:00,2013-05-14 00:00:00


In [57]:
# This seems to be an error. Let's fix it.
dataset1_solution.Location.replace({'Surey':'Surrey'},inplace=True)

* Assuming all special characters are not errors in the column Title, there seems to be no errors.
* There are also no errors in id column.

In [None]:
# Let's create a csv file of the solution.
dataset1_solution.to_csv('dataset1_solution.csv', sep=',',index=False)
print("Check the working directory for the file dataset1_solution.csv")

### End of Challenge Task 1