# Cleaning Many Datasets

In [2]:
import dask, dask.dataframe as dd, dask.array as da
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import pandas as pd
import nltk, re, csv, os
import numpy as np
from dask import delayed, persist
from dask.distributed import Client
from glob import glob

# Allow very large CSV fields (up to 10,000,000 characters) and show every
# DataFrame column when a frame is displayed.
csv.field_size_limit(10_000_000)
pd.options.display.max_columns = None

%matplotlib inline

The following is one of the most important pieces of the puzzle for achieving parallelization with Dask: the `Client`. This is the object that manages your workers. Your workers, in turn, manage the threads on your machine.

Your computer has a CPU that holds several cores. Dask tries to send all parallelizable computations to as many cores as you tell it to, and because you are limited by the resources of your computer, it is crucial to get this step right.

Let's first check how many cores we have by using the following code.

In [None]:
# Number of CPUs reported by the operating system; use it to size the Client below.
os.cpu_count()

In [2]:
# A single worker process running four threads, capped at 7 GB of memory.
cluster_options = dict(
    n_workers=1,
    threads_per_worker=4,
    memory_limit='7GB',
)
client = Client(**cluster_options)
client

0,1
Client  Scheduler: tcp://127.0.0.1:55822  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 7.00 GB


In [3]:
# Lazy 10,000 x 10,000 array of uniform random float64 values (~800 MB),
# split into chunks of at most 16 MiB. The shape matches the summary
# recorded below this cell; a 100,000-square array would be ~80 GB, far
# beyond the 7 GB worker memory limit configured above.
x = da.random.random((10000, 10000), chunks='16 MiB')
x

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,12.50 MB
Shape,"(10000, 10000)","(1250, 1250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 12.50 MB Shape (10000, 10000) (1250, 1250) Count 64 Tasks 64 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,12.50 MB
Shape,"(10000, 10000)","(1250, 1250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray


In [4]:
%%time

y = (x + x.T) - x.mean(axis=0)
y.sum().compute()

CPU times: user 2.39 s, sys: 551 ms, total: 2.94 s
Wall time: 736 ms


50003353.18545063

In [5]:
# Drop the demo arrays so their names no longer reference the task graphs.
del y, x

## What to do before running these cells

In [7]:
# Directory holding the raw 'da*.csv' input files; the cleaned output is later
# written to a 'clean' sub-directory of this same location.
# NOTE(review): absolute path to an external volume - adjust it for your machine.
path_in = '/Volumes/LaCie SSD/bgdata/data_19/test/'
path_in

'/Volumes/LaCie SSD/bgdata/data_19/test/'

In [8]:
# Columns to keep when reading the raw CSV files (passed as `usecols`).
best_list = [
    'JobID', 'CleanJobTitle', 'CanonCity', 'CanonState', 'JobDate', 'JobText',
    'Source', 'CanonEmployer', 'Latitude', 'Longitude', 'CanonIntermediary',
    'CanonJobTitle', 'CanonCounty', 'DivisionCode', 'MSA', 'LMA',
    'InternshipFlag', 'ConsolidatedONET', 'CanonSkillClusters', 'CanonSkills',
    'IsDuplicate', 'CanonMinimumDegree', 'CanonRequiredDegrees', 'CIPCode',
    'MinExperience', 'ConsolidatedInferredNAICS', 'BGTOcc', 'MaxAnnualSalary',
    'MaxHourlySalary', 'MinAnnualSalary', 'MinHourlySalary',
    'YearsOfExperience', 'CanonJobHours', 'CanonJobType', 'CanonPostalCode',
    'CanonYearsOfExperienceCanonLevel', 'CanonYearsOfExperienceLevel',
    'ConsolidatedTitle', 'Language', 'BGTSubOcc', 'ConsolidatedDegreeLevels',
    'MaxDegreeLevel', 'MinDegreeLevel',
]

In [9]:
# Every known column of the raw files is read as a plain string.
# The builtin `str` is used instead of `np.str`: the latter was only an alias
# for `str`, was deprecated in NumPy 1.20 and no longer exists in current
# NumPy releases, so referencing it raises AttributeError.
string_columns = [
    'JobID', 'CleanJobTitle', 'JobDomain',
    'CanonCity', 'CanonCountry', 'CanonState',
    'JobText', 'JobURL', 'PostingHTML',
    'Source', 'JobReferenceID', 'Email',
    'CanonEmployer', 'Latitude', 'Longitude',
    'CanonIntermediary', 'Telephone', 'CanonJobTitle',
    'CanonCounty', 'DivisionCode', 'MSA', 'LMA',
    'InternshipFlag', 'ConsolidatedONET', 'CanonCertification',
    'CanonSkillClusters', 'CanonSkills', 'IsDuplicate',
    'IsDuplicateOf', 'CanonMaximumDegree', 'CanonMinimumDegree',
    'CanonOtherDegrees', 'CanonPreferredDegrees',
    'CanonRequiredDegrees', 'CIPCode', 'StandardMajor',
    'MaxExperience', 'MinExperience', 'ConsolidatedInferredNAICS',
    'BGTOcc', 'MaxAnnualSalary', 'MaxHourlySalary',
    'MinAnnualSalary', 'MinHourlySalary', 'YearsOfExperience',
    'CanonJobHours', 'CanonJobType', 'CanonPostalCode',
    'CanonYearsOfExperienceCanonLevel', 'CanonYearsOfExperienceLevel',
    'ConsolidatedTitle', 'Language', 'BGTSubOcc', 'JobDate',
    'ConsolidatedDegreeLevels', 'MaxDegreeLevel', 'MinDegreeLevel',
]
# Column name -> dtype mapping handed to the CSV reader.
dtypes = dict.fromkeys(string_columns, str)

In [10]:
# Lazily read every 'da*.csv' file under path_in, keeping only the columns
# listed in best_list and applying the dtypes mapping.
csv_pattern = os.path.join(path_in, 'da*.csv')
ddf = dd.read_csv(
    csv_pattern,
    usecols=best_list,
    dtype=dtypes,
    engine='python',
    assume_missing=True,
    # Malformed lines are skipped rather than raising.
    # NOTE(review): pandas >= 1.3 deprecates this flag in favour of
    # on_bad_lines='skip' - switch when the environment is upgraded.
    error_bad_lines=False,
    blocksize=None,
    # Options tried earlier and left disabled:
    #   sample=250_000_000, encoding='latin-1' / 'utf-8', parse_dates=['JobDate']
)
ddf

Skipping line 49: unexpected end of data


Unnamed: 0_level_0,JobID,CleanJobTitle,CanonCity,CanonState,JobDate,JobText,Source,CanonEmployer,Latitude,Longitude,CanonIntermediary,CanonJobTitle,CanonCounty,DivisionCode,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,IsDuplicate,CanonMinimumDegree,CanonRequiredDegrees,CIPCode,MinExperience,ConsolidatedInferredNAICS,BGTOcc,MaxAnnualSalary,MaxHourlySalary,MinAnnualSalary,MinHourlySalary,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MaxDegreeLevel,MinDegreeLevel
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [11]:
%%time

ddf00 = ddf.repartition(npartitions=70)

CPU times: user 761 µs, sys: 31 µs, total: 792 µs
Wall time: 787 µs


In [12]:
%%time

# Preview the first rows; unlike the repartition above, this triggers actual computation.
ddf00.head()

CPU times: user 32.4 s, sys: 10.3 s, total: 42.7 s
Wall time: 50.2 s


Unnamed: 0,JobID,CleanJobTitle,CanonCity,CanonState,JobDate,JobText,Source,CanonEmployer,Latitude,Longitude,CanonIntermediary,CanonJobTitle,CanonCounty,DivisionCode,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,IsDuplicate,CanonMinimumDegree,CanonRequiredDegrees,CIPCode,MinExperience,ConsolidatedInferredNAICS,BGTOcc,MaxAnnualSalary,MaxHourlySalary,MinAnnualSalary,MinHourlySalary,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MaxDegreeLevel,MinDegreeLevel
0,38472243834,Sdet,San Francisco,CA,2019-01-01,SDET\n\nABOTTS Consulting\n\n-\n\nSan Francisc...,Job Board,,37.7798,-122.417,,,San Francisco,41884.0,41860: Metropolitan Statistical Area|488: Comb...,DV064188|MT064186,0,17205100,Specialized Skills|Information Technology: Sof...,"{'Analytical Skills': 'Specialized Skills', 'A...",False,Bachelor's in Computer Science,Bachelor's|Bachelor's in Computer Science,110701.0,5.0,,17-2051.00,,,,,5+ years|6 years,fulltime,temporary,94101,1-6,mid,Sdet,en,Civil Engineer,16.0,,16.0
1,38472243883,Skilled Nursing Biller,Kannapolis,NC,2019-01-01,Skilled Nursing Biller\n\nGatewood Healthcare ...,Job Board,,35.4971,-80.65,,,Cabarrus,,16740: Metropolitan Statistical Area,MT371674,0,29114100,Finance: Billing and Invoicing;Specialized Ski...,{'Billing': 'Finance: Billing and Invoicing;Sp...,False,,,,2.0,6231.0,29-1141.00,,,,,Minimum of two years,fulltime,permanent,28081,1-6,mid,Nursing Biller,en,Registered Nurse,,,
2,38472243901,Senior Engineer,Cincinnati,OH,2019-01-01,View All num of num Close (Esc)\n\nGreater Cin...,Job Board,Greater Cincinnati Water Works,39.1072,-84.5004,,,Hamilton,,17140: Metropolitan Statistical Area,MT391714,0,17205100,Specialized Skills|Architecture and Constructi...,"{'Calculation': 'Specialized Skills', 'Cost Es...",False,Bachelor's,Bachelor's,,,,17-2051.00,96535.1,46.41,71831.1,34.53,one year,fulltime,permanent,45201,,,Senior Engineer,en,Civil Engineer,16.0,,16.0
3,38472243915,Customer Service-Restaurant,Cincinnati,OH,2019-01-01,Popeyes Logo\n\nCustomer Service-Restaurant\n\...,Job Board,Popeyes,39.1072,-84.5004,,,Hamilton,,17140: Metropolitan Statistical Area,MT391714,0,43405100,Customer and Client Support: Cash Register Ope...,{'Cash Handling': 'Customer and Client Support...,False,,,,,722513.0,43-4051.00,,,,,,,,45201,,,Customer Service-Restaurant,en,Customer Service Representative (General),,,
4,38472243879,Companion Aide,Charlotte,NC,2019-01-01,Companion Aide\n\nThe Cypress of Charlotte Clu...,Job Board,,35.1943,-80.8266,,Companion Aide,Mecklenburg,,16740: Metropolitan Statistical Area,MT371674,0,39902100,Common Skills|Health Care: Basic Living Activi...,"{'Communication Skills': 'Common Skills', 'Com...",False,Higher Secondary Certificate,Higher Secondary Certificate,,1.0,7139.0,39-9021.00,,,,,1 year,parttime,permanent,28201,0-1,low,Companion Aide,en,Caregiver / Personal Care Aide,12.0,,12.0


In [13]:
%%time

# Preview the last rows; this also triggers actual computation.
ddf00.tail()

CPU times: user 47.1 s, sys: 21.2 s, total: 1min 8s
Wall time: 1min 33s


Unnamed: 0,JobID,CleanJobTitle,CanonCity,CanonState,JobDate,JobText,Source,CanonEmployer,Latitude,Longitude,CanonIntermediary,CanonJobTitle,CanonCounty,DivisionCode,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,IsDuplicate,CanonMinimumDegree,CanonRequiredDegrees,CIPCode,MinExperience,ConsolidatedInferredNAICS,BGTOcc,MaxAnnualSalary,MaxHourlySalary,MinAnnualSalary,MinHourlySalary,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MaxDegreeLevel,MinDegreeLevel
441294,38555508265,Sales And Visual Sales,Austin,TX,2019-07-08,"Part-time Sales and Visual Sales in Austin, Te...",Company,Container Store,30.2202,-97.7492,,,Travis,,12420: Metropolitan Statistical Area,MT481242,0,41401200,Specialized Skills|Common Skills|Specialized S...,"{'Cleaning': 'Specialized Skills', 'Communicat...",False,,,,,453998.0,41-4011.00,,,,,,parttime,permanent,73301.0,,,Visual Sales,en,Sales Representative,,,
441295,38555508266,Long Term Elementary Substitute Teacher 1 0 Ft...,,UT,2019-07-08,LONG TERM ELEMENTARY SUBSTITUTE TEACHER 1.0 FT...,Company,Granite School District,,,,Substitute Teacher,,,,,0,25309900,Education and Training: Teaching;Specialized S...,{'Lesson Planning': 'Education and Training: T...,False,,,,,6111.0,25-3099.00,,,,,,fulltime,temporary,,,,Substitute Teacher,en,Substitute Teacher,,,
441296,38555508284,Post-Doctoral Scholar In Water Resources Policy,Irvine,CA,2019-07-08,Postdoctoral Scholar in Water Resources Policy...,Job Board,Irvine,33.7425,-117.747,,,Orange,11244.0,31080: Metropolitan Statistical Area|348: Comb...,DV064204|MT063110,0,19201100,Common Skills|Specialized Skills|Analysis: Dat...,"{'Communication Skills': 'Common Skills', 'Cre...",False,Doctor of Philosophy,Doctor of Philosophy|Doctorate,,,6113.0,,,,,,,fulltime,permanent,92602.0,,,"Doctor/Scholar, Water Resources,Policy",en,,21.0,,21.0
441297,38555508301,Senior Design Engineer,San Diego,CA,2019-07-08,req11053 \n Senior Design Engineer 2 \n \n \n ...,Company,Asml Holding N V,32.7211,-117.164,,,San Diego,,41740: Metropolitan Statistical Area,MT064174,0,17214100,Finance: Budget Management;Specialized Skills|...,{'Budgeting': 'Finance: Budget Management;Spec...,False,Bachelor's,Bachelor's,141901.0,7.0,,17-2141.00,,,,,3-7 years|7 years,,,92101.0,6+,high,Senior Design Engineer,en,Mechanical Design Engineer,16.0,,16.0
441298,38555508302,Senior Supply Chain Specialist,Greensboro,NC,2019-07-08,Sr. Supply Chain Specialist\n\nCompany: N/A\n\...,Job intermediary,,35.0033,-79.3376,Belcan,Supply Chain Specialist,Guilford,,24660: Metropolitan Statistical Area,MT372466,0,13108100,Business: Business Strategy;Specialized Skills...,{'Business Planning': 'Business: Business Stra...,False,Bachelor's,Bachelor's,,2.0,,13-1081.00,,,,,,,,27395.0,1-6,mid,Supply Chain Specialist,en,Supply Chain Specialist,16.0,,16.0


In [14]:
# Confirm the partition count after repartitioning.
ddf00.npartitions

70

In [15]:
# Postings with no employer but a known intermediary are labelled
# 'Recruitment Agency'; every other row keeps its original employer value.
employer_missing = ddf00['CanonEmployer'].isnull()
has_intermediary = ddf00['CanonIntermediary'].notnull()
EmployerCondition = employer_missing & has_intermediary
EmployerClean = ddf00['CanonEmployer'].where(~EmployerCondition, 'Recruitment Agency')

# Swap the raw employer column for the cleaned one.
ddf_clean0 = ddf00.drop('CanonEmployer', axis=1)
ddf_clean01 = ddf_clean0.assign(EmployerClean=EmployerClean)

In [16]:
%%time

missing_count = ((ddf_clean01.isna().sum() / ddf_clean01.index.size) * 100)
missing_count_pct = missing_count.compute()
missing_count_pct

CPU times: user 1min 45s, sys: 2min 52s, total: 4min 38s
Wall time: 8min 24s


JobID                                0.000000
CleanJobTitle                        0.009820
CanonCity                            0.920248
CanonState                           0.035404
JobDate                              0.000000
JobText                              0.000258
Source                               3.917837
Latitude                             0.885490
Longitude                            0.885490
CanonIntermediary                   89.286517
CanonJobTitle                       36.449668
CanonCounty                          0.922444
DivisionCode                        66.320508
MSA                                  3.031442
LMA                                  1.173503
InternshipFlag                       0.000000
ConsolidatedONET                     3.736940
CanonSkillClusters                   6.337719
CanonSkills                          0.000000
IsDuplicate                          0.000000
CanonMinimumDegree                  49.592336
CanonRequiredDegrees              

In [17]:
# Columns where at least 60% of the values are missing are discarded entirely.
mostly_missing = missing_count_pct >= 60
cols_to_drop = missing_count_pct.loc[mostly_missing].index.tolist()
cols_to_drop

['CanonIntermediary',
 'DivisionCode',
 'CIPCode',
 'MaxAnnualSalary',
 'MaxHourlySalary',
 'MinAnnualSalary',
 'MinHourlySalary',
 'MaxDegreeLevel']

In [18]:
# Remove the columns selected in cols_to_drop, then default any missing
# language to English.
# (An earlier variant filled with the most frequent value taken from
# ddf_clean1['Language'].value_counts() instead of the hard-coded 'en'.)
ddf_clean1 = ddf_clean01.drop(cols_to_drop, axis=1)
language_default = {'Language': 'en'}
ddf_clean2 = ddf_clean1.fillna(language_default)

In [19]:
# Columns with a small but non-zero share of missing values
# (0% < missing < 10%); used as the subset for row-wise dropna.
few_missing = (missing_count_pct > 0) & (missing_count_pct < 10)
rows_to_drop = missing_count_pct.loc[few_missing].index.tolist()
rows_to_drop

['CleanJobTitle',
 'CanonCity',
 'CanonState',
 'JobText',
 'Source',
 'Latitude',
 'Longitude',
 'CanonCounty',
 'MSA',
 'LMA',
 'ConsolidatedONET',
 'CanonSkillClusters',
 'BGTOcc',
 'CanonPostalCode',
 'ConsolidatedTitle',
 'BGTSubOcc',
 'EmployerClean']

In [20]:
# Drop every row that has a missing value in any of the rows_to_drop columns.
ddf_clean3 = ddf_clean2.dropna(subset=rows_to_drop)

In [21]:
# Columns missing between 10% (inclusive) and 60% (exclusive) of their values
# are kept, with the gaps filled by the placeholder 'Unknown'.
moderately_missing = (missing_count_pct >= 10) & (missing_count_pct < 60)
remaining_cols_to_clean = missing_count_pct.loc[moderately_missing].index.tolist()
unknown_default_dict = {column_name: 'Unknown' for column_name in remaining_cols_to_clean}
unknown_default_dict

{'CanonJobTitle': 'Unknown',
 'CanonMinimumDegree': 'Unknown',
 'CanonRequiredDegrees': 'Unknown',
 'MinExperience': 'Unknown',
 'ConsolidatedInferredNAICS': 'Unknown',
 'YearsOfExperience': 'Unknown',
 'CanonJobHours': 'Unknown',
 'CanonJobType': 'Unknown',
 'CanonYearsOfExperienceCanonLevel': 'Unknown',
 'CanonYearsOfExperienceLevel': 'Unknown',
 'ConsolidatedDegreeLevels': 'Unknown',
 'MinDegreeLevel': 'Unknown'}

In [22]:
# Fill the remaining gaps column by column using the per-column defaults built above.
ddf_clean4 = ddf_clean3.fillna(unknown_default_dict)

In [23]:
# print(ddf_clean4.isnull().sum().compute())
# ddf_clean4.persist()

In [24]:
# ddf_clean4['Language'].value_counts().compute()

In [25]:
# Flatten each posting's text onto one line: split on newlines, discard the
# empty pieces and re-join with single spaces.
# `meta` uses the (name, dtype) tuple form instead of `np.str`, an alias of
# the builtin `str` that has been removed from current NumPy releases.
clean_text = ddf_clean4.loc[:, 'JobText'].apply(
    lambda x: ' '.join(filter(None, x.split('\n'))),
    meta=('JobText', 'object'),
)
# Replace the raw text column with the flattened version.
ddf_clean5 = ddf_clean4.drop('JobText', axis=1)
ddf_clean6 = ddf_clean5.assign(clean_text=clean_text)

# Keep English-language postings only.
english_condition = ddf_clean6['Language'].isin(['en'])
ddf_clean7 = ddf_clean6[english_condition]

# Convert JobDate from string to datetime, replacing the original column.
dates = dd.to_datetime(ddf_clean7['JobDate'])
ddf_clean8 = ddf_clean7.drop('JobDate', axis=1)
ddf_clean9 = ddf_clean8.assign(JobDate=dates)

In [None]:
%%time

if not os.path.exists(os.path.join(path_in, 'clean')):
    os.makedirs(os.path.join(path_in, 'clean'))
(ddf_clean9
 .repartition(npartitions=50)
 .to_csv(os.path.join(path_in, 'clean/', 'data_cleaned_*.csv'), index=False)
 )

## Supervised Task

The vars below can be classified throughout the dataset.