# 06 Getting a New Dataset

In [2]:
import dask, dask.dataframe as dd, dask.array as da
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt
import pandas as pd
import re, csv, os
import numpy as np
from dask import delayed, persist
from dask.distributed import Client
from glob import glob

# Raise the csv module's per-field size cap to 10 million characters
# (presumably so the python-engine CSV parsing used later can cope with very
# long free-text fields -- confirm), and let pandas display every column of
# wide frames instead of truncating them.
csv.field_size_limit(10_000_000)
pd.set_option('display.max_columns', None)

%matplotlib inline

In [2]:
# Start a local dask.distributed cluster: 2 workers with 3 threads each and a
# 3GB memory limit (the displayed summary reports 6 GB total, so the limit
# applies per worker). Leaving `client` as the last expression renders the
# scheduler / dashboard summary.
client = Client(n_workers=2, threads_per_worker=3, memory_limit='3GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:59146  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 6  Memory: 6.00 GB


In [3]:
# Smoke test for the cluster: a lazy 10,000 x 10,000 array of uniform random
# floats, split into chunks of at most roughly 16 MiB each.
x = da.random.random((10_000, 10_000), chunks='16 MiB')
x

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,12.50 MB
Shape,"(10000, 10000)","(1250, 1250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 12.50 MB Shape (10000, 10000) (1250, 1250) Count 64 Tasks 64 Chunks Type float64 numpy.ndarray",10000  10000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,12.50 MB
Shape,"(10000, 10000)","(1250, 1250)"
Count,64 Tasks,64 Chunks
Type,float64,numpy.ndarray


In [4]:
%%time

y = (x + x.T) - x.mean(axis=0)
y.sum().compute()

CPU times: user 420 ms, sys: 64.1 ms, total: 484 ms
Wall time: 1.41 s


49998710.332450785

In [5]:
# Drop the smoke-test arrays; they are not needed for the rest of the notebook.
del x, y

In [6]:
# Number of partitions (and therefore output files) to write at the end.
partitions_out = 12
# partitions_in = 1000

# Folder holding the cleaned input CSV files; output is written beneath it.
path = '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean'

In [8]:
# Columns to keep when reading the raw files (passed to read_csv as usecols).
col_names = [
    'JobID', 'CleanJobTitle', 'CanonCity', 'CanonState',
    'Source', 'Latitude', 'Longitude', 'CanonJobTitle',
    'CanonCounty', 'MSA', 'LMA', 'InternshipFlag',
    'ConsolidatedONET', 'CanonSkillClusters', 'CanonSkills',
    'CanonMinimumDegree', 'CanonRequiredDegrees', 'MinExperience',
    'ConsolidatedInferredNAICS', 'BGTOcc', 'YearsOfExperience',
    'CanonJobHours', 'CanonJobType', 'CanonPostalCode',
    'CanonYearsOfExperienceCanonLevel', 'CanonYearsOfExperienceLevel',
    'ConsolidatedTitle', 'Language', 'BGTSubOcc',
    'ConsolidatedDegreeLevels', 'MinDegreeLevel', 'EmployerClean',
    'clean_text', 'JobDate',
]

In [10]:
# Explicit column dtypes for read_csv. Everything is read as text except the
# two coordinates (float32) and the internship flag (bool).
#
# The builtin `str` is used instead of `np.str`: `np.str` was only an alias
# for the builtin, was deprecated in NumPy 1.20 and removed in NumPy 1.24, so
# referring to it raises AttributeError on current NumPy.
#
# Note: 'EmployerClean' and 'clean_text' appear in col_names but are not
# listed here, so their dtype is left to read_csv's inference.
dtypes = {
    'CanonSkills': str, 'Latitude': np.float32, 'JobID': str, 'CanonJobTitle': str,
    'CanonYearsOfExperienceLevel': str, 'Longitude': np.float32, 'CanonJobType': str,
    'CleanJobTitle': str, 'ConsolidatedInferredNAICS': str, 'CanonRequiredDegrees': str,
    'YearsOfExperience': str, 'CanonCity': str, 'CanonCounty': str, 'CanonJobHours': str,
    'CanonState': str, 'ConsolidatedONET': str, 'MSA': str, 'CanonMinimumDegree': str,
    'ConsolidatedDegreeLevels': str, 'BGTSubOcc': str, 'ConsolidatedTitle': str,
    'CanonSkillClusters': str, 'Language': str, 'JobDate': str,
    'MinDegreeLevel': str, 'LMA': str, 'MinExperience': str, 'CanonPostalCode': str,
    'InternshipFlag': np.bool_, 'Source': str, 'BGTOcc': str,
    'CanonYearsOfExperienceCanonLevel': str,
}

In [11]:
# pandas 1.3 replaced `error_bad_lines` with `on_bad_lines`, and pandas 2.0
# removed the old keyword entirely. Pick whichever spelling the installed
# pandas understands; both settings skip malformed rows and emit a warning
# for each one.
pandas_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if pandas_major_minor >= (1, 3):
    bad_line_kwargs = {'on_bad_lines': 'warn'}
else:
    bad_line_kwargs = {'error_bad_lines': False}

# Lazily read every file in `path` matching da*.csv, keeping only the columns
# in col_names. Extra keyword arguments are forwarded to pandas.read_csv.
# Alternatives tried earlier: sample=250_000_000, blocksize="250MB".
ddf = dd.read_csv(os.path.join(path, 'da*.csv'),
                  engine='python',
                  dtype=dtypes,
                  assume_missing=True,   # treat unspecified integer columns as floats
                  blocksize=None,        # do not split files: one partition per file
                  usecols=col_names,
                  **bad_line_kwargs
                  )
ddf

Unnamed: 0_level_0,JobID,CleanJobTitle,CanonCity,CanonState,Source,Latitude,Longitude,CanonJobTitle,CanonCounty,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,CanonMinimumDegree,CanonRequiredDegrees,MinExperience,ConsolidatedInferredNAICS,BGTOcc,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MinDegreeLevel,EmployerClean,clean_text,JobDate
npartitions=25,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
,object,object,object,object,object,float32,float32,object,object,object,object,bool,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
%%time

ddf.tail()

CPU times: user 206 ms, sys: 22 ms, total: 228 ms
Wall time: 3.85 s


Unnamed: 0,JobID,CleanJobTitle,CanonCity,CanonState,Source,Latitude,Longitude,CanonJobTitle,CanonCounty,MSA,LMA,InternshipFlag,ConsolidatedONET,CanonSkillClusters,CanonSkills,CanonMinimumDegree,CanonRequiredDegrees,MinExperience,ConsolidatedInferredNAICS,BGTOcc,YearsOfExperience,CanonJobHours,CanonJobType,CanonPostalCode,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedTitle,Language,BGTSubOcc,ConsolidatedDegreeLevels,MinDegreeLevel,EmployerClean,clean_text,JobDate
31768,38550738893,Medical Techn,Paterson,NJ,Job Board,40.913799,-74.1726,Unknown,Passaic,35620: Metropolitan Statistical Area|408: Comb...,DV363564|MT363562,False,29209900,Health Care: Medical Research;Specialized Skil...,{'Medical Technology': 'Health Care: Medical R...,Bachelor's,Bachelor's,3,622110,29-2099.00,one year,fulltime,permanent,7501,1-6,mid,Medical Techn,en,Health Technician / Technologist (Other),16,16,St Josephs Healthcare System,Job Information St Josephs Healthcare System M...,2019-06-24
31769,38550738936,Accountant Bookkeeper - Chinese/English Biling...,Irvine,CA,Job Board,33.7425,-117.747002,Bookkeeper,Orange,31080: Metropolitan Statistical Area|348: Comb...,DV064204|MT063110,False,43303100,Finance: General Accounting;Specialized Skills...,{'Accounting': 'Finance: General Accounting;Sp...,Unknown,Unknown,Unknown,Unknown,43-3031.00,Unknown,fulltime,permanent,92602,Unknown,Unknown,Bookkeeper,en,Bookkeeper,Unknown,Unknown,"Pacific Surrogacy Usa, Llc",Posted on Accountant Bookkeeper - Chinese/Engl...,2019-06-24
31770,38550738945,Tool Rental,Ventura,CA,Job Board,34.293098,-119.293999,Unknown,Ventura,37100: Metropolitan Statistical Area,MT063710,False,41202100,Manufacturing and Production: Machinery;Specia...,{'Machinery': 'Manufacturing and Production: M...,Higher Secondary Certificate,Unknown,Unknown,2382,41-2021.00,0-2 years,Unknown,Unknown,93001,0,zero,Tool Rental,en,Rental Clerk,12,12,HD Supply,"Tool Rental HD Supply Support Services, Inc. i...",2019-06-24
31771,38550738950,"Retail Sales Representative , District",Tampa,FL,Job Board,27.9827,-82.340202,Retail Sales Representative,Hillsborough,45300: Metropolitan Statistical Area,MT124530,False,41203100,Administration: Administrative Support;Special...,{'Administrative Functions': 'Administration: ...,Higher Secondary Certificate,General Equivalency Degree|Higher Secondary Ce...,Unknown,311351,41-2031.00,3 or more years,fulltime,permanent,33601,Unknown,Unknown,Retail Sales Representative,en,Retail Sales Representative,16|12,12,Hershey Company,Job Information The Hershey Company Retail Sal...,2019-06-24
31772,38550738959,Warehouse Attendant,Sumner,WA,Job Board,47.162102,-122.241997,Unknown,Pierce,42660: Metropolitan Statistical Area|500: Comb...,DV534510|MT534266,False,53706200,Analysis: Mathematics;Specialized Skills|Analy...,{'Algebra': 'Analysis: Mathematics;Specialized...,Unknown,Unknown,1,Unknown,53-7062.00,One year|three to six months,fulltime,permanent,98352,0-1,low,Warehouse Attendant,en,Laborer / Warehouse Worker,Unknown,Unknown,Water Weights Incorporated,Posted on Warehouse Attendant Water Weights In...,2019-06-24


In [3]:
# Distinct employer names from the hierarchy-merge file, as a plain list.
# Only the 'CanonEmployer' column is loaded. Built in a single expression so
# the name is never bound to an intermediate DataFrame.
df_comps_list = list(
    pd.read_csv('bg_glass_hiearchy_merge.csv', usecols=['CanonEmployer'])
    ['CanonEmployer']
    .unique()
)
df_comps_list[:10], len(df_comps_list)

(['H5', 'Yu', 'R3', 'G5', 'Hcp', 'Square', 'Argos', 'Hgi', 'Dt', 'Cps'], 76041)

In [17]:
%%time

comps_condition = ddf['EmployerClean'].isin(df_comps_list)
ddf0 = ddf[comps_condition]
len(ddf), len(ddf0)

CPU times: user 21.6 s, sys: 4.33 s, total: 25.9 s
Wall time: 5min 30s


(918251, 209046)

In [16]:
%%time

# using the same folder in your path, we will create a new one for the cleaned data
# and save our new files there
if not os.path.exists(os.path.join(path, 'filtered_comps')):
    os.makedirs(os.path.join(path, 'filtered_comps'))
    

# the following lines of code will take the last dataset, repartition it,
# and save it to the desired location. Notice the wildcard "*" below. That is
# the spot Dask will use to number your files starting from 0
(ddf0
 .repartition(npartitions=partitions_out)
 .to_csv(os.path.join(path, 'filtered_comps/', 'data_filtered_*.csv'), index=False)
 )

CPU times: user 13.3 s, sys: 2.54 s, total: 15.9 s
Wall time: 3min 26s


['/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_00.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_01.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_02.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_03.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_04.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_05.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_06.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_07.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_08.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_09.csv',
 '/Volumes/LaCie SSD/bgdata/data_19/some_data/clean/filtered_comps/data_filtered_10.csv',
 '/Volumes