# Clustering Test

In [1]:
import pandas as pd
import numpy as np
import concurrent.futures as cf
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint


%matplotlib inline

In [2]:
%%time

df = pd.read_csv('/Volumes/LaCie SSD/bgdata/random_data/all_years_rand_samp.csv', parse_dates=['JobDate'], low_memory=False)
df.head()

CPU times: user 15.6 s, sys: 4.6 s, total: 20.2 s
Wall time: 21.3 s


Unnamed: 0,Language,CanonCity,CanonState,CleanJobTitle,JobDate,JobText,JobID,Latitude,Longitude,CanonPostalCode,...,CanonJobHours,CanonJobType,CanonYearsOfExperienceCanonLevel,CanonYearsOfExperienceLevel,ConsolidatedDegreeLevels,ConsolidatedTitle,MaxDegreeLevel,MinDegreeLevel,BGTSubOcc,YearsOfExperience
0,en,Irvine,CA,Pt Faculty Pool - Recycling & Zero Waste,2013-01-03,Posting Details South Orange County Community ...,110103522,33.6881,-117.788,92604,...,fulltime,permanent,1-6,mid,16|18,Pt Faculty Pool - Recycling & Zero Waste,18.0,16.0,College Professor / Instructor,2) years|6) years|Two years
1,en,Marbury,MD,Class A Cdl Truck Driver,2013-11-20,Experienced Class A CDL Truck Drivers (20658),255851194,38.5624,-77.1639,20658,...,,,,,,Class A CDL Truck Driver,,,Tractor-Trailer Truck Driver (General),
2,en,Auburn,AL,Apartment Community Maintenance Technician - S...,2013-09-06,"Posted: 2013-09-05, 4:16PM EDT\n\nApartment Co...",246911326,32.5824,-85.5126,36830,...,,,1-6,mid,12,Maintenance Technician,,12.0,Building and General Maintenance Technician,minimum of six months|two years
3,en,Anchorage,AK,Registered Nurse - All Specialties And Shifts,2013-08-30,Description:\n* Registered Nurses - All Spec...,244857419,61.2157,-149.869,99501,...,,,,,,Registered Nurse,,,Registered Nurse,
4,en,Madison,WI,Project Manager,2013-11-30,"Apex Systems, Inc Skills: Project ma...",420796045,43.1153,-89.5249,53562,...,fulltime,permanent,6+,high,,Project Manager,,,IT Project Manager,7 years|5 years|3 years


# Data Inspection & Cleaning

In [3]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476428 entries, 0 to 476427
Data columns (total 43 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   Language                          476428 non-null  object        
 1   CanonCity                         466244 non-null  object        
 2   CanonState                        476394 non-null  object        
 3   CleanJobTitle                     476269 non-null  object        
 4   JobDate                           476428 non-null  datetime64[ns]
 5   JobText                           473483 non-null  object        
 6   JobID                             476428 non-null  int64         
 7   Latitude                          466253 non-null  float64       
 8   Longitude                         466253 non-null  float64       
 9   CanonPostalCode                   465000 non-null  object        
 10  CanonCounty                     

In [4]:
df.shape

(476428, 43)

In [32]:
missing_pct = df.isna().sum() / df.shape[0] * 100
missing_pct

Language                             0.000000
CanonCity                            2.137574
CanonState                           0.007136
CleanJobTitle                        0.033373
JobDate                              0.000000
JobText                              0.618142
JobID                                0.000000
Latitude                             2.135685
Longitude                            2.135685
CanonPostalCode                      2.398684
CanonCounty                          2.159403
DivisionCode                        68.090666
LMA                                  2.479073
MSA                                  4.781625
CanonEmployer                       22.104914
CanonJobTitle                       35.178453
ConsolidatedONET                     3.593618
CanonIntermediary                   85.666040
InternshipFlag                       0.000000
Source                               3.168789
CanonSkillClusters                   8.660700
CanonSkills                       

In [34]:
to_drop_cols = (missing_pct[missing_pct > 75]).index
to_drop_cols

Index(['CanonIntermediary', 'CIPCode', 'MaxAnnualSalary', 'MaxHourlySalary',
       'MinAnnualSalary', 'MinHourlySalary', 'MaxDegreeLevel'],
      dtype='object')

In [36]:
df.drop(to_drop_cols, axis=1, inplace=True)
df.shape

(476428, 37)

In [9]:
df.loc[(df['CanonEmployer'].isna()) & (df['CanonIntermediary'].notna()), :].shape

(68284, 43)

In [13]:
df.loc[(df['CanonEmployer'].isna()) & (df['CanonIntermediary'].notna()), :]['CanonEmployer'].head()

4     NaN
10    NaN
12    NaN
20    NaN
31    NaN
Name: CanonEmployer, dtype: object

In [14]:
df.loc[(df['CanonEmployer'].isna()) & (df['CanonIntermediary'].notna()), :]['CanonIntermediary'].head()

4            Apex Systems, Inc.
10    Nigel Frank International
12               Kelly Services
20                       Elance
31                     Randstad
Name: CanonIntermediary, dtype: object

In [10]:
df.shape[0] * .15

71464.2

In [18]:
df.loc[(df['CanonEmployer'].isna()) & (df['CanonIntermediary'].notna()), 'CanonEmployer'] = 'Recruiting Agency'

In [21]:
df['CanonEmployer'].isna().head()

0    False
1    False
2     True
3    False
4    False
Name: CanonEmployer, dtype: bool

In [23]:
len(df['CanonEmployer'].unique())

75805

In [24]:
len(df['CleanJobTitle'].unique())

271225

In [25]:
len(df['CleanJobTitle'].unique()) / df.shape[0] * 100

56.92885388768083

In [29]:
(df['JobDate'].dt.year.unique())

array([2013, 2007, 2020, 2017, 2019, 2011, 2018, 2010, 2016, 2012, 2015,
       2014])

In [30]:
df['year'] = df['JobDate'].dt.year

In [31]:
df.groupby('year')['year'].count()

year
2007    47695
2010    40487
2011    50144
2012    49926
2013    25631
2014    33690
2015    40945
2016    43376
2017    39534
2018    35000
2019    35000
2020    35000
Name: year, dtype: int64