In [3]:
import sys; 
sys.path.insert(0, '..')

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder. \
    appName("pyspark-1"). \
    getOrCreate()

### Read data

In [5]:
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True)
df.printSchema()

root
 |-- Job ID: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: string (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: string (nullable = true)
 |-- Salary Range To: string (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locatio

In [104]:
df.count()

2946

### Sample function

In [4]:
from utils.distinct_values import get_distinct_values
get_distinct_values(df = df, column= 'Salary Frequency')

ModuleNotFoundError: No module named 'pyspark'

From the printSchema output we can see that the dtypes are all string.  Its possible to use inferSchema with read_cvs but for large data sets this can take a long time since the complete dataset needs to be scanned and for each column.

Instead, we can create the schema and use that to re create df

First, lets take a look at the data

In [None]:
import utils.pre_processing_functions as PPF

In [6]:
help(PPF)

Help on module utils.pre_processing_functions in utils:

NAME
    utils.pre_processing_functions

FUNCTIONS
    get_counts_map(df: pyspark.sql.dataframe.DataFrame) -> dict
        Return dict of DataFrame df's columns and their respective non-null counts. 
        
        Can be used to determine whether there are nulls in a dataframe, i.e.:
        If the count for each coumn != df.count() there are missing values
        (count be a neat function to do this already, and will use that when/if
        I find it, but for now this function is useful) 
        
        Usage: 
        df = .....        
        counts_map=get_missing_counts(df)
        print(counts_map)
        {  'Job ID': '2946',
            'Agency': '2946',
            'Posting Type': '2946',
            '# Of Positions': '2946',
            .
            .
        }
        
        :param df: input dataframe
        :return: dict/map of column: <count>

FILE
    /utils/pre_processing_functions.py




In [None]:
counts_map = PPF.get_counts_map(df)
counts_map

In [130]:
len(df.columns)


28

In [131]:
desc = df.describe().toPandas().transpose()
df_count=df.count()
print(f"count() {df_count}")
desc[0].sort_values()

count() 2946


Recruitment Contact               1183
Post Until                        1447
Work Location 1                   1808
Hours/Shift                       1884
Residency Requirement             2268
Additional Information            2383
Posting Date                      2429
Posting Updated                   2438
Process Date                      2521
Preferred Skills                  2687
Full-Time/Part-Time indicator     2751
To Apply                          2766
Minimum Qual Requirements         2928
Job Category                      2944
Job Description                   2946
Work Location                     2946
Salary Frequency                  2946
Salary Range To                   2946
Salary Range From                 2946
Level                             2946
Title Code No                     2946
Civil Service Title               2946
Business Title                    2946
# Of Positions                    2946
Posting Type                      2946
Agency                   

In [11]:
desc.head()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Job ID,2946,384821.5631364562,53075.33897715407,132292,97899
Agency,2946,,,ADMIN FOR CHILDREN'S SVCS,TEACHERS RETIREMENT SYSTEM
Posting Type,2946,,,External,Internal
# Of Positions,2946,2.4959266802443993,9.281312826466838,1,91


In [12]:
desc.loc["summary"]

0     count
1      mean
2    stddev
3       min
4       max
Name: summary, dtype: object

In [13]:
desc, desc.columns = desc[1:], desc.loc["summary"]

In [14]:
desc

summary,count,mean,stddev,min,max
Job ID,2946,384821.5631364562,53075.33897715407,132292,97899
Agency,2946,,,ADMIN FOR CHILDREN'S SVCS,TEACHERS RETIREMENT SYSTEM
Posting Type,2946,,,External,Internal
# Of Positions,2946,2.4959266802444,9.281312826466838,1,91
Business Title,2946,,,.NET DEVELOPER,executive Vice President for Operations
Civil Service Title,2946,,,ACCOUNTANT,YOUTH COORDINATOR (YOUTH SERVI
Title Code No,2946,35558.51334552102,28141.297679769723,0527A,95841
Level,2946,1.0531400966183575,1.1403671232078134,0,M7
Job Category,2944,,,Administration & Human Resources,"Technology, Data & Innovation Social Services"
Full-Time/Part-Time indicator,2751,,,F,P


#### Job ID 

should be int and there are no missing values

* Required for data analysis:  No


In [30]:
df.select("Job ID").limit(10).show()


+------+
|Job ID|
+------+
| 87990|
| 97899|
|132292|
|132292|
|133921|
|133921|
|137433|
|138531|
|151131|
|152738|
+------+



In [31]:
df_count == counts_map["Job ID"]

True

#### Agency 

String and there are no missing values

* Required for data analysis:  Yes

>   What's the job posting having the highest salary per agency? 


In [32]:
df_count == counts_map["Agency"]

True

#### Posting Type 

Would be a categorical type as there are only 2 distict values (none missing)

* Required for data analysis:  No

In [33]:
df.groupBy("Posting Type").count().orderBy('count', ascending=False).limit(10).show()

+------------+-----+
|Posting Type|count|
+------------+-----+
|    Internal| 1684|
|    External| 1262|
+------------+-----+



#### "# Of Positions"

should int type (no missing values)

* Required for data analysis:  No

In [34]:
df_count == counts_map["# Of Positions"]

True

#### Business Title 

string type and no missing

* Required for data analysis:  No

In [35]:
df_count == counts_map["Business Title"]

True

#### Civil Service Title 

string type and no missing

* Required for data analysis:  No

In [36]:
df_count == counts_map["Civil Service Title"]

True

#### Title Code No 

String, as there are numerics mixed in with numbers.
No missing values

* Required for data analysis:  No

In [37]:
df_count == counts_map["Title Code No"]

True

In [48]:
df.select("Title Code No").show()

+-------------+
|Title Code No|
+-------------+
|        40563|
|        10009|
|        90698|
|        90698|
|        91830|
|        91830|
|        12158|
|        21822|
|        1002D|
|        10251|
|        1002C|
|        13642|
|        13642|
|        10209|
|        10209|
|        10251|
|        13611|
|        1002C|
|        10251|
|        13632|
+-------------+
only showing top 20 rows



#### Level
String type
No missing values

* Required for data analysis:  No

In [49]:
df_count == counts_map["Level"]

True

In [56]:
df.select("Level").orderBy("Level",ascending=False).limit(10).show()

+-----+
|Level|
+-----+
|   M7|
|   M7|
|   M7|
|   M7|
|   M7|
|   M7|
|   M7|
|   M7|
|   M7|
|   M7|
+-----+



In [57]:
df.select("Level").orderBy("Level",ascending=True).limit(10).show()

+-----+
|Level|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+



#### Job Category

String Type, missing 2 values:

* Required for data analysis:  YES

> What's the salary distribution per job category 
    - 3 line graph, min, max, avg - grouped by category

> What's the number of jobs posting per category (Top 10)
    - report
    - histogram

In [132]:
df_count == counts_map["Job Category"]

False

In [116]:
df.groupBy("Job Category").count().show()

+--------------------+-----+
|        Job Category|count|
+--------------------+-----+
|Administration & ...|    1|
|Health Policy, Re...|    4|
|Administration & ...|    3|
|Information Techn...|    2|
|Finance, Accounti...|    1|
|Engineering, Arch...|    8|
|Legal Affairs Pol...|    6|
|Administration & ...|    2|
|Constituent Servi...|  129|
|Building Operatio...|  181|
|Engineering, Arch...|    2|
|Constituent Servi...|    8|
|Administration & ...|    1|
|       Legal Affairs|  226|
|Engineering, Arch...|    2|
|Finance, Accounti...|    4|
|Constituent Servi...|    2|
|Administration & ...|    2|
|Health Legal Affairs|    4|
|Administration & ...|    6|
+--------------------+-----+
only showing top 20 rows



In [None]:
### there are 2 missing Job Catergory's

In [59]:
df_count - counts_map["Job Category"]

2

In [7]:
import pyspark.sql.functions as F

In [68]:
df.where(F.col('Job Category').isNull()).toPandas().transpose()

Unnamed: 0,0,1
Job ID,87990,97899
Agency,DEPARTMENT OF BUSINESS SERV.,DEPARTMENT OF BUSINESS SERV.
Posting Type,Internal,Internal
# Of Positions,1,1
Business Title,Account Manager,"EXECUTIVE DIRECTOR, BUSINESS DEVELOPMENT"
Civil Service Title,CONTRACT REVIEWER (OFFICE OF L,ADMINISTRATIVE BUSINESS PROMOT
Title Code No,40563,10009
Level,1,M3
Job Category,,
Full-Time/Part-Time indicator,,F


- There are only 2 records with null Job Category.  I'll update these to "not specified"

#### Full-Time/Part-Time indicator

String with missing values

* Required for data analysis:  No


In [71]:
df.groupBy("Full-Time/Part-Time indicator").count().show()

+-----------------------------+-----+
|Full-Time/Part-Time indicator|count|
+-----------------------------+-----+
|                            F| 2625|
|                         null|  195|
|                            P|  126|
+-----------------------------+-----+



- There are 195 records with null.  I'll update these to "not specified" if required for reporting

#### Salary Range From

* Required for data analysis:  YES

> mutilple salary related questions to answer

Should be numeric, lets look at the data:

In [80]:
df_count - counts_map["Salary Range From"]

0

- check that all columns are int - actually select those that cannot be cast to int.

In [18]:
df.select("Salary Range From").where(F.col("Salary Range From").cast('float').isNull()).show()

+-----------------+
|Salary Range From|
+-----------------+
+-----------------+



In [82]:
desc.loc["Salary Range From"]

summary
count                   2946
mean      58904.139793856084
stddev    26986.575935791352
min                        0
max                    99353
Name: Salary Range From, dtype: object

- I will set to numeric

#### Salary Range To

Should be numeric, lets look at the data:


* Required for data analysis:  YES

> mutilple salary related questions to answer

In [84]:
df_count - counts_map["Salary Range To"]

0

In [19]:
df.select("Salary Range To").where(F.col("Salary Range To").cast('float').isNull()).show()

+---------------+
|Salary Range To|
+---------------+
+---------------+



In [86]:
desc.loc["Salary Range To"]

summary
count                  2946
mean      85535.71162739307
stddev    42871.31345366744
min                   10.36
max                   99406
Name: Salary Range To, dtype: object

* I will set to numeric

#### Salary Frequency

In [87]:
df.groupBy("Salary Frequency").count().limit(10).show()

+----------------+-----+
|Salary Frequency|count|
+----------------+-----+
|           Daily|   39|
|          Annual| 2712|
|          Hourly|  195|
+----------------+-----+



- Function required to put salary columns on the same frequencey scale

create function to create new columns: "Freq Adjusted Salary Range From" and "Freq Adjusted Salary Range To"

#### Work Location

String

* Required for data analysis:  Yes

#### Division/Work Unit

* Required for data analysis:  No

#### Job Description

* Required for data analysis:  No

#### Minimum Qual Requirements

String

* Required for data analysis:  YES

> mutilple salary related questions to answer

Data will need to be processed as this text field looks to be free txt as apposed to a selection from a drop-down combo list.  That means there could be typos, abbreviations etc etc.

In [118]:
df.select("Minimum Qual Requirements").limit(20).show()

+-------------------------+
|Minimum Qual Requirements|
+-------------------------+
|     "1.	A baccalaurea...|
|     "1. A baccalaurea...|
|     "1. Three years o...|
|     "1. Three years o...|
|     1. Five years of ...|
|     1. Five years of ...|
|     "1. A baccalaurea...|
|     Qualification Req...|
|     "1. A master's de...|
|     Qualification Req...|
|     "1. A baccalaurea...|
|     "1. A baccalaurea...|
|     "1. A baccalaurea...|
|     For Assignment Le...|
|     For Assignment Le...|
|     Qualification Req...|
|     "Qualification Re...|
|     "1. A baccalaurea...|
|     Qualification Req...|
|     "(1) A baccalaure...|
+-------------------------+



In [121]:
df.groupBy("Minimum Qual Requirements").count().limit(10).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### Preferred Skills

* Required for data analysis:  No


#### Additional Information

* Required for data analysis:  No


#### To Apply

* Required for data analysis:  No


#### Hours/Shift

* Required for data analysis:  Yes / maybe.
    
May be needed to calculate salary in the case of hourly or monthly paid emplyees.  Need to check the data to find out


In [123]:
df.select(["Hours/Shift","Salary Frequency"]).show()

+--------------------+----------------+
|         Hours/Shift|Salary Frequency|
+--------------------+----------------+
|                null|          Annual|
|                null|          Annual|
| all candidates m...|          Annual|
| all candidates m...|          Annual|
|                null|          Hourly|
|                null|          Hourly|
| at least one yea...|          Annual|
|35 Hours per week...|          Annual|
| as described in ...|          Annual|
|Monday through Fr...|          Annual|
| full-time progre...|          Annual|
|"To apply please ...|          Annual|
|"To apply please ...|          Annual|
|Hours:  17 hours ...|          Hourly|
|Hours:  17 hours ...|          Hourly|
|                null|          Annual|
| or ""3"" above. ...|          Annual|
| full-time progre...|          Annual|
|                null|          Annual|
| you must explain...|          Annual|
+--------------------+----------------+
only showing top 20 rows



In [142]:
df.select(["Hours/Shift","Salary Frequency"]).where(F.col('Salary Frequency') == 'Hourly').toPandas()

Unnamed: 0,Hours/Shift,Salary Frequency
0,,Hourly
1,,Hourly
2,Hours: 17 hours per week when school is in se...,Hourly
3,Hours: 17 hours per week when school is in se...,Hourly
4,,Hourly
...,...,...
190,Up to 17 hours/week while school is in session...,Hourly
191,Up to 17 hours/week while school is in session...,Hourly
192,,Hourly
193,,Hourly


In [141]:
df.select(["Hours/Shift","Salary Frequency"]).where(F.col('Salary Frequency') == 'Daily').toPandas()

Unnamed: 0,Hours/Shift,Salary Frequency
0,,Daily
1,,Daily
2,the U.S. Department of Labor or any apprentic...,Daily
3,35 Hours / Shift To Be Determined,Daily
4,35 Hours / Shift To Be Determined,Daily
5,,Daily
6,with a major in Water Quality Monitoring,Daily
7,with a major in Water Quality Monitoring,Daily
8,40 hours per week / rotating shifts,Daily
9,40 hours per week / rotating shifts,Daily


* _tricky_ function required here.  Need to decide how to rationalise the salary based upon payment frequency.   Its not going to be correct to assume that hourly paid roles are going to do 40 hrs / week.  

I will instead, use the statistics I've found in the following link and work out the salary as follows:

[Average Working Hours \(Statistical Data 2021\)](https://clockify.me/working-hours)

- for Annually paid roles, I'll calculate an hourly rate base upon USA avg hrs / year: 1757, e.g. $100k -> $56 per hour
- for Daily paid roles, I'll calculate an hourly rate base upon 8 hrs / day
- for Hourly paid roles, I'll use the raw data.

This may be me skewed results, for example there _may_ be a role that demands only 5 hours a week but its very well paid.  The employee's real annual wage would be extremely low, but in my calculation this role would be relatively well paid.  I will have to look at the data after applying the proposed formula above.




* Also, there are many hourly and daily paid jobs where the number of hours are not specified in "Hours/Shift"

Maybe these columns have that info:

- Job Description

- Additional Information

But we can see from the samples below that 


In [150]:
df.select(["Hours/Shift","Salary Frequency","Job Description","Additional Information"]).\
    where(F.col('Salary Frequency') == 'Hourly').\
    where(F.col('Hours/Shift').isNull()).toPandas()

Unnamed: 0,Hours/Shift,Salary Frequency,Job Description,Additional Information
0,,Hourly,Responsibilities of selected candidates will i...,SPECIAL NOTE: 1. This is a temporary assig...
1,,Hourly,Responsibilities of selected candidates will i...,SPECIAL NOTE: 1. This is a temporary assig...
2,,Hourly,** 30- 35 Hours Part-time The Office of S...,"Must follow all safety, security, Blood-borne ..."
3,,Hourly,** 30- 35 Hours Part-time The Office of S...,"Must follow all safety, security, Blood-borne ..."
4,,Hourly,**30-35 hours/week -Part-time OPEN TO PERMANE...,"Must follow all safety, security, Blood-borne ..."
...,...,...,...,...
136,,Hourly,The New York City Department of Correction (DO...,
137,,Hourly,The New York City Department of Correction (DO...,
138,,Hourly,"NYC Parks is the steward of over 30,000 acres ...","Approximate start date: May 15, 2020. Positio..."
139,,Hourly,"NYC Parks is the steward of over 30,000 acres ...","Approximate start date: May 15, 2020. Positio..."


In [156]:
df.select(["Hours/Shift","Salary Frequency","Job Description","Additional Information"]).\
    where(F.col('Salary Frequency') == 'Hourly').\
    where(F.col('Hours/Shift').isNull()).toPandas()

Unnamed: 0,Hours/Shift,Salary Frequency,Job Description,Additional Information
0,,Hourly,Responsibilities of selected candidates will i...,SPECIAL NOTE: 1. This is a temporary assig...
1,,Hourly,Responsibilities of selected candidates will i...,SPECIAL NOTE: 1. This is a temporary assig...
2,,Hourly,** 30- 35 Hours Part-time The Office of S...,"Must follow all safety, security, Blood-borne ..."
3,,Hourly,** 30- 35 Hours Part-time The Office of S...,"Must follow all safety, security, Blood-borne ..."
4,,Hourly,**30-35 hours/week -Part-time OPEN TO PERMANE...,"Must follow all safety, security, Blood-borne ..."
...,...,...,...,...
121,,Hourly,The New York City Department of Correction (DO...,
122,,Hourly,The New York City Department of Correction (DO...,
123,,Hourly,"NYC Parks is the steward of over 30,000 acres ...","Approximate start date: May 15, 2020. Positio..."
124,,Hourly,"NYC Parks is the steward of over 30,000 acres ...","Approximate start date: May 15, 2020. Positio..."


In [172]:
df.select(["Hours/Shift","Salary Frequency","Job Description","Additional Information"]).\
    where(F.col('Salary Frequency') == 'Hourly').\
    where(F.col('Hours/Shift').isNull()).count()

126

- look for Job Descriptions that contain specification of the number of hours to work:

Below I have limited the rows to just one to see where I am getting the regex match:

>

The mission of the New York City Police Department is to enhance the quality of life in New York City by working in partnership with the community to enforce the law, preserve peace, protect the people, reduce fear, and maintain order. The NYPD strives to foster a safe and fair city by incorporating Neighborhood Policing into all facets of Department operations, and solve the problems that create crime and disorder through an interdependent relationship between the people and its police, and by pioneering strategic innovation.  The Facilities Management Division, Building Maintenance Section manages the physical operation maintenance and repair of department facilities. The Building Maintenance Section is seeking a Sheet Metal Worker who will responsible for the following:  - Fabricate, erect and repair sheet metal structures such as ducts, metal ceilings, dampers, louvers and roofs;  - Spot welds solder and sweat all forms of sheet metal;  - Develop patterns and templates in fabricating complex shapes and forms.|

And found that my regex is poor at finding what I'm attemping to find, e.g.

- 25 hours
- 40 hrs

etc

Here I've matched the hr in through - but further on I have found that even with this false match, there are many records where we cant use these columns.


In [180]:
df.select(["Job Description"]).\
    where(F.col('Salary Frequency') == 'Daily').\
    where(F.col('Hours/Shift').isNull()).\
    where(F.col("Job Description").rlike("(?i)^.*?hour|hr.*?$")).limit(1).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

- look for records where neither Job Descriptions nor Additional Information contain specification of the number of hours to work.  

In [177]:
df.select(["Job Description","Additional Information"]).\
    where(F.col('Salary Frequency') == 'Daily').\
    where(F.col('Hours/Shift').isNull()).\
    where(~F.col("Job Description").rlike("(?i)^.*?hour|hr.*?$")).limit(10).\
    where(~F.col("Additional Information").rlike("(?i)^.*?hour|hr.*?$")).count()

5

- so only 5 rows:  It may be possible to get the total number of hours required for all of the others.   Lets take a look at the records more closely nd ssee its its going to be feasible to "grep" out the hours

In [174]:
df.select(["Job Description","Additional Information"]).\
    where(F.col('Salary Frequency') == 'Daily').\
    where(F.col('Hours/Shift').isNull()).\
    where(F.col("Job Description").rlike("(?i)^.*?hour|hr.*?$") | F.col("Additional Information").rlike("(?i)^.*?hour|hr.*?$")).\
    count()

7

##### Conclusion:  It does not seem possible to find the number of hours required for over 100 of the Hourly paid jobs.   Therefore I am to go with my origonal suggested solution of working out all jobs' salaries in Hours.

#### Work Location 1

* Required for data analysis:  No
    

#### Recruitment Contact

* Required for data analysis:  No


#### Residency Requirement

* Required for data analysis:  No


#### Posting Date

* Required for data analysis:  Yes
    
>  What's the job postings average salary per agency for the last 2 years? 

* Are all dates in the correct format to be cast to date?

These Posting Date values are not null but they also cannot be cast to date:

In [27]:
df.select(["Posting Date"]).\
    where(F.col("Posting Date").cast('date').isNull() & ~F.col("Posting Date").isNull()).count()

1049

In [25]:
df.select(["Posting Date"]).\
    where(F.col("Posting Date").cast('date').isNull() & ~F.col("Posting Date").isNull()).show(30)

+--------------------+
|        Posting Date|
+--------------------+
|New York City res...|
|Apply online with...|
| ""2"" or ""3"" a...|
|New York City Res...|
|New York City Res...|
|  mid-range computer|
| ""2"" or ""3"" a...|
|           help desk|
|The successful ca...|
| all candidates m...|
| ""2"" or ""3"" a...|
|To Apply For Inte...|
| or at least one ...|
| or at least one ...|
| ""2"" or ""3"" a...|
|New York City Res...|
|New York City Res...|
| one year of whic...|
| one year of whic...|
|The successful ca...|
|Ability to code ....|
|Ability to code ....|
|New York City res...|
|New York City res...|
|New York City res...|
|New York City res...|
| all candidates m...|
| all candidates m...|
|New York City res...|
|New York City res...|
+--------------------+
only showing top 30 rows



The following cant simple be cast to date:

In [22]:
df.select(["Posting Date"]).where(F.col("Posting Date").cast('date').isNull()).count()

1566

In [20]:
df.where(F.col("Posting Date").cast('date').isNull()).limit(5).toPandas().transpose()

Unnamed: 0,0,1,2,3,4
Job ID,87990,132292,132292,137433,151131
Agency,DEPARTMENT OF BUSINESS SERV.,NYC HOUSING AUTHORITY,NYC HOUSING AUTHORITY,DEPT OF HEALTH/MENTAL HYGIENE,NYC HOUSING AUTHORITY
Posting Type,Internal,External,Internal,Internal,External
# Of Positions,1,52,52,1,1
Business Title,Account Manager,Maintenance Worker - Technical Services-Heatin...,Maintenance Worker - Technical Services-Heatin...,Contract Analyst,Cost Estimating Manager
Civil Service Title,CONTRACT REVIEWER (OFFICE OF L,MAINTENANCE WORKER,MAINTENANCE WORKER,PROCUREMENT ANALYST,ADMINISTRATIVE STAFF ANALYST (
Title Code No,40563,90698,90698,12158,1002D
Level,1,0,0,3,0
Job Category,,Maintenance & Operations,Maintenance & Operations,"Finance, Accounting, & Procurement","Engineering, Architecture, & Planning"
Full-Time/Part-Time indicator,,F,F,F,F


In [10]:
df.select(["Posting Date"]).show()

+--------------------+
|        Posting Date|
+--------------------+
|New York City res...|
|2012-01-26T00:00:...|
|                null|
|                null|
|2014-01-09T00:00:...|
|2014-01-09T00:00:...|
|Apply online with...|
|2013-12-20T00:00:...|
|                null|
|2014-06-26T00:00:...|
| ""2"" or ""3"" a...|
|New York City Res...|
|New York City Res...|
|2014-10-09T00:00:...|
|2014-10-09T00:00:...|
|2014-10-08T00:00:...|
|  mid-range computer|
| ""2"" or ""3"" a...|
|2014-11-18T00:00:...|
|           help desk|
+--------------------+
only showing top 20 rows



There are 1566 records that are either null or cant be cast to a date. If I cast to date, all values for corresponding rows where data is not valid will all be null and I dont want that.   For the one query regarding Posting Date I will only include the rows that have valid dates, just for that query.  

With so many records "invalid" it does not make sense to impute a date

#### Post Until

* Required for data analysis:  No


#### Posting Updated

* Required for data analysis:  No


#### Process Date

* Required for data analysis:  No


In [None]:
df.groupBy('Job Category').count().orderBy('count', ascending=False).limit(10).show()

In [None]:
df.groupBy('Job Category').max('')

In [None]:
df.select("Agency").distinct().count()

In [None]:
df.groupBy('Agency').count().orderBy('count', ascending=False).show()