In [1]:
import sys; 
sys.path.insert(0, '..')

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder. \
    appName("pyspark-1"). \
    getOrCreate()

### Read data

In [6]:
df = spark.read.csv("/dataset/nyc-jobs.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- Job ID: integer (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Posting Type: string (nullable = true)
 |-- # Of Positions: integer (nullable = true)
 |-- Business Title: string (nullable = true)
 |-- Civil Service Title: string (nullable = true)
 |-- Title Code No: string (nullable = true)
 |-- Level: string (nullable = true)
 |-- Job Category: string (nullable = true)
 |-- Full-Time/Part-Time indicator: string (nullable = true)
 |-- Salary Range From: double (nullable = true)
 |-- Salary Range To: double (nullable = true)
 |-- Salary Frequency: string (nullable = true)
 |-- Work Location: string (nullable = true)
 |-- Division/Work Unit: string (nullable = true)
 |-- Job Description: string (nullable = true)
 |-- Minimum Qual Requirements: string (nullable = true)
 |-- Preferred Skills: string (nullable = true)
 |-- Additional Information: string (nullable = true)
 |-- To Apply: string (nullable = true)
 |-- Hours/Shift: string (nullable = true)
 |-- Work Locat

### Sample function

In [5]:
from utils.distinct_values import get_distinct_values
get_distinct_values(df = df, column = 'Salary Frequency')

['Annual', 'Daily', 'Hourly']

### What's the number of jobs posting per category (Top 10)?

In [13]:
df.groupBy("Job Category").count().orderBy("count",ascending=False).show(10,truncate=False)

+-----------------------------------------+-----+
|Job Category                             |count|
+-----------------------------------------+-----+
|Engineering, Architecture, & Planning    |504  |
|Technology, Data & Innovation            |313  |
|Legal Affairs                            |226  |
|Public Safety, Inspections, & Enforcement|182  |
|Building Operations & Maintenance        |181  |
|Finance, Accounting, & Procurement       |169  |
|Administration & Human Resources         |134  |
|Constituent Services & Community Programs|129  |
|Health                                   |125  |
|Policy, Research & Analysis              |124  |
+-----------------------------------------+-----+
only showing top 10 rows



### What's the salary distribution per job category?

In [48]:
from pyspark.sql import functions as F

df=df.withColumn("SalaryRangeFromPerHr",F.when(F.col("Salary Frequency")=="Annual",F.col("Salary Range From")/(8*20*12))\
                                      .when(F.col("Salary Frequency")=="Daily",F.col("Salary Range From")/(8)))

df=df.withColumn("SalaryRangeToPerHr",F.when(F.col("Salary Frequency")=="Annual",F.col("Salary Range To")/(8*20*12))\
                                      .when(F.col("Salary Frequency")=="Daily",F.col("Salary Range To")/(8)))
df=df.withColumn("SalaryRangeAvg",(F.col("SalaryRangeFromPerHr")+F.col("SalaryRangeToPerHr"))/2)

In [62]:
df.groupBy("Job category").agg(F.round(F.mean("SalaryRangeFromPerHr"),2).alias("Average Salary From"),\
                               F.round(F.mean("SalaryRangeToPerHr"),2).alias("Average Salary To"))\
.orderBy("Average Salary From", ascending=False).show(truncate=True)

+--------------------+-------------------+-----------------+
|        Job category|Average Salary From|Average Salary To|
+--------------------+-------------------+-----------------+
|Administration & ...|             113.85|           113.85|
|Engineering, Arch...|             103.39|           103.39|
|Engineering, Arch...|             100.15|           104.06|
|Communications & ...|               59.9|            70.31|
|Health Policy, Re...|              59.12|            74.94|
|Constituent Servi...|              55.56|            66.22|
|Administration & ...|              54.69|             59.9|
|Administration & ...|              46.88|            52.08|
|Administration & ...|              46.88|            47.92|
|Administration & ...|              44.27|            52.08|
|Engineering, Arch...|              44.02|            55.52|
|Finance, Accounti...|               43.8|            47.06|
|Health Technology...|               43.4|            47.05|
|Administration & ...|  

### Is there any correlation between the higher degree and the salary? 

In [84]:
df.withColumn("degree details",F.substring(F.col("Minimum Qual Requirements"),1,30))\
.groupBy("degree details").agg(F.mean(F.col("SalaryRangeFromPerHr")).alias("avg"))\
.orderBy("avg",ascending=False).show(truncate=False)

+------------------------------+------------------+
|degree details                |avg               |
+------------------------------+------------------+
|1.	Bachelorâ€™s degree from an|113.84739583333334|
|"1. Baccalaureate degree from |109.15885416666667|
|The Deputy Commissioner, Publi|104.16666666666667|
|1.	Ten years or more years of |91.14583333333333 |
|â€¢ Masterâ€™s degree in busin|83.33333333333333 |
| City agencies                |83.33333333333333 |
|Possession of a valid license |82.1484375        |
|"1.	A bachelorâ€™s degree from|67.70833333333333 |
|1.	A baccalaureate degree from|66.66666666666667 |
|The successful candidate will |61.774305555555536|
|A valid license for High Press|60.84             |
|"Qualification Requirements:  |60.416666666666664|
|1.	A graduate degree from an a|57.291666666666664|
|1. A license to practice law i|57.291666666666664|
|1.  Baccalaureate degree from |53.96875          |
| all candidates must have a va|49.46927083333333 |
|Applicants 

### What's the job posting having the highest salary per agency?

In [93]:
from pyspark.sql import Window
agency_spec = Window.partitionBy("Agency").orderBy(F.desc("SalaryRangeFromPerHr"))
df.withColumn("rank",F.dense_rank().over(agency_spec)).filter(F.col("rank")==1)\
.select(F.col("Agency"),F.col("Business Title")).distinct().show(truncate=False)

+------------------------------+------------------------------------------------------------+
|Agency                        |Business Title                                              |
+------------------------------+------------------------------------------------------------+
|LANDMARKS PRESERVATION COMM   |LANDMARKS PRESERVATIONIST, PRESERVATION DEPT                |
|OFFICE OF COLLECTIVE BARGAININ|COLLEGE AIDE - CLERICAL                                     |
|FIRE DEPARTMENT               |Clinical Director for the Couseling Services Unit           |
|ADMIN FOR CHILDREN'S SVCS     |Business Analyst                                            |
|MANHATTAN COMMUNITY BOARD #8  |Community Assistant                                         |
|TAX COMMISSION                |CLERICAL ASSOCIATE                                          |
|HRA/DEPT OF SOCIAL SERVICES   |ORACLE DATABASE ADMINISTRATOR                               |
|TAXI & LIMOUSINE COMMISSION   |Executive Director, Technolo

### What's the job postings average salary per agency for the last 2 years?

In [118]:
from pyspark.sql.types import TimestampType
df=df.withColumn("job date",F.col("Posting Date").cast(TimestampType()))
df.filter(F.col("job date")>F.add_months(F.col("job date"),-24))\
.groupBy("Agency").agg(F.round(F.mean("SalaryRangeAvg"),2).alias("Average Salary in Hrs"))\
.orderBy("Average Salary in Hrs",ascending=False).show(truncate=False)

+------------------------------+---------------------+
|Agency                        |Average Salary in Hrs|
+------------------------------+---------------------+
|CONFLICTS OF INTEREST BOARD   |70.31                |
|DEPARTMENT FOR THE AGING      |56.12                |
|FIRE DEPARTMENT               |53.98                |
|DEPARTMENT OF CITY PLANNING   |50.78                |
|BUSINESS INTEGRITY COMMISSION |49.48                |
|FINANCIAL INFO SVCS AGENCY    |48.96                |
|DEPT OF INFO TECH & TELECOMM  |47.6                 |
|HUMAN RIGHTS COMMISSION       |46.48                |
|DEPT OF CITYWIDE ADMIN SVCS   |46.09                |
|DEPARTMENT OF PROBATION       |45.68                |
|MAYORS OFFICE OF CONTRACT SVCS|45.5                 |
|CONSUMER AFFAIRS              |45.37                |
|OFFICE OF THE COMPTROLLER     |44.39                |
|HOUSING PRESERVATION & DVLPMNT|44.21                |
|PRESIDENT BOROUGH OF MANHATTAN|42.97                |
|DEPT OF Y

### What are the highest paid skills in the US market?