In [1]:
import os
import sys
os.environ["SPARK_HOME"]="/usr/hdp/current/spark2-client"
os.environ["PYLIB"]=os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0,os.environ["PYLIB"] + "/py4j-0.10.4-src.zip")
sys.path.insert(0,os.environ["PYLIB"] + "/pyspark.zip")

In [2]:
#create SparkContext ,Sparksession
from os.path import expanduser,join,abspath
from pyspark.sql import SparkSession
from pyspark.sql import Row
#warehouse location points to the default location for managed databases and tables.
warehouse_location = 'hdfs:///apps/hive/warehouse/'

spark = SparkSession.builder.appName("Python Spark SQL Hive integration example").config("spark.sql.warehouse.dir",warehouse_location).enableHiveSupport().getOrCreate()

In [3]:
spark

In [93]:
sc = spark.sparkContext
#Read data and create a dataframe
data = spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .option("mode", "DROPMALFORMED")\
        .load("file:///home/2075B43/Task3/SQL_dataset.csv")


In [94]:
data.show(5)

+------+-------------------+--------------------+--------------------+--------------------+------------------+---------------+----+--------------------+-----------+----------+
|ROW_ID|        CASE_STATUS|       EMPLOYER_NAME|            SOC_NAME|           JOB_TITLE|FULL_TIME_POSITION|PREVAILING_WAGE|YEAR|            WORKSITE|        lon|       lat|
+------+-------------------+--------------------+--------------------+--------------------+------------------+---------------+----+--------------------+-----------+----------+
|     1|CERTIFIED-WITHDRAWN|UNIVERSITY OF MIC...|BIOCHEMISTS AND B...|POSTDOCTORAL RESE...|                 N|          36067|2016| ANN ARBOR, MICHIGAN|-83.7430378|42.2808256|
|     2|CERTIFIED-WITHDRAWN|GOODMAN NETWORKS,...|    CHIEF EXECUTIVES|CHIEF OPERATING O...|                 Y|         242674|2016|        PLANO, TEXAS|-96.6988856|33.0198431|
|     3|CERTIFIED-WITHDRAWN|PORT S AMERICA GR...|    CHIEF EXECUTIVES|CHIEF PROCESS OFF...|                 Y|         1

In [95]:
#Verify the total rows and columns
## To Count the number of rows in DataFrame
print('Total records count is {}'.format(data.count()))
## Columns count and column names
print("Total Columns count is {}".format(len(data.columns)))

Total records count is 3002425
Total Columns count is 11


In [77]:
#Verify the total rows and columns
## To Count the number of rows in DataFrame
print('Total records count is {}'.format(data.count()))
## Columns count and column names
print("Total Columns count is {}".format(len(data.columns)))

Total records count is 3002458
Total Columns count is 11


In [96]:
data.describe().show()

+-------+------------------+-----------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------------+------------------+------------------+
|summary|            ROW_ID|CASE_STATUS|       EMPLOYER_NAME|            SOC_NAME|           JOB_TITLE|FULL_TIME_POSITION|   PREVAILING_WAGE|              YEAR|            WORKSITE|               lon|               lat|
+-------+------------------+-----------+--------------------+--------------------+--------------------+------------------+------------------+------------------+--------------------+------------------+------------------+
|  count|           3002425|    3002425|             3002425|             3002425|             3002425|           3002425|           3002425|           3002425|             3002425|           3002425|           3002425|
|   mean|1501224.9845368327|       null|       3.218588665E8|                null|           238095.35|              nul

In [68]:
## Find the Distinct values count in each column
from pyspark.sql.types import *
from pyspark.sql.functions import *
data.agg(*(countDistinct(col(c)).alias(c) for c in data.columns)).show()

+-------+-----------+-------------+--------+---------+------------------+---------------+----+--------+----+----+
| ROW_ID|CASE_STATUS|EMPLOYER_NAME|SOC_NAME|JOB_TITLE|FULL_TIME_POSITION|PREVAILING_WAGE|YEAR|WORKSITE| lon| lat|
+-------+-----------+-------------+--------+---------+------------------+---------------+----+--------+----+----+
|3002458|          8|       236016|    2140|   287560|                25|          56161|  33|   18633|2423|2416|
+-------+-----------+-------------+--------+---------+------------------+---------------+----+--------+----+----+



In [97]:
## Create view/table
data.createOrReplaceTempView("SqlTable")

In [102]:
## List EMPLOYER_NAME and YEAR in the descending order
Filter1 = spark.sql("""SELECT EMPLOYER_NAME, YEAR,CASE_STATUS  FROM SqlTable WHERE CASE_STATUS='CERTIFIED' ORDER BY CASE_STATUS desc""")

In [103]:
Filter1.show(5)

+--------------------+----+-----------+
|       EMPLOYER_NAME|YEAR|CASE_STATUS|
+--------------------+----+-----------+
|       ACCENTURE LLP|2011|  CERTIFIED|
|WILLIAM CAREY UNI...|2011|  CERTIFIED|
|EXILANT CONSULTIN...|2011|  CERTIFIED|
|QUALCOMM INCORPOR...|2011|  CERTIFIED|
|NFL ENTERPRISES,LLC.|2011|  CERTIFIED|
+--------------------+----+-----------+
only showing top 5 rows



In [104]:
#List the approved applications count in the descending order for the JOB_TITLE = "DATA SCIENTIST" and for each employer and year
Filter2 = spark.sql("""
SELECT EMPLOYER_NAME, YEAR,CASE_STATUS,JOB_TITLE  FROM SqlTable WHERE CASE_STATUS='CERTIFIED' and JOB_TITLE='DATA SCIENTIST' ORDER BY CASE_STATUS desc
""")

In [105]:
Filter2.show(5)

+------------------+----+-----------+--------------+
|     EMPLOYER_NAME|YEAR|CASE_STATUS|     JOB_TITLE|
+------------------+----+-----------+--------------+
|        IMVU, INC.|2011|  CERTIFIED|DATA SCIENTIST|
|INTENT MEDIA, INC.|2011|  CERTIFIED|DATA SCIENTIST|
|    FACEBOOK, INC.|2011|  CERTIFIED|DATA SCIENTIST|
|       INTUIT INC.|2011|  CERTIFIED|DATA SCIENTIST|
|   KONTAGENT, INC.|2011|  CERTIFIED|DATA SCIENTIST|
+------------------+----+-----------+--------------+
only showing top 5 rows



In [107]:
#Verify for Null Values
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in raw_data.columns]).show()

+------+-----------+-------------+--------+---------+------------------+---------------+----+--------+---+---+
|ROW_ID|CASE_STATUS|EMPLOYER_NAME|SOC_NAME|JOB_TITLE|FULL_TIME_POSITION|PREVAILING_WAGE|YEAR|WORKSITE|lon|lat|
+------+-----------+-------------+--------+---------+------------------+---------------+----+--------+---+---+
|     0|          0|            0|       0|        0|                 0|              0|   0|       0|  0|  0|
+------+-----------+-------------+--------+---------+------------------+---------------+----+--------+---+---+



In [108]:
# Remove all the rows with null values (in any column/position). 
df = data.na.drop( how = 'any' )
print('Before Dropping Null Values', data.count())
print('After Dropping Null Values', df.count())

('Before Dropping Null Values', 3002425)
('After Dropping Null Values', 3002425)


In [109]:
#Verify the null values count in each column. 
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+------+-----------+-------------+--------+---------+------------------+---------------+----+--------+---+---+
|ROW_ID|CASE_STATUS|EMPLOYER_NAME|SOC_NAME|JOB_TITLE|FULL_TIME_POSITION|PREVAILING_WAGE|YEAR|WORKSITE|lon|lat|
+------+-----------+-------------+--------+---------+------------------+---------------+----+--------+---+---+
|     0|          0|            0|       0|        0|                 0|              0|   0|       0|  0|  0|
+------+-----------+-------------+--------+---------+------------------+---------------+----+--------+---+---+



In [110]:
# List the count of applications in each status (CASE_STATUS) in the descending order of the year. 
filter3 = spark.sql("""SELECT CASE_STATUS,YEAR, count(*)  FROM SqlTable GROUP BY CASE_STATUS,YEAR ORDER BY YEAR""")

In [111]:
filter3.show(5)

+-------------------+----+--------+
|        CASE_STATUS|YEAR|count(1)|
+-------------------+----+--------+
|             DENIED|2011|   29130|
|          CERTIFIED|2011|  307933|
|CERTIFIED-WITHDRAWN|2011|   11596|
|          WITHDRAWN|2011|   10105|
|          CERTIFIED|2012|  352661|
+-------------------+----+--------+
only showing top 5 rows



In [114]:
# Find the mean PREVAILING_WAGE for each year for the approved applications
Filter4 = spark.sql("""
SELECT YEAR,AVG(PREVAILING_WAGE) FROM SqlTable WHERE CASE_STATUS='CERTIFIED' GROUP BY YEAR 
""")

In [115]:
Filter4.show(5)

+----+------------------------------------+
|YEAR|avg(CAST(PREVAILING_WAGE AS DOUBLE))|
+----+------------------------------------+
|2016|                   74235.85388256852|
|2012|                    70842.5693889056|
|2014|                   70573.50438874042|
|2013|                   71712.24635279125|
|2011|                   75453.04075967995|
+----+------------------------------------+
only showing top 5 rows



In [117]:
# Find the mean PREVAILING_WAGE for each year for the approved applications for each employer. 
Filter5 = spark.sql("""
SELECT YEAR,EMPLOYER_NAME,AVG(PREVAILING_WAGE) FROM SqlTable WHERE CASE_STATUS='CERTIFIED' GROUP BY YEAR,EMPLOYER_NAME
""")

In [118]:
Filter5.show(5)

+----+--------------------+------------------------------------+
|YEAR|       EMPLOYER_NAME|avg(CAST(PREVAILING_WAGE AS DOUBLE))|
+----+--------------------+------------------------------------+
|2016|  UNDER ARMOUR, INC.|                   89590.66666666667|
|2016|ASHLEY HOMESTORES...|                            158038.0|
|2016| RODAN & FIELDS, LLC|                          133289.125|
|2016|SAATCHI & SAATCHI...|                            122971.5|
|2016|GLOBAL ENERGY OPT...|                            117749.0|
+----+--------------------+------------------------------------+
only showing top 5 rows



In [122]:
# Find the approved applications count in each year for the full-time positions in the descending order of the year.
Filter6 = spark.sql("""SELECT YEAR,FULL_TIME_POSITION,count(CASE_STATUS) FROM SqlTable WHERE CASE_STATUS='CERTIFIED' group by YEAR,FULL_TIME_POSITION  ORDER BY YEAR""")

In [123]:
Filter6.show(5)

+----+------------------+------------------+
|YEAR|FULL_TIME_POSITION|count(CASE_STATUS)|
+----+------------------+------------------+
|2011|                 Y|            295607|
|2011|                 N|             12326|
|2012|                 N|             12227|
|2012|                 Y|            340434|
|2013|                 N|             11472|
+----+------------------+------------------+
only showing top 5 rows

