In [9]:
from __future__ import print_function
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType, DateType, DecimalType
from pyspark.sql.functions import desc
import pandas as pd
#import pyspark.sql.functions as f


In [10]:
# Get the SparkSession for this notebook (reuses an existing one if present,
# otherwise creates it) under the application name "PythonSQL".
spark = (
    SparkSession.builder
    .appName("PythonSQL")
    .getOrCreate()
)

In [11]:
# Build a DataFrame from a list of Row objects. No schema is passed, so Spark
# infers the column names and types from the Rows themselves.
rows = [
    Row(name="John", age=19),
    Row(name="Smith", age=23),
    Row(name="Sarah", age=18),
]
some_df = spark.createDataFrame(rows)

# Print the inferred schema first, then the contents.
some_df.printSchema()
some_df.show()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



[Stage 0:>                                                          (0 + 1) / 1]

+-----+---+
| name|age|
+-----+---+
| John| 19|
|Smith| 23|
|Sarah| 18|
+-----+---+



                                                                                

In [12]:
# Plain (name, age) tuples -- unlike Row objects they carry no column names.
tuples = [("John", 19), ("Smith", 23), ("Sarah", 18)]

# Explicit two-column schema (person_name, person_age); both columns are
# declared non-nullable.
schema = StructType(
    [
        StructField("person_name", StringType(), nullable=False),
        StructField("person_age", IntegerType(), nullable=False),
    ]
)

In [13]:
# Create a DataFrame by applying the explicit schema to the list of tuples
# (a plain Python list, not an RDD), then print the resulting schema.
another_df = spark.createDataFrame(tuples, schema)
another_df.printSchema()

root
 |-- person_name: string (nullable = false)
 |-- person_age: integer (nullable = false)



In [14]:
# Location of the sample people data in JSON format.
path = "/home/cdsw/resources/people.json"

# Read the file(s) at `path` into a DataFrame; the schema is inferred by Spark.
people = spark.read.json(path)

In [15]:
# printSchema() displays the schema Spark inferred for `people` as a tree of
# column name, type and nullability.
people.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [16]:
# Register the DataFrame as a temporary view so SQL can refer to it as "people".
people.createOrReplaceTempView("people")

# Run a SQL statement through `spark.sql`; the result is another DataFrame.
teenagers = spark.sql(
    "SELECT name FROM people WHERE age >= 13 AND age <= 19"
)


In [17]:
# collect() returns the result rows as a Python list; print the name of each.
for teen_row in teenagers.collect():
    print(teen_row["name"])


Justin


In [18]:
# Same age filter expressed with the DataFrame API instead of SQL.
# Note: this rebinds `teenagers`; filter() keeps every column of `people`,
# whereas the SQL version above selected only `name`.
teenagers = people.filter("age >= 13 and age <= 19")

In [19]:
# Project the two columns of interest and print the filtered rows.
teenagers.select("name", "age").show()

+------+---+
|  name|age|
+------+---+
|Justin| 19|
+------+---+



In [20]:
# Explicit schema for the emp.csv file; every column is declared nullable.
# NOTE(review): DecimalType() with no arguments uses Spark's default
# precision/scale of (10, 0), so a fractional part of `salary` would not be
# preserved -- confirm that matches the data.
schema_emp = StructType(
    [
        StructField("emp", IntegerType(), nullable=True),
        StructField("mgr", IntegerType(), nullable=True),
        StructField("dept", IntegerType(), nullable=True),
        StructField("job", IntegerType(), nullable=True),
        StructField("l_name", StringType(), nullable=True),
        StructField("f_name", StringType(), nullable=True),
        StructField("hire", DateType(), nullable=True),
        StructField("birth", DateType(), nullable=True),
        StructField("salary", DecimalType(), nullable=True),
    ]
)

In [21]:
# Load emp.csv (comma-delimited, no header row) into a DataFrame using the
# explicit employee schema instead of letting Spark infer one.
empDF = (
    spark.read
    .option("delimiter", ",")
    .option("header", False)
    .csv("/home/cdsw/resources/testdata/emp.csv", schema=schema_emp)
)


In [22]:
# Explicit schema for the dept.csv file; every column is declared nullable.
# NOTE(review): DecimalType() with no arguments uses Spark's default
# precision/scale of (10, 0) -- confirm `budget` never carries decimals.
schema_dept = StructType(
    [
        StructField("dept", IntegerType(), nullable=True),
        StructField("deptName", StringType(), nullable=True),
        StructField("budget", DecimalType(), nullable=True),
        StructField("mgr", IntegerType(), nullable=True),
    ]
)

In [23]:
# Load dept.csv (comma-delimited, no header row) into a DataFrame using the
# explicit department schema.
deptDF = (
    spark.read
    .option("delimiter", ",")
    .option("header", False)
    .csv("/home/cdsw/resources/testdata/dept.csv", schema=schema_dept)
)


In [24]:
# filter() and orderBy(): list all employees from dept 501, highest salary first.
(
    empDF
    .filter("dept = 501")
    .orderBy("salary", ascending=False)
    .show()
)

+----+----+----+------+--------+------+----------+----------+------+
| emp| mgr|dept|   job|  l_name|f_name|      hire|     birth|salary|
+----+----+----+------+--------+------+----------+----------+------+
|1017| 801| 501|511100|  Runyon| Irene|1978-05-01|1951-11-10| 66000|
|1018|1017| 501|512101|Ratzlaff| Larry|1978-07-15|1954-05-31| 54000|
|1015|1017| 501|512101|  Wilson|Edward|1978-03-01|1957-03-04| 53625|
|1023|1017| 501|512101|  Rabbit| Peter|1979-03-01|1962-10-29| 26500|
+----+----+----+------+--------+------+----------+----------+------+



In [25]:
# Join using the DataFrame API: dept-501 employees matched to their department.
# Both inputs have `dept` and `mgr` columns, so each name appears twice in the
# joined result.
(
    empDF
    .filter("dept = 501")
    .join(deptDF, empDF.dept == deptDF.dept)
    .show()
)

+----+----+----+------+--------+------+----------+----------+------+----+---------------+------+----+
| emp| mgr|dept|   job|  l_name|f_name|      hire|     birth|salary|dept|       deptName|budget| mgr|
+----+----+----+------+--------+------+----------+----------+------+----+---------------+------+----+
|1018|1017| 501|512101|Ratzlaff| Larry|1978-07-15|1954-05-31| 54000| 501|marketing sales|308000|1017|
|1017| 801| 501|511100|  Runyon| Irene|1978-05-01|1951-11-10| 66000| 501|marketing sales|308000|1017|
|1015|1017| 501|512101|  Wilson|Edward|1978-03-01|1957-03-04| 53625| 501|marketing sales|308000|1017|
|1023|1017| 501|512101|  Rabbit| Peter|1979-03-01|1962-10-29| 26500| 501|marketing sales|308000|1017|
+----+----+----+------+--------+------+----------+----------+------+----+---------------+------+----+



In [26]:
# Repeat the join between the two DataFrames using SQL. First register both as
# temporary views so they can be referenced by name ("employee", "department")
# from spark.sql().
empDF.createOrReplaceTempView("employee")
deptDF.createOrReplaceTempView("department")

In [27]:
# Run the join as a SQL statement against the "employee" and "department" views.
# The DataFrame is kept in `joinData` and displayed in a separate call: show()
# only prints and returns None, so chaining it onto the assignment would leave
# `joinData` bound to None instead of the query result.
joinData = spark.sql(
    "select * from employee inner join department "
    "on employee.dept=department.dept where employee.dept=501"
)
joinData.show()

+----+----+----+------+--------+------+----------+----------+------+----+---------------+------+----+
| emp| mgr|dept|   job|  l_name|f_name|      hire|     birth|salary|dept|       deptName|budget| mgr|
+----+----+----+------+--------+------+----------+----------+------+----+---------------+------+----+
|1018|1017| 501|512101|Ratzlaff| Larry|1978-07-15|1954-05-31| 54000| 501|marketing sales|308000|1017|
|1017| 801| 501|511100|  Runyon| Irene|1978-05-01|1951-11-10| 66000| 501|marketing sales|308000|1017|
|1015|1017| 501|512101|  Wilson|Edward|1978-03-01|1957-03-04| 53625| 501|marketing sales|308000|1017|
|1023|1017| 501|512101|  Rabbit| Peter|1979-03-01|1962-10-29| 26500| 501|marketing sales|308000|1017|
+----+----+----+------+--------+------+----------+----------+------+----+---------------+------+----+



In [28]:
# Examples for show(), first(), take(), limit()
# show() prints the rows as a formatted table; here only the first 2 rows.
deptDF.show(n=2)

+----+--------------------+------+----+
|dept|            deptName|budget| mgr|
+----+--------------------+------+----+
| 301|research and deve...|465600|1019|
| 501|     marketing sales|308000|1017|
+----+--------------------+------+----+
only showing top 2 rows



In [29]:
# first() returns a single Row object (the first row), not a list of rows.
deptDF.first()

Row(dept=301, deptName='research and development', budget=Decimal('465600'), mgr=1019)

In [30]:
# take(n) returns a Python list holding the first n Row objects; such a list
# can be passed to spark.createDataFrame() to build a new DF.
deptDF.take(2)

[Row(dept=301, deptName='research and development', budget=Decimal('465600'), mgr=1019),
 Row(dept=501, deptName='marketing sales', budget=Decimal('308000'), mgr=1017)]

In [31]:
# distinct() combined with filter(): the `dept` column restricted to 301 and
# de-duplicated, leaving a single row.
(
    empDF
    .select("dept")
    .filter("dept = 301")
    .distinct()
    .show()
)

+----+
|dept|
+----+
| 301|
+----+



In [32]:
# groupBy() with count() and sum()
# Show the total number of employees per department.
(
    empDF
    .groupBy(empDF.dept)
    .count()
    .show()
)

+----+-----+
|dept|count|
+----+-----+
| 501|    4|
| 402|    2|
| 301|    3|
| 100|    1|
| 403|    6|
| 201|    2|
| 302|    1|
| 401|    7|
+----+-----+



In [33]:
# Show the sum of salaries for each department.
(
    empDF
    .groupBy("dept")
    .sum("salary")
    .show()
)

+----+-----------+
|dept|sum(salary)|
+----+-----------+
| 501|     200125|
| 402|      77000|
| 301|     116400|
| 100|     100000|
| 403|     233000|
| 201|      73450|
| 302|      56500|
| 401|     245575|
+----+-----------+



In [34]:
# Show the average salary per department, highest average first.
# The aggregate column is produced as "avg(salary)" and renamed to "avgSalary"
# before sorting on it.
(
    empDF
    .select("dept", "salary")
    .groupBy("dept")
    .avg("salary")
    .withColumnRenamed("avg(salary)", "avgSalary")
    .orderBy(desc("avgSalary"))
    .show()
)

+----+-----------+
|dept|  avgSalary|
+----+-----------+
| 100|100000.0000|
| 302| 56500.0000|
| 501| 50031.2500|
| 403| 38833.3333|
| 301| 38800.0000|
| 402| 38500.0000|
| 201| 36725.0000|
| 401| 35082.1429|
+----+-----------+



In [35]:
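# Write the DF out as CSV (left disabled; uncomment to run). Note that Spark's
# writer creates a directory of part files at the given path, not a single file.
#empDF.write.csv("/tmp/pavel_test.csv")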

In [36]:
# Stop the SparkSession now that the notebook is finished with it.
spark.stop()