In [1]:
import findspark

# findspark.init() is called before pyspark is imported, so the pyspark
# import below intentionally stays after this call.
findspark.init()

from pyspark.sql import SparkSession

# Start (or reuse) a Spark session named "my app" on the "local" master.
spark = (
    SparkSession.builder
    .appName("my app")
    .master("local")
    .getOrCreate()
)

# SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# Sample data used by every example below.
# person.graduate_program is joined against graduateProgram.id later on;
# person.current_status holds lists whose values appear to correspond to
# currentStatus.id (not joined in the cells shown here).
person = spark.createDataFrame(
    [
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ],
    ["id", "name", "graduate_program", "current_status"],
)

graduateProgram = spark.createDataFrame(
    [
        (0, "masters", "school of information", "UC Berkeley"),
        (1, "masters", "CS", "Hanyang Univ"),
        (2, "Ph.D.", "CS", "Hanyang Univ"),
    ],
    ["id", "degree", "department", "school"],
)

currentStatus = spark.createDataFrame(
    [
        (500, "Vice President"),
        (250, "Senior Programmer"),
        (100, "Junior Programmer"),
    ],
    ["id", "status"],
)

In [3]:
# Print the person DataFrame as a text table.
person.show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program| current_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



In [4]:
# Print the graduateProgram DataFrame; long strings are truncated with "..." in the output.
graduateProgram.show()

+---+-------+--------------------+------------+
| id| degree|          department|      school|
+---+-------+--------------------+------------+
|  0|masters|school of informa...| UC Berkeley|
|  1|masters|                  CS|Hanyang Univ|
|  2|  Ph.D.|                  CS|Hanyang Univ|
+---+-------+--------------------+------------+



In [5]:
# Print the currentStatus lookup table (id -> status label).
currentStatus.show()

+---+-----------------+
| id|           status|
+---+-----------------+
|500|   Vice President|
|250|Senior Programmer|
|100|Junior Programmer|
+---+-----------------+



### filter(condition)
Filters rows using the given condition.

where() is an alias for filter().

Parameters
* condition – a Column of types.BooleanType, or a string containing a SQL expression.

In [6]:
# Indexing a DataFrame by column name yields a Column object (a reference
# usable in expressions), not the column's data.
person["graduate_program"]

Column<b'graduate_program'>

In [7]:
# Variant 1: filter with a boolean Column built from the DataFrame's own column.
person.filter(person["graduate_program"] == 1).show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program| current_status|
+---+----------------+----------------+---------------+
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



In [8]:
# NOTE(review): this wildcard import brings every pyspark.sql.functions name
# into the notebook namespace, which shadows Python builtins of the same name
# (e.g. sum, max, min). The next cell relies on it for `col`, and later cells
# may rely on other names, so it is left as is; prefer an explicit import in
# the first cell once the full set of used functions is known.
from pyspark.sql.functions import *
# Variant 2: filter with a SQL expression string wrapped in expr().
person.filter(expr("graduate_program = 1")).show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program| current_status|
+---+----------------+----------------+---------------+
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



In [9]:
# Variant 3: filter with col(), which refers to the column by name only.
# `col` comes from the pyspark.sql.functions import in the previous cell.
person.filter(col("graduate_program") == 1).show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program| current_status|
+---+----------------+----------------+---------------+
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



### distinct() — a row is removed only if it matches another row in every column
Returns a new DataFrame containing the distinct rows in this DataFrame.

In [10]:
# No two rows of person are identical in every column, so all three remain.
# Row order in the output is not the insertion order.
person.distinct().show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program| current_status|
+---+----------------+----------------+---------------+
|  2|Michael Armbrust|               1|     [250, 100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  0|   Bill Chambers|               0|          [100]|
+---+----------------+----------------+---------------+



In [11]:
# Project to a single column first so distinct() de-duplicates that column's values.
person.select("graduate_program").distinct().show()

+----------------+
|graduate_program|
+----------------+
|               0|
|               1|
+----------------+



### join(other, on=None, how=None)
Joins with another DataFrame, using the given join expression.

Parameters
* other – Right side of the join
* on – a string for the join column name, a list of column names, a join expression (Column), or a list of Columns. If on is a string or a list of strings indicating the name of the join column(s), the column(s) must exist on both sides, and this performs an equi-join.
* how – str, default inner. Must be one of: inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi, and left_anti.

In [12]:
# Display both join inputs again for reference before joining them.
person.show()
graduateProgram.show()

+---+----------------+----------------+---------------+
| id|            name|graduate_program| current_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+

+---+-------+--------------------+------------+
| id| degree|          department|      school|
+---+-------+--------------------+------------+
|  0|masters|school of informa...| UC Berkeley|
|  1|masters|                  CS|Hanyang Univ|
|  2|  Ph.D.|                  CS|Hanyang Univ|
+---+-------+--------------------+------------+



In [13]:
# Inner join: person.graduate_program (foreign key) matched against
# graduateProgram.id (primary key). Both "id" columns appear in the result.
(
    person
    .join(
        graduateProgram,
        person["graduate_program"] == graduateProgram["id"],
        "inner",
    )
    .show()
)

+---+----------------+----------------+---------------+---+-------+--------------------+------------+
| id|            name|graduate_program| current_status| id| degree|          department|      school|
+---+----------------+----------------+---------------+---+-------+--------------------+------------+
|  0|   Bill Chambers|               0|          [100]|  0|masters|school of informa...| UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|masters|                  CS|Hanyang Univ|
|  2|Michael Armbrust|               1|     [250, 100]|  1|masters|                  CS|Hanyang Univ|
+---+----------------+----------------+---------------+---+-------+--------------------+------------+



#### select after join

In [14]:
# Join first, then keep only the columns of interest from the combined result.
(
    person
    .join(
        graduateProgram,
        person["graduate_program"] == graduateProgram["id"],
        "inner",
    )
    .select("name", "degree", "school")
    .show()
)

+----------------+-------+------------+
|            name| degree|      school|
+----------------+-------+------------+
|   Bill Chambers|masters| UC Berkeley|
|   Matei Zaharia|masters|Hanyang Univ|
|Michael Armbrust|masters|Hanyang Univ|
+----------------+-------+------------+



In [15]:
# Same result as joining first: each side is trimmed to the needed columns
# before the join, and the final select drops the join keys.
(
    person.select("name", "graduate_program")
    .join(
        graduateProgram.select("id", "degree", "school"),
        person["graduate_program"] == graduateProgram["id"],
        "inner",
    )
    .select("name", "degree", "school")
    .show()
)

+----------------+-------+------------+
|            name| degree|      school|
+----------------+-------+------------+
|   Bill Chambers|masters| UC Berkeley|
|   Matei Zaharia|masters|Hanyang Univ|
|Michael Armbrust|masters|Hanyang Univ|
+----------------+-------+------------+



In [17]:
# Full outer join: unmatched rows from either side are kept, padded with nulls.
# NOTE(review): the captured output below contains a row (3, "kim") that is not
# in `person` as defined at the top of the notebook; it appears to come from a
# run made after `person` was redefined with a fourth row in a later cell.
# Re-run the notebook top to bottom to refresh this output — TODO confirm.
person.join(graduateProgram, person["graduate_program"] == graduateProgram["id"], "outer").show()

+----+----------------+----------------+---------------+----+-------+--------------------+------------+
|  id|            name|graduate_program| current_status|  id| degree|          department|      school|
+----+----------------+----------------+---------------+----+-------+--------------------+------------+
|   0|   Bill Chambers|               0|          [100]|   0|masters|school of informa...| UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|   1|masters|                  CS|Hanyang Univ|
|   2|Michael Armbrust|               1|     [250, 100]|   1|masters|                  CS|Hanyang Univ|
|   3|             kim|               3|          [100]|null|   null|                null|        null|
|null|            null|            null|           null|   2|  Ph.D.|                  CS|Hanyang Univ|
+----+----------------+----------------+---------------+----+-------+--------------------+------------+



### Add an unmatched row (no matching graduate program) to demonstrate outer joins

In [18]:
# Rebuild person with one extra row ("Kim") whose graduate_program value 3 has
# no matching id in graduateProgram. This replaces the earlier `person`, and
# the cells below depend on the four-row version.
person = spark.createDataFrame(
    [
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
        (3, "Kim", 3, [100]),
    ],
    ["id", "name", "graduate_program", "current_status"],
)

In [19]:
# Full outer join: "Kim" (no matching program) and the Ph.D. program (no
# matching person) both appear, with nulls filling the other side's columns.
person.join(graduateProgram, person["graduate_program"] == graduateProgram["id"], "outer").show()

+----+----------------+----------------+---------------+----+-------+--------------------+------------+
|  id|            name|graduate_program| current_status|  id| degree|          department|      school|
+----+----------------+----------------+---------------+----+-------+--------------------+------------+
|   0|   Bill Chambers|               0|          [100]|   0|masters|school of informa...| UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|   1|masters|                  CS|Hanyang Univ|
|   2|Michael Armbrust|               1|     [250, 100]|   1|masters|                  CS|Hanyang Univ|
|   3|             Kim|               3|          [100]|null|   null|                null|        null|
|null|            null|            null|           null|   2|  Ph.D.|                  CS|Hanyang Univ|
+----+----------------+----------------+---------------+----+-------+--------------------+------------+



In [20]:
# Both inputs have an "id" column, so after the join a plain "id" would be
# ambiguous; qualify it through the source DataFrame: person["id"].
(
    person
    .join(
        graduateProgram,
        person["graduate_program"] == graduateProgram["id"],
        "inner",
    )
    .select(person["id"], "name", "degree")
    .show()
)

+---+----------------+-------+
| id|            name| degree|
+---+----------------+-------+
|  0|   Bill Chambers|masters|
|  1|   Matei Zaharia|masters|
|  2|Michael Armbrust|masters|
+---+----------------+-------+



In [23]:
# The join type string is "left_outer" ("left_join" is not an accepted value).
# Every person row is kept; "Kim" has no matching program, so degree is null.
(
    person
    .join(
        graduateProgram,
        person["graduate_program"] == graduateProgram["id"],
        "left_outer",
    )
    .select(person["id"], "name", "degree")
    .show()
)

+---+----------------+-------+
| id|            name| degree|
+---+----------------+-------+
|  0|   Bill Chambers|masters|
|  1|   Matei Zaharia|masters|
|  2|Michael Armbrust|masters|
|  3|             Kim|   null|
+---+----------------+-------+

