# Table joining

In [1]:
# Spark related machinery
import pyspark
import pyspark.sql.functions as F
from pyspark import SparkConf
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext
from pyspark.sql.functions import concat_ws

# Session shared by every cell below: reuse the active SparkSession if there
# is one, otherwise build a new one with Hive support enabled.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
from pyspark_functions import create_players_table,\
                              create_new_players_table,\
                              create_ranking_table

## Table creation

We start by creating some tables. In this case we will have two tables with information about video game players, and a third table with their rankings:

* **players**: Contains the name of the player, how old the player is, the game he/she plays, and the player id
* **new_players**: A continuation of the **players** table: it has the same columns and holds additional players.
* **ranking**: Contains the player’s id, and the player’s ranking

In [3]:
# Build the three example DataFrames using the helpers imported from
# pyspark_functions.
players = create_players_table()
new_players = create_new_players_table()
ranking = create_ranking_table()

# Print every table so the inputs of the joins below are visible.
for table in (players, new_players, ranking):
    table.show()

+-------+---+--------------+---------+
|   Name|Age|          Game|Player_id|
+-------+---+--------------+---------+
|Nicolas| 25|          Doom|        1|
| Camila| 23|        Diablo|        2|
|Gabriel| 20|   Wolfenstein|        3|
|  Mateo| 23|    Zelda BOTW|        4|
|   Luna| 21|         Mario|        5|
|   Lily| 25|Counter Strike|        6|
|  Sofia| 26|     Max Payne|        7|
|    Leo| 28|       Fifa 20|        8|
| Thomas| 22|         Speed|        9|
|  James| 30| Goldeneye 007|       15|
+-------+---+--------------+---------+

+-----+---+------+---------+
| Name|Age|  Game|Player_id|
+-----+---+------+---------+
|  Mia| 22| Mario|       10|
|David| 28|Diablo|       11|
|Dylan| 21|  Doom|       12|
+-----+---+------+---------+

+---------+-------+
|Player_id|Ranking|
+---------+-------+
|        1|      1|
|        2|      1|
|        3|      4|
|        4|      9|
|        5|      2|
|        6|      3|
|        7|     99|
|        8|     22|
|        9|     12|
|       10|    440|
|       11|     21|
+---------+-------+

# Inner join

Only the keys common to **both** tables are preserved.

In [4]:
# Inner join: a player is kept only if its Player_id also exists in `ranking`.
players_ranking_inner = players.join(
    other=ranking,
    on="Player_id",
    how="inner",
)

# Sort by the join key before printing so the rows appear in a stable order.
players_ranking_inner.sort("Player_id").show()

+---------+-------+---+--------------+-------+
|Player_id|   Name|Age|          Game|Ranking|
+---------+-------+---+--------------+-------+
|        1|Nicolas| 25|          Doom|      1|
|        2| Camila| 23|        Diablo|      1|
|        3|Gabriel| 20|   Wolfenstein|      4|
|        4|  Mateo| 23|    Zelda BOTW|      9|
|        5|   Luna| 21|         Mario|      2|
|        6|   Lily| 25|Counter Strike|      3|
|        7|  Sofia| 26|     Max Payne|     99|
|        8|    Leo| 28|       Fifa 20|     22|
|        9| Thomas| 22|         Speed|     12|
+---------+-------+---+--------------+-------+



# Left join example

All the keys on the **left table** (the first table) are preserved.

In [5]:
# Left join: every row of `players` (the left table) is kept; players without
# a match in `ranking` get a null Ranking.
ranking_players_left = players.join(
    other=ranking,
    on="Player_id",
    how="left",
)

# Sort by the join key before printing so the rows appear in a stable order.
ranking_players_left.sort("Player_id").show()

+---------+-------+---+--------------+-------+
|Player_id|   Name|Age|          Game|Ranking|
+---------+-------+---+--------------+-------+
|        1|Nicolas| 25|          Doom|      1|
|        2| Camila| 23|        Diablo|      1|
|        3|Gabriel| 20|   Wolfenstein|      4|
|        4|  Mateo| 23|    Zelda BOTW|      9|
|        5|   Luna| 21|         Mario|      2|
|        6|   Lily| 25|Counter Strike|      3|
|        7|  Sofia| 26|     Max Payne|     99|
|        8|    Leo| 28|       Fifa 20|     22|
|        9| Thomas| 22|         Speed|     12|
|       15|  James| 30| Goldeneye 007|   null|
+---------+-------+---+--------------+-------+



# Outer join example
All the keys in **both** tables are preserved.

In [6]:
# Outer join: keys from both tables are kept; the columns coming from the
# table that lacks a given key are filled with null.
ranking_players_outer = players.join(
    other=ranking,
    on="Player_id",
    how="outer",
)

# Sort the table by Player_id and print it.
ranking_players_outer.sort("Player_id").show()

+---------+-------+----+--------------+-------+
|Player_id|   Name| Age|          Game|Ranking|
+---------+-------+----+--------------+-------+
|        1|Nicolas|  25|          Doom|      1|
|        2| Camila|  23|        Diablo|      1|
|        3|Gabriel|  20|   Wolfenstein|      4|
|        4|  Mateo|  23|    Zelda BOTW|      9|
|        5|   Luna|  21|         Mario|      2|
|        6|   Lily|  25|Counter Strike|      3|
|        7|  Sofia|  26|     Max Payne|     99|
|        8|    Leo|  28|       Fifa 20|     22|
|        9| Thomas|  22|         Speed|     12|
|       10|   null|null|          null|    440|
|       11|   null|null|          null|     21|
|       15|  James|  30| Goldeneye 007|   null|
+---------+-------+----+--------------+-------+



# Left anti join example
Only the keys of the left table (the first table) that are not present in the second table are preserved:

In [7]:
# Left anti join: keep the rows of `players` whose Player_id has no match in
# `ranking`; only the columns of `players` appear in the result.
ranking_players_left_anti = players.join(
    other=ranking,
    on="Player_id",
    how="left_anti",
)

ranking_players_left_anti.show()

+---------+-----+---+-------------+
|Player_id| Name|Age|         Game|
+---------+-----+---+-------------+
|       15|James| 30|Goldeneye 007|
+---------+-----+---+-------------+



# Union tables
Sometimes you have a dataset split into two tables, as in the case of the **players** and **new_players** tables, and you want to "union" them (put one after the other). This is done as follows:

In [8]:
# Column names of `players`, in their original order.
cols = players.columns

# union() pairs columns by position, not by name, so first put the columns of
# `new_players` in the same order as those of `players`.
new_players_aligned = new_players.select(*cols)

# Append the rows of the aligned table after the rows of `players`.
full_players = players.union(new_players_aligned)

# Print the combined table, sorted by Player_id.
full_players.sort("Player_id").show()

+-------+---+--------------+---------+
|   Name|Age|          Game|Player_id|
+-------+---+--------------+---------+
|Nicolas| 25|          Doom|        1|
| Camila| 23|        Diablo|        2|
|Gabriel| 20|   Wolfenstein|        3|
|  Mateo| 23|    Zelda BOTW|        4|
|   Luna| 21|         Mario|        5|
|   Lily| 25|Counter Strike|        6|
|  Sofia| 26|     Max Payne|        7|
|    Leo| 28|       Fifa 20|        8|
| Thomas| 22|         Speed|        9|
|    Mia| 22|         Mario|       10|
|  David| 28|        Diablo|       11|
|  Dylan| 21|          Doom|       12|
|  James| 30| Goldeneye 007|       15|
+-------+---+--------------+---------+



In [9]:
# Pitfall: union() matches columns by position, not by name. Here the columns
# of `new_players` are selected in a different order than in `players`
# (Player_id before Game), so for the appended rows the Player_id values end
# up under "Game" and the Game values under "Player_id" (see the last rows of
# the output).
# NOTE(review): presumably a deliberate counter-example to the previous cell;
# unionByName() would match the columns by name instead.
reordered_new_players = new_players.select(["Name", "Age", "Player_id", "Game"])
players.union(reordered_new_players).show()

+-------+---+--------------+---------+
|   Name|Age|          Game|Player_id|
+-------+---+--------------+---------+
|Nicolas| 25|          Doom|        1|
| Camila| 23|        Diablo|        2|
|Gabriel| 20|   Wolfenstein|        3|
|  Mateo| 23|    Zelda BOTW|        4|
|   Luna| 21|         Mario|        5|
|   Lily| 25|Counter Strike|        6|
|  Sofia| 26|     Max Payne|        7|
|    Leo| 28|       Fifa 20|        8|
| Thomas| 22|         Speed|        9|
|  James| 30| Goldeneye 007|       15|
|    Mia| 22|            10|    Mario|
|  David| 28|            11|   Diablo|
|  Dylan| 21|            12|     Doom|
+-------+---+--------------+---------+

