# Joins in `pyspark`

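Joins are performed with `df_left.join(df_right, on=join_condition, how=type_str)`, where `type_str` names the join type (`'inner'`, `'left'`, `'right'`, or `'outer'`).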

In [1]:
from pyspark.sql import SparkSession
from more_pyspark import to_pandas
# Start (or reuse) the Spark session that the rest of the notebook reads through.
spark = (SparkSession.builder
         .appName('Ops')
         .getOrCreate())

# Department table: header row supplies the column names, types are inferred.
deptk = (spark.read
         .options(header=True, inferSchema=True)
         .csv("./data/department.csv"))

# Pipe the collected rows into `to_pandas` so the cell displays a table.
deptk.collect() >> to_pandas

Unnamed: 0,DeptID,DeptName
0,31,Sales
1,33,Engineering
2,34,Clerical
3,35,Marketing


In [2]:
# Employee table, read with the same header / schema-inference options.
emplk = (spark.read
         .options(header=True, inferSchema=True)
         .csv("./data/employee.csv"))

# Display the collected rows as a table.
emplk.collect() >> to_pandas

Unnamed: 0,DeptID,LastName
0,31.0,Rafferty
1,33.0,Jones
2,33.0,Heisenberg
3,34.0,Robinson
4,34.0,Smith
5,,Williams


#### Inner join

In [3]:
# Inner join: only employee/department pairs with a matching DeptID survive.
(emplk
 .join(deptk, on=(emplk['DeptID'] == deptk['DeptID']), how='inner')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Jones,Engineering
2,33,Heisenberg,Engineering
3,34,Robinson,Clerical
4,34,Smith,Clerical


#### Left join

In [4]:
# Left join: every employee row is kept; unmatched ones get empty department fields.
(emplk
 .join(deptk, on=(emplk['DeptID'] == deptk['DeptID']), how='left')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31.0,Rafferty,Sales
1,33.0,Jones,Engineering
2,33.0,Heisenberg,Engineering
3,34.0,Robinson,Clerical
4,34.0,Smith,Clerical
5,,Williams,


#### Right join

In [5]:
# Right join: every department row is kept; unmatched ones get empty employee fields.
(emplk
 .join(deptk, on=(emplk['DeptID'] == deptk['DeptID']), how='right')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Heisenberg,Engineering
2,33,Jones,Engineering
3,34,Smith,Clerical
4,34,Robinson,Clerical
5,35,,Marketing


#### Outer join

In [6]:
# Outer join: rows from both tables are kept, matched where DeptID agrees.
(emplk
 .join(deptk, on=(emplk['DeptID'] == deptk['DeptID']), how='outer')
 .collect()) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,,Williams,
1,35.0,,Marketing
2,34.0,Robinson,Clerical
3,34.0,Smith,Clerical
4,31.0,Rafferty,Sales
5,33.0,Jones,Engineering
6,33.0,Heisenberg,Engineering


## <font color="red"> Exercise 2 </font>

Determine all the players that have hit more than 100 home runs in a season.  The final table should include the player's proper name, as well as the team name.

**Hint:** You will need to join the files listed below.  To get credit for this exercise, use the `pyspark` join methods presented above.

In [7]:
# Paths to the three CSV files joined in this exercise.
f1 = "./data/baseball/core/Batting.csv"
f2 = "./data/baseball/core/People.csv"
f3 = "./data/baseball/core/Teams.csv"

In [40]:
import pyspark.sql.functions as fn
from pyspark.sql.functions import col, column, mean

In [31]:
# Batting table: header row supplies the column names, types are inferred.
battingk = (spark.read
            .options(header=True, inferSchema=True)
            .csv(f1))


In [33]:
# Keep only the batting columns used below: the join keys and the home-run count.
battingk1 = battingk.select('playerID', 'yearID', 'teamID', 'HR')


In [35]:
# People table: header row supplies the column names, types are inferred.
peoplek = (spark.read
           .options(header=True, inferSchema=True)
           .csv(f2))


In [34]:
# Keep only the player key and the given-name column.
peoplek1 = peoplek.select('playerID', 'nameGiven')

In [37]:
# Teams table: header row supplies the column names, types are inferred.
teamsk = (spark.read
          .options(header=True, inferSchema=True)
          .csv(f3))


In [36]:
# Keep only the (yearID, teamID) key and the team's display name.
teamsk1 = teamsk.select('yearID', 'teamID', 'name')

In [38]:
# Attach each batter's given name. The left join keeps every batting row even
# when no People row matches.
#
# Joining on an equality expression keeps the key column from BOTH sides, which
# would leave two `playerID` columns and make any later reference to `playerID`
# ambiguous. Dropping the right-hand copy leaves a single key column and keeps
# the remaining columns in their original order.
batjoinpeop = (battingk1
               .join(peoplek1, battingk1.playerID == peoplek1.playerID, how='left')
               .drop(peoplek1.playerID))

# Display the collected rows as a table.
# NOTE(review): collect() returns every joined row to the driver — consider
# limiting the frame first if only a preview is wanted.
batjoinpeop.collect() >> to_pandas

Unnamed: 0,playerID,yearID,teamID,HR,nameGiven
0,abercda01,1871,TRO,0,Francis Patterson
1,addybo01,1871,RC1,0,Robert Edward
2,allisar01,1871,CL1,0,Arthur Algernon
3,allisdo01,1871,WS3,2,Douglas L.
4,ansonca01,1871,RC1,0,Adrian Constantine
5,armstbo01,1871,FW1,0,Robert Livingston
6,barkeal01,1871,RC1,0,Alfred L.
7,barnero01,1871,BS1,0,Charles Roscoe
8,barrebi01,1871,FW1,0,William
9,barrofr01,1871,BS1,0,Franklin Lee


In [39]:
# Attach the team name for each (teamID, yearID) pair. The left join keeps every
# batting row even when no Teams row matches.
#
# An expression join keeps the key columns from both sides; drop the right-hand
# `teamID` / `yearID` copies so the result has exactly one of each and later
# references to those columns are unambiguous.
team_match = ((batjoinpeop.teamID == teamsk1.teamID)
              & (batjoinpeop.yearID == teamsk1.yearID))
alljoined = (batjoinpeop
             .join(teamsk1, team_match, how='left')
             .drop(teamsk1.teamID)
             .drop(teamsk1.yearID))

# Display the collected rows as a table.
# NOTE(review): collect() returns every joined row to the driver — consider
# limiting the frame first if only a preview is wanted.
alljoined.collect() >> to_pandas

Unnamed: 0,playerID,yearID,teamID,HR,nameGiven,name
0,abercda01,1871,TRO,0,Francis Patterson,Troy Haymakers
1,addybo01,1871,RC1,0,Robert Edward,Rockford Forest Citys
2,allisar01,1871,CL1,0,Arthur Algernon,Cleveland Forest Citys
3,allisdo01,1871,WS3,2,Douglas L.,Washington Olympics
4,ansonca01,1871,RC1,0,Adrian Constantine,Rockford Forest Citys
5,armstbo01,1871,FW1,0,Robert Livingston,Fort Wayne Kekiongas
6,barkeal01,1871,RC1,0,Alfred L.,Rockford Forest Citys
7,barnero01,1871,BS1,0,Charles Roscoe,Boston Red Stockings
8,barrebi01,1871,FW1,0,William,Fort Wayne Kekiongas
9,barrofr01,1871,BS1,0,Franklin Lee,Boston Red Stockings


In [47]:
# Keep only the player-seasons with more than 70 home runs.
# NOTE(review): the exercise statement asks for "more than 100 home runs";
# this cell filters on 70 — confirm which threshold is intended.
filthr = alljoined.filter(alljoined['HR'] > 70)

# Display the matching rows as a table.
filthr.collect() >> to_pandas

Unnamed: 0,playerID,yearID,teamID,HR,nameGiven,name
0,bondsba01,2001,SFN,73,Barry Lamar,San Francisco Giants


## Up Next

Stuff