In [74]:
# findspark locates the local Spark installation and makes PySpark importable,
# so init() has to run before the pyspark import that follows.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Reuse the already-running SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [75]:
# Sanity check: confirm that `spark` is a SparkSession.
type(spark)

pyspark.sql.session.SparkSession

In [76]:
 # Load the employee CSV into a DataFrame.
 # The path is relative to the notebook's working directory. No schema or
 # inferSchema option is given, so every column is read as a string.
 emp_df = (
     spark.read
     .format("csv")
     # The first line of the file holds the column names.
     .option("header", "true")
     # Raw string: "\D" and "\e" are not valid escape sequences, so a plain
     # literal triggers an invalid-escape warning. The path value is unchanged.
     .load(r"OneDrive\Documents\DATA\employee.csv")
 )

In [77]:
# Print the employee DataFrame as a text table.
emp_df.show()

+---+-------+-----------+---------+-----------+
| ID|   Name|   HomeTown|   Salary|JoiningDate|
+---+-------+-----------+---------+-----------+
|  1|  Arpit|  Burhanpur| 50000.00| 2018-11-14|
|  2|   Benu|Bhubaneswar|100000.00| 2018-11-19|
|  3|Dilsher|   Amritsar|100000.00| 2018-11-19|
|  4|  Kiran|  Bengaluru| 50000.00| 2018-11-15|
+---+-------+-----------+---------+-----------+



In [78]:
# Load the department CSV into a DataFrame.
# The path is relative to the notebook's working directory. No schema or
# inferSchema option is given, so every column is read as a string.
dept_df = (
    spark.read
    .format("csv")
    # The first line of the file holds the column names.
    .option("header", "true")
    # Raw string: "\D" and "\d" are not valid escape sequences, so a plain
    # literal triggers an invalid-escape warning. The path value is unchanged.
    .load(r"OneDrive\Documents\DATA\department.csv")
)

In [79]:
# Print the department DataFrame as a text table.
dept_df.show()

+----------+----------------+--------------+-------------+
|EmployeeID|  DepartmentName|        Client|OnboardedDate|
+----------+----------------+--------------+-------------+
|         1|Data Engineering|Funding Circle|   2019-01-15|
|         2|Data Engineering|Funding Circle|   2018-11-19|
|         3|  Data Analytics|Funding Circle|   2018-11-19|
|         4|  Data Analytics|Funding Circle|   2018-12-16|
+----------+----------------+--------------+-------------+



# Joining Data frames

In [80]:
# Inner join: keep employee rows whose ID has a matching EmployeeID in the
# department data. Both key columns are retained in the result.
joined_df = emp_df.join(
    dept_df,
    on=emp_df["ID"] == dept_df["EmployeeID"],
    how="inner",
)
joined_df.show()

+---+-------+-----------+---------+-----------+----------+----------------+--------------+-------------+
| ID|   Name|   HomeTown|   Salary|JoiningDate|EmployeeID|  DepartmentName|        Client|OnboardedDate|
+---+-------+-----------+---------+-----------+----------+----------------+--------------+-------------+
|  1|  Arpit|  Burhanpur| 50000.00| 2018-11-14|         1|Data Engineering|Funding Circle|   2019-01-15|
|  2|   Benu|Bhubaneswar|100000.00| 2018-11-19|         2|Data Engineering|Funding Circle|   2018-11-19|
|  3|Dilsher|   Amritsar|100000.00| 2018-11-19|         3|  Data Analytics|Funding Circle|   2018-11-19|
|  4|  Kiran|  Bengaluru| 50000.00| 2018-11-15|         4|  Data Analytics|Funding Circle|   2018-12-16|
+---+-------+-----------+---------+-----------+----------+----------------+--------------+-------------+



# Dropping columns

In [81]:
# EmployeeID duplicates ID after the join, so it is removed here.
# drop() returns a new DataFrame; joined_df itself is left unchanged.
final_df = joined_df.drop("EmployeeID")
final_df.show()

+---+-------+-----------+---------+-----------+----------------+--------------+-------------+
| ID|   Name|   HomeTown|   Salary|JoiningDate|  DepartmentName|        Client|OnboardedDate|
+---+-------+-----------+---------+-----------+----------------+--------------+-------------+
|  1|  Arpit|  Burhanpur| 50000.00| 2018-11-14|Data Engineering|Funding Circle|   2019-01-15|
|  2|   Benu|Bhubaneswar|100000.00| 2018-11-19|Data Engineering|Funding Circle|   2018-11-19|
|  3|Dilsher|   Amritsar|100000.00| 2018-11-19|  Data Analytics|Funding Circle|   2018-11-19|
|  4|  Kiran|  Bengaluru| 50000.00| 2018-11-15|  Data Analytics|Funding Circle|   2018-12-16|
+---+-------+-----------+---------+-----------+----------------+--------------+-------------+



In [82]:
# drop() accepts several column names in a single call.
new_df = final_df.drop("HomeTown", "Client")
new_df.show()

+---+-------+---------+-----------+----------------+-------------+
| ID|   Name|   Salary|JoiningDate|  DepartmentName|OnboardedDate|
+---+-------+---------+-----------+----------------+-------------+
|  1|  Arpit| 50000.00| 2018-11-14|Data Engineering|   2019-01-15|
|  2|   Benu|100000.00| 2018-11-19|Data Engineering|   2018-11-19|
|  3|Dilsher|100000.00| 2018-11-19|  Data Analytics|   2018-11-19|
|  4|  Kiran| 50000.00| 2018-11-15|  Data Analytics|   2018-12-16|
+---+-------+---------+-----------+----------------+-------------+



In [83]:
# DataFrames are immutable:
# transformations do not change a DataFrame in place, so the result must be assigned to a new variable.

# User Defined Function

### UDFs are opaque to the Catalyst optimizer, so they are not ideal; prefer built-in functions where possible

In [84]:
# Re-display the employee data that the UDF example below operates on.
emp_df.show()

+---+-------+-----------+---------+-----------+
| ID|   Name|   HomeTown|   Salary|JoiningDate|
+---+-------+-----------+---------+-----------+
|  1|  Arpit|  Burhanpur| 50000.00| 2018-11-14|
|  2|   Benu|Bhubaneswar|100000.00| 2018-11-19|
|  3|Dilsher|   Amritsar|100000.00| 2018-11-19|
|  4|  Kiran|  Bengaluru| 50000.00| 2018-11-15|
+---+-------+-----------+---------+-----------+



In [85]:
from pyspark.sql.functions import udf

@udf
def fill_state(HomeTown):
    """Map a home town to a state label.

    Returns "Karnataka" when HomeTown is "Bengaluru"; every other value
    yields "Out of State".
    """
    return "Karnataka" if HomeTown == "Bengaluru" else "Out of State"

In [86]:
# Apply the fill_state UDF to each HomeTown value and store the result in a new "State" column.
state_df = emp_df.withColumn("State",fill_state(emp_df.HomeTown))

In [87]:
# Print the employee data with the UDF-derived "State" column.
state_df.show()

+---+-------+-----------+---------+-----------+------------+
| ID|   Name|   HomeTown|   Salary|JoiningDate|       State|
+---+-------+-----------+---------+-----------+------------+
|  1|  Arpit|  Burhanpur| 50000.00| 2018-11-14|Out of State|
|  2|   Benu|Bhubaneswar|100000.00| 2018-11-19|Out of State|
|  3|Dilsher|   Amritsar|100000.00| 2018-11-19|Out of State|
|  4|  Kiran|  Bengaluru| 50000.00| 2018-11-15|   Karnataka|
+---+-------+-----------+---------+-----------+------------+



# When and Otherwise

In [88]:
from pyspark.sql.functions import when

# Derive "State" with built-in column expressions instead of a UDF:
# the first matching when() wins, and otherwise() supplies the fallback label.
emp_dep_final_df = emp_df.withColumn(
    "State",
    when(emp_df.HomeTown == "Bengaluru", "Karnataka")
    .when(emp_df.HomeTown == "Bhubaneswar", "Odisha")
    .otherwise("Out of state"),
)

In [89]:
# Print the employee data with the when/otherwise-derived "State" column.
emp_dep_final_df.show()

+---+-------+-----------+---------+-----------+------------+
| ID|   Name|   HomeTown|   Salary|JoiningDate|       State|
+---+-------+-----------+---------+-----------+------------+
|  1|  Arpit|  Burhanpur| 50000.00| 2018-11-14|Out of state|
|  2|   Benu|Bhubaneswar|100000.00| 2018-11-19|      Odisha|
|  3|Dilsher|   Amritsar|100000.00| 2018-11-19|Out of state|
|  4|  Kiran|  Bengaluru| 50000.00| 2018-11-15|   Karnataka|
+---+-------+-----------+---------+-----------+------------+



# Writing the DF

In [100]:
# Accessing .write only returns a DataFrameWriter; nothing is written to disk
# until a save method such as .save(), .csv() or .parquet() is called on it.
emp_dep_final_df.write

<pyspark.sql.readwriter.DataFrameWriter at 0x211f319a160>

In [105]:
# emp_dep_final_df.write.parquet("C:\data_output")

In [106]:
# emp_dep_final_df.write.format("csv").save("OneDrive\Documents\DATA\emp_dep_csv")

In [107]:
# emp_dep_final_df.write.format("csv").option("header","true").save("OneDrive\Documents\DATA\emp_dep_csv")

In [109]:
# (emp_dep_final_df
#  .write
#  .format("csv")
#  .option("header","true")
#  .mode("overwrite")
#  .option("compression","snappy")
#  .save("OneDrive\Documents\DATA\emp_dep_csv"))

In [None]:
()