## DataFrame transformations, part 2

Topics are as follows:

* Aliasing columns
* Filtering rows with `filter` / `where`
* Literal values with `lit`
* Adding & removing columns
* Renaming columns
* Casting data types

In [2]:

import os


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import sys

# Point both the PySpark worker and driver interpreters at the Python
# executable that is running this kernel.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [3]:
# Build (or reuse) a SparkSession running in local mode ("local[*]").
# The MySQL connector jar is added to the driver classpath.
# NOTE(review): none of the cells shown here opens a JDBC connection, so the
# jar is presumably needed elsewhere -- confirm before removing it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("testing")
    .config("spark.driver.extraClassPath", "C:\\my_sql_jar\\mysql-connector-java-8.0.26.jar")
    .getOrCreate()
)
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000130C53A7C10>


In [4]:
# Load the employee CSV into a DataFrame: the first row is treated as the
# header, column types are inferred from the data, and the parser runs in
# PERMISSIVE mode.
employee_df = (
    spark.read
    .format("csv")
    .options(header="true", inferschema="true", mode="PERMISSIVE")
    .load("C:\\Users\\HP\\Pyspark\\employee.csv\\")
)
employee_df.show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+



In [6]:
# Register the DataFrame as a temporary view named "emp_tbl"
# (replacing any existing view with that name).
employee_df.createOrReplaceTempView(name="emp_tbl")

In [7]:
# Aliasing: show "id" under the name "employee_id", next to "name" and "age".
employee_df.select(
    employee_df["id"].alias("employee_id"),
    "name",
    "age",
).show()

+-----------+--------+---+
|employee_id|    name|age|
+-----------+--------+---+
|          1|  Manish| 26|
|          2|  Nikita| 23|
|          3|  Pritam| 22|
|          4|Prantosh| 17|
|          5|  Vikash| 31|
+-----------+--------+---+



In [9]:
# filter(): keep only the rows whose salary is above 150000.
employee_df.filter(employee_df["salary"] > 150000).show()

+---+--------+---+------+-------+--------+
| id|    name|age|salary|address| nominee|
+---+--------+---+------+-------+--------+
|  4|Prantosh| 17|200000|Kolkata|   India|
|  5|  Vikash| 31|300000|   null|nominee5|
+---+--------+---+------+-------+--------+



In [10]:
# where() is an alias of filter(); the same salary condition is written here
# as a SQL expression string.
employee_df.where("salary > 150000").show()

+---+--------+---+------+-------+--------+
| id|    name|age|salary|address| nominee|
+---+--------+---+------+-------+--------+
|  4|Prantosh| 17|200000|Kolkata|   India|
|  5|  Vikash| 31|300000|   null|nominee5|
+---+--------+---+------+-------+--------+



In [11]:
# Both conditions must hold: salary above 150000 AND age below 18.
# Chaining where() calls applies the predicates one after the other.
employee_df.where(col("salary") > 150000).where(col("age") < 18).show()

+---+--------+---+------+-------+-------+
| id|    name|age|salary|address|nominee|
+---+--------+---+------+-------+-------+
|  4|Prantosh| 17|200000|Kolkata|  India|
+---+--------+---+------+-------+-------+



In [15]:
# Equality test against a string value: keep the row(s) whose name is "Nikita".
employee_df.where(employee_df["name"] == "Nikita").show()

+---+------+---+------+------------+--------+
| id|  name|age|salary|     address| nominee|
+---+------+---+------+------------+--------+
|  2|Nikita| 23|100000|uttarpradesh|nominee2|
+---+------+---+------+------------+--------+



In [17]:
# lit() turns a Python value into a constant column. Here every row gets a
# "last_name" column holding "sharma", placed after all existing columns.
employee_df.select(
    "*",
    lit("sharma").alias("last_name"),
).show()

+---+--------+---+------+------------+--------+---------+
| id|    name|age|salary|     address| nominee|last_name|
+---+--------+---+------+------------+--------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1|   sharma|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|   sharma|
|  3|  Pritam| 22|150000|   Bangalore|   India|   sharma|
|  4|Prantosh| 17|200000|     Kolkata|   India|   sharma|
|  5|  Vikash| 31|300000|        null|nominee5|   sharma|
+---+--------+---+------+------------+--------+---------+



In [18]:
# withColumn() adds a column: "sur_name" holds the constant "idk" in every row.
employee_df.withColumn(colName="sur_name", col=lit("idk")).show()

+---+--------+---+------+------------+--------+--------+
| id|    name|age|salary|     address| nominee|sur_name|
+---+--------+---+------+------------+--------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|     idk|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|     idk|
|  3|  Pritam| 22|150000|   Bangalore|   India|     idk|
|  4|Prantosh| 17|200000|     Kolkata|   India|     idk|
|  5|  Vikash| 31|300000|        null|nominee5|     idk|
+---+--------+---+------+------------+--------+--------+



In [19]:
# withColumnRenamed(): "id" is shown as "employee_id"; the other columns keep their names.
employee_df.withColumnRenamed(existing="id", new="employee_id").show()

+-----------+--------+---+------+------------+--------+
|employee_id|    name|age|salary|     address| nominee|
+-----------+--------+---+------+------------+--------+
|          1|  Manish| 26| 75000|       bihar|nominee1|
|          2|  Nikita| 23|100000|uttarpradesh|nominee2|
|          3|  Pritam| 22|150000|   Bangalore|   India|
|          4|Prantosh| 17|200000|     Kolkata|   India|
|          5|  Vikash| 31|300000|        null|nominee5|
+-----------+--------+---+------+------------+--------+



In [21]:
## Casting columns
# Print the current schema of employee_df (column names, data types and
# nullability) so it can be compared with the schema after casting.
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [22]:
# Cast the "id" column to float.
# The cast DataFrame is built once and reused, so the rows and the schema
# printed below are guaranteed to come from the same transformation.
# withColumn() returns a new DataFrame; employee_df itself is not modified.
employee_id_float_df = employee_df.withColumn("id", col("id").cast("float"))

employee_id_float_df.show()

employee_id_float_df.printSchema()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|1.0|  Manish| 26| 75000|       bihar|nominee1|
|2.0|  Nikita| 23|100000|uttarpradesh|nominee2|
|3.0|  Pritam| 22|150000|   Bangalore|   India|
|4.0|Prantosh| 17|200000|     Kolkata|   India|
|5.0|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+

root
 |-- id: float (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [23]:
# Cast "id" to float and "salary" to long.
# The two casts are chained once into a single DataFrame that is reused for
# both outputs, instead of repeating the whole chain for show() and
# printSchema(). employee_df itself is not modified.
employee_recast_df = (
    employee_df
    .withColumn("id", col("id").cast("float"))
    .withColumn("salary", col("salary").cast("long"))
)

# data
employee_recast_df.show()
# schema
employee_recast_df.printSchema()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|1.0|  Manish| 26| 75000|       bihar|nominee1|
|2.0|  Nikita| 23|100000|uttarpradesh|nominee2|
|3.0|  Pritam| 22|150000|   Bangalore|   India|
|4.0|Prantosh| 17|200000|     Kolkata|   India|
|5.0|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+

root
 |-- id: float (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: long (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)



In [24]:
# Dropping columns: drop() returns a new DataFrame without the named columns.

# Before: every column of employee_df
employee_df.show()

# After: the same rows with "id" and "nominee" removed
employee_df.drop("id").drop("nominee").show()

+---+--------+---+------+------------+--------+
| id|    name|age|salary|     address| nominee|
+---+--------+---+------+------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|
|  3|  Pritam| 22|150000|   Bangalore|   India|
|  4|Prantosh| 17|200000|     Kolkata|   India|
|  5|  Vikash| 31|300000|        null|nominee5|
+---+--------+---+------+------------+--------+

+--------+---+------+------------+
|    name|age|salary|     address|
+--------+---+------+------------+
|  Manish| 26| 75000|       bihar|
|  Nikita| 23|100000|uttarpradesh|
|  Pritam| 22|150000|   Bangalore|
|Prantosh| 17|200000|     Kolkata|
|  Vikash| 31|300000|        null|
+--------+---+------+------------+

