#withColumn()

In [0]:
from pyspark.sql.functions import col

# Two sample employees: (id, Name, Salary).
data = [
    (1, "Pranci", 2000),
    (2, "Tanna", 5000),
]
df = spark.createDataFrame(data, ["id", "Name", "Salary"])

# withColumn() creates a new column, or replaces an existing one when the
# given name is already present. Here the existing "Salary" column is replaced
# by the same values passed through cast(), which changes a column's data type.
df1 = df.withColumn("Salary", col("Salary").cast("Integer"))
df1.show()

+---+------+------+
| id|  Name|Salary|
+---+------+------+
|  1|Pranci|  2000|
|  2| Tanna|  5000|
+---+------+------+



In [0]:
# col("Salary") refers to the existing Salary column; multiplying it by 2 and
# storing the result under the same name replaces every salary with its double.
df2 = df1.withColumn(colName="Salary", col=col("Salary") * 2)
df2.show()

+---+------+------+
| id|  Name|Salary|
+---+------+------+
|  1|Pranci|  4000|
|  2| Tanna| 10000|
+---+------+------+



In [0]:
from pyspark.sql.functions import lit

# lit() wraps a constant value as a column, so the new "Country" column holds
# the same value ("India") on every row.
df3 = df2.withColumn(colName="Country", col=lit("India"))
df3.show()

+---+------+------+-------+
| id|  Name|Salary|Country|
+---+------+------+-------+
|  1|Pranci|  4000|  India|
|  2| Tanna| 10000|  India|
+---+------+------+-------+



#withColumnRenamed()

In [0]:
# Same two sample employees as before: (id, Name, Salary).
data = [
    (1, "Pranci", 2000),
    (2, "Tanna", 5000),
]
df = spark.createDataFrame(data, ["id", "Name", "Salary"])

# withColumnRenamed() changes the name of an existing column; the data in the
# column is left as it is.
df1 = df.withColumnRenamed(existing="Salary", new="Salary_amount")
df1.show()

+---+------+-------------+
| id|  Name|Salary_amount|
+---+------+-------------+
|  1|Pranci|         2000|
|  2| Tanna|         5000|
+---+------+-------------+



#StructType() and StructField()

In [0]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

data = [(1, "Pranci"), (2, "Tanna")]

# StructType and StructField specify a DataFrame's schema programmatically:
# each StructField declares one column's name and data type.
schema = StructType([
    StructField(name="id", dataType=IntegerType()),
    StructField(name="name", dataType=StringType()),
])

# Pass the StructType itself so the declared column types are actually
# applied; a plain list of column names would leave the types to inference
# and the schema built above would go unused.
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+---+------+
| id|  name|
+---+------+
|  1|Pranci|
|  2| Tanna|
+---+------+



#ArrayType()

In [0]:
# Import only the type classes this cell uses instead of a wildcard import,
# so it is clear where each name comes from.
from pyspark.sql.types import ArrayType, IntegerType, StringType, StructField, StructType

data = [
    ("Pranci", [1, 2]),
    ("Tanna", [4, 5]),
    ("Aakrati", [7, 8]),
]

# ArrayType(IntegerType()) declares "number" as a column whose values are
# arrays of integers; "id" is a plain string column.
schema = StructType([
    StructField("id", StringType()),
    StructField("number", ArrayType(IntegerType())),
])

df = spark.createDataFrame(data, schema)
df.show()

+-------+------+
|     id|number|
+-------+------+
| Pranci|[1, 2]|
|  Tanna|[4, 5]|
|Aakrati|[7, 8]|
+-------+------+



#explode()

In [0]:
from pyspark.sql.functions import col, explode

# Each employee carries a list of skills in a single array column.
data = [
    (1, "Pranci", ["Pyspark", "Azure"]),
    (2, "Tanna", ["BigData", "Python"]),
]
df = spark.createDataFrame(data, ["id", "Name", "Skill"])

# explode() produces a new row for each element of the given array column;
# the original "Skill" array is kept alongside the new "Skills" column.
df1 = df.withColumn(colName="Skills", col=explode(col("Skill")))
df1.show()

+---+------+-----------------+-------+
| id|  Name|            Skill| Skills|
+---+------+-----------------+-------+
|  1|Pranci| [Pyspark, Azure]|Pyspark|
|  1|Pranci| [Pyspark, Azure]|  Azure|
|  2| Tanna|[BigData, Python]|BigData|
|  2| Tanna|[BigData, Python]| Python|
+---+------+-----------------+-------+



#split()

In [0]:
# NOTE(review): nothing from pyspark.sql.types is referenced in this cell; the
# import is retained in case later cells rely on it.
from pyspark.sql.types import *
from pyspark.sql.functions import col, split

# The skills arrive as one comma-separated string per employee.
data = [
    (1, 'Pranci', "Pyspark,Azure"),
    (2, 'Tanna', "BigData,Python"),
]
schema = ["id", "name", "skill"]
df = spark.createDataFrame(data=data, schema=schema)

# split() returns an array column after splitting the string column on the
# delimiter; getItem(i) then picks the i-th element of that array.
skill_parts = split(col("skill"), ",")

# NOTE(review): "Primary_Skillls" (three l's) looks like a typo; it is kept
# unchanged so the column name matches the recorded output.
df1 = df.withColumn("Primary_Skillls", skill_parts.getItem(0))
df2 = df1.withColumn("Secondary_Skills", skill_parts.getItem(1))
df2.show()

+---+------+--------------+---------------+----------------+
| id|  name|         skill|Primary_Skillls|Secondary_Skills|
+---+------+--------------+---------------+----------------+
|  1|Pranci| Pyspark,Azure|        Pyspark|           Azure|
|  2| Tanna|BigData,Python|        BigData|          Python|
+---+------+--------------+---------------+----------------+



#array()

In [0]:
from pyspark.sql.functions import array, col

# Primary and secondary skill start out as two separate string columns.
data = [
    (1, "Pranci", "Pyspark", "azure"),
    (2, "Tanna", "BigData", "Python"),
]
schema = ["id", "name", "primaryskill", "secondaryskill"]
df = spark.createDataFrame(data=data, schema=schema)

# array() creates a new array column by combining the values of several
# existing columns, row by row.
df1 = df.withColumn(
    colName="skillArray",
    col=array(col("primaryskill"), col("secondaryskill")),
)
df1.show()

+---+------+------------+--------------+-----------------+
| id|  name|primaryskill|secondaryskill|       skillArray|
+---+------+------------+--------------+-----------------+
|  1|Pranci|     Pyspark|         azure| [Pyspark, azure]|
|  2| Tanna|     BigData|        Python|[BigData, Python]|
+---+------+------------+--------------+-----------------+



#array_contains()

In [0]:
# Import col here as well, so this cell does not depend on an import made in
# an earlier cell.
from pyspark.sql.functions import array_contains, col

data = [
    (1, "Pranci", ["Pyspark", "azure"]),
    (2, "Tanna", ["BigData", "Python"]),
]
schema = ["id", "name", "skill"]
df = spark.createDataFrame(data, schema)

# array_contains() checks whether the given value is present in the array
# column: True if it is, False otherwise. Neither sample row lists "java".
df1 = df.withColumn("HasJavaSkill", array_contains(col("skill"), "java"))
df1.show()

+---+------+-----------------+------------+
| id|  name|            skill|HasJavaSkill|
+---+------+-----------------+------------+
|  1|Pranci| [Pyspark, azure]|       false|
|  2| Tanna|[BigData, Python]|       false|
+---+------+-----------------+------------+



#MapType()

In [0]:
# Method 1: pass plain Python dicts and give only the column names.
data = [
    ("Pranci", {"hair": "black", "eye": "brown"}),
    ("Tanna", {"hair": "black", "eye": "blue"}),
]
schema = ["name", "properties"]
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+------+--------------------+
|  name|          properties|
+------+--------------------+
|Pranci|{eye -> brown, ha...|
| Tanna|{eye -> blue, hai...|
+------+--------------------+



In [0]:
# Method 2: declare the map column explicitly with MapType in a StructType.
from pyspark.sql.types import StructField, StructType, StringType, MapType

data = [
    ("Pranci", {"hair": "black", "eye": "brown"}),
    ("Tanna", {"hair": "black", "eye": "blue"}),
]

# "properties" is declared as a map with string keys and string values.
properties_type = MapType(StringType(), StringType())
schema = StructType([
    StructField("name", StringType()),
    StructField("properties", properties_type),
])

df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)  # truncate=False prints the full map contents

+------+-----------------------------+
|name  |properties                   |
+------+-----------------------------+
|Pranci|{eye -> brown, hair -> black}|
|Tanna |{eye -> blue, hair -> black} |
+------+-----------------------------+



In [0]:
# Access MapType elements: look up a key in the map column and store the
# matching value in a column of its own.
df1 = df.withColumn(colName="hair", col=df.properties.getItem("hair"))
df2 = df1.withColumn(colName="eye", col=df.properties.getItem("eye"))
df2.show()

+------+--------------------+-----+-----+
|  name|          properties| hair|  eye|
+------+--------------------+-----+-----+
|Pranci|{eye -> brown, ha...|black|brown|
| Tanna|{eye -> blue, hai...|black| blue|
+------+--------------------+-----+-----+



# Functions that work with MapType columns

In [0]:
from pyspark.sql.types import StructField, StructType, StringType, MapType

# Sample data for the map-function examples: a name plus a map of properties.
data = [
    ("Pranci", {"hair": "black", "eye": "brown"}),
    ("Tanna", {"hair": "black", "eye": "blue"}),
]

# Schema: a string column and a map column with string keys and values.
name_field = StructField("name", StringType())
properties_field = StructField("properties", MapType(StringType(), StringType()))
schema = StructType([name_field, properties_field])

df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)

+------+-----------------------------+
|name  |properties                   |
+------+-----------------------------+
|Pranci|{eye -> brown, hair -> black}|
|Tanna |{eye -> blue, hair -> black} |
+------+-----------------------------+



In [0]:
# Function 1: explode()
# Applied to the map column, explode() is selected next to the original
# columns; the recorded output shows one row per map entry, with the entry in
# "key" and "value" columns.
from pyspark.sql.functions import explode

df1 = df.select(
    "name",
    "properties",
    explode(df["properties"]),
)
df1.show(truncate=False)

+------+-----------------------------+----+-----+
|name  |properties                   |key |value|
+------+-----------------------------+----+-----+
|Pranci|{eye -> brown, hair -> black}|eye |brown|
|Pranci|{eye -> brown, hair -> black}|hair|black|
|Tanna |{eye -> blue, hair -> black} |eye |blue |
|Tanna |{eye -> blue, hair -> black} |hair|black|
+------+-----------------------------+----+-----+



In [0]:
# Function 2: map_keys()
# Adds a "keys" column built from the keys of the map column.
from pyspark.sql.functions import map_keys

df1 = df.withColumn(colName="keys", col=map_keys(df["properties"]))
df1.show(truncate=False)

+------+-----------------------------+-----------+
|name  |properties                   |keys       |
+------+-----------------------------+-----------+
|Pranci|{eye -> brown, hair -> black}|[eye, hair]|
|Tanna |{eye -> blue, hair -> black} |[eye, hair]|
+------+-----------------------------+-----------+



In [0]:
# Function 3: map_values()
# Adds a "values" column built from the values of the map column.
from pyspark.sql.functions import map_values

df1 = df.withColumn(colName="values", col=map_values(df["properties"]))
df1.show(truncate=False)

+------+-----------------------------+--------------+
|name  |properties                   |values        |
+------+-----------------------------+--------------+
|Pranci|{eye -> brown, hair -> black}|[brown, black]|
|Tanna |{eye -> blue, hair -> black} |[blue, black] |
+------+-----------------------------+--------------+



#Row() Class

In [0]:
# Method 1: build a Row from positional values and read them back by index.
from pyspark.sql import Row

row = Row("Pranci", 2000)
print(f"{row[0]} {row[1]}")

Pranci 2000


In [0]:
# Method 2: build a Row with named arguments and read the fields by name.
from pyspark.sql import Row

row = Row(name="Tanna", salary=2000)
print(f"{row.name} {row.salary}")

Tanna 2000


In [0]:
# Method 2 (continued): several named Rows collected in a list become the
# rows of a DataFrame; the column names come from the Row field names.
from pyspark.sql import Row

row1 = Row(name="Pranci", salary=2000)
row2 = Row(name="Tanna", salary=2000)
data = [row1, row2]

df = spark.createDataFrame(data)
df.show()

+------+------+
|  name|salary|
+------+------+
|Pranci|  2000|
| Tanna|  2000|
+------+------+



In [0]:
# Method 3: use Row as a class-like factory. Row(...) called with field names
# gives a template; calling the template with values gives concrete rows.
from pyspark.sql import Row

# NOTE(review): the second field is named "Salary" but the rows below pass
# surname-like strings for it; confirm the intended field name. Only the
# "name" field is read in this cell.
Person = Row("name", "Salary")
p1 = Person("Pranci", "Agrahari")
p2 = Person("Tanna", "Gupta")
print(f"{p1.name} {p2.name}")

Pranci Tanna


In [0]:
# Method 4: Rows can be nested; the inner Row becomes a struct-valued
# "prop" column in the resulting DataFrame.
from pyspark.sql import Row

data = [
    Row(name="Pranci", prop=Row(hair="Black", eye="Blue")),
    Row(name="Tanna", prop=Row(hair="Black", eye="Brown")),
]
df = spark.createDataFrame(data)
df.show()

+------+--------------+
|  name|          prop|
+------+--------------+
|Pranci| {Black, Blue}|
| Tanna|{Black, Brown}|
+------+--------------+



# Column: lit() and accessing columns

In [0]:
from pyspark.sql.functions import lit

# Sample people: (name, gender, salary).
data = [
    ("Pranci", "Female", 2000),
    ("Prashant", "Male", 4000),
]
schema = ["name", "gender", "salary"]
df = spark.createDataFrame(data=data, schema=schema)
df.show()

# lit() adds a new column that holds the same constant value on every row.
df1 = df.withColumn(colName="country", col=lit("India"))
df1.show()

+--------+------+------+
|    name|gender|salary|
+--------+------+------+
|  Pranci|Female|  2000|
|Prashant|  Male|  4000|
+--------+------+------+

+--------+------+------+-------+
|    name|gender|salary|country|
+--------+------+------+-------+
|  Pranci|Female|  2000|  India|
|Prashant|  Male|  4000|  India|
+--------+------+------+-------+



In [0]:
# Accessing a column from a DataFrame
# Method 1: attribute access on the DataFrame.
name_col = df1.name
df1.select(name_col).show()

+--------+
|    name|
+--------+
|  Pranci|
|Prashant|
+--------+



In [0]:
# Method 2: index the DataFrame with the column name.
salary_col = df1["salary"]
df1.select(salary_col).show()

+------+
|salary|
+------+
|  2000|
|  4000|
+------+



In [0]:
# Method 3: refer to the column by name through the col() function.
from pyspark.sql.functions import col

gender_col = col("gender")
df1.select(gender_col).show()

+------+
|gender|
+------+
|Female|
|  Male|
+------+

