### PySpark Broadcast Variables

Broadcast variables not sent to excutors with sc.broadcast(variable) call instead, they will be sent to executors when they are first used

In [0]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('SparkByExample').getOrCreate()

In [0]:
# PySpark RDD Broadcast variable example

states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)
data = [('james','smith', 'usa','CA'), ('Michael','Rose','USA','NY'),('Robert','William', 'USA','CA'),('Maria','Jones','USA','FL')]
rdd = spark.sparkContext.parallelize(data)

def state_convert(code):
    return broadcastStates.value[code]

result = rdd.map(lambda x:(x[0], x[1], x[2], state_convert(x[3]))).collect()
print(result)

[('james', 'smith', 'usa', 'California'), ('Michael', 'Rose', 'USA', 'New York'), ('Robert', 'William', 'USA', 'California'), ('Maria', 'Jones', 'USA', 'Florida')]


In [0]:
# PySpark DataFrame Broadcast variable example
from pyspark.sql.functions import col, when
states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)

data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]

columns = ['firstname', 'lastname', 'country', 'state']
df = spark.createDataFrame(data = data, schema=columns)
df.printSchema()
def state_convert(code):
    return broadcastStates.value[code]

result = df.withColumn('convertedstate', when(col('state').isin(list(states.keys())),lit(states[col('state')])).otherwise(col('state')))


root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)



[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-503504833156609>:18[0m
[1;32m     15[0m [38;5;28;01mdef[39;00m [38;5;21mstate_convert[39m(code):
[1;32m     16[0m     [38;5;28;01mreturn[39;00m broadcastStates[38;5;241m.[39mvalue[code]
[0;32m---> 18[0m result [38;5;241m=[39m df[38;5;241m.[39mwithColumn([38;5;124m'[39m[38;5;124mconvertedstate[39m[38;5;124m'[39m, when(col([38;5;124m'[39m[38;5;124mstate[39m[38;5;124m'[39m)[38;5;241m.[39misin([38;5;28mlist[39m(states[38;5;241m.[39mkeys())),lit(states[col([38;5;124m'[39m[38;5;124mstate[39m[38;5;124m'[39m)]))[38;5;241m.[39motherwise(col([38;5;124m'[39m[38;5;124mstate[39m[38;5;124m'[39m)))

[0;31mNameError[0m: name 'lit' is not defined

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit

# Create a SparkSession (you would typically create a SparkSession)

data = [("James", "Smith", "USA", "CA"),
        ("Michael", "Rose", "USA", "NY"),
        ("Robert", "Williams", "USA", "CA"),
        ("Maria", "Jones", "USA", "FL")]

columns = ['firstname', 'lastname', 'country', 'state']
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()

# Define the state mapping using PySpark's when function
states= {
    "NY": "New York",
    "CA": "California",
    "FL": "Florida"
}

# Use when function to convert state codes to state names
# result = df.withColumn("Converted_state", when(col("state").isin(list(state_mapping.keys())), lit(state_mapping[col("state")])).otherwise("Unknown"))

result = df.withColumn('Converted_state', when(col('state').isin(list(states.keys())), lit(states[col('state')])).otherwise(col('state')))


# Show the result
result.show()


root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)



[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-503504833156610>:25[0m
[1;32m     22[0m # Use when function to convert state codes to state names
[1;32m     23[0m result = df.withColumn("Converted_state", when(col("state").isin(list(state_mapping.keys())), lit(state_mapping[col("state")])).otherwise("Unknown"))
[0;32m---> 25[0m # Show the result
[1;32m     26[0m result.show()

[0;31mTypeError[0m: unhashable type: 'Column'

In [0]:
# Create a SparkSession (you would typically create a SparkSession)

data = [("James", "Smith", "USA", "CA"),
        ("Michael", "Rose", "USA", "NY"),
        ("Robert", "Williams", "USA", "CA"),
        ("Maria", "Jones", "USA", "FL")]

columns = ['firstname', 'lastname', 'country', 'state']
df = spark.createDataFrame(data=data, schema=columns)
df.printSchema()

# Define the state mapping using PySpark's when function
states = {"NY": "New York", "CA": "California", "FL": "Florida"}

# Use when function to convert state codes to state names
result = df.rdd.map(lambda x: (x[0],x[1],x[2],states[x[3]]))

# Show the result
result.collect()

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)

Out[2]: [('James', 'Smith', 'USA', 'California'),
 ('Michael', 'Rose', 'USA', 'New York'),
 ('Robert', 'Williams', 'USA', 'California'),
 ('Maria', 'Jones', 'USA', 'Florida')]

### PySpark Accumulator with Example

In [0]:
#Creating Accumulator Variable

accum = spark.sparkContext.accumulator(0)
rdd = spark.sparkContext.parallelize([1,2,3,4,5])
rdd.foreach(lambda x: accum.add(x))
print(accum.value)

#with function
accum1 = spark.sparkContext.accumulator(50)
def sum_accum(x):
    global accum1
    accum1 += x
rdd = spark.sparkContext.parallelize([1,2,3,4,5])
rdd.foreach(sum_accum)
print(accum1.value)

15
65


### Convert PySpark RDD to DataFrame

#### Using rdd.toDF() function

In [0]:
dept = [('Finance',10),('Marketing',20),('Sales',30),('IT', 40)]
rdd = spark.sparkContext.parallelize(dept)
df=rdd.toDF()
df.show()
df.printSchema()
df2=rdd.toDF(['dept_name','dept_id'])
df2.printSchema()
df2.show()

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



#### Using PySpark createDataFrame() function

In [0]:
df = spark.createDataFrame(rdd, schema=['dept_name','dept_id'])
df.show()
df.printSchema()

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)



#### Using createDataFrame() with StructType schema

In [0]:
from pyspark.sql.types import StructType, StructField, StringType
deptSchema = StructType([
    StructField('dept_name', StringType(), True),
    StructField('dept_id', StringType(), True)
])

deptDF1 = spark.createDataFrame(rdd, schema=deptSchema)
deptDF1.printSchema()
deptDF1.show(truncate=False)


root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



### PySpark StructType & StructField Explained with Examples

**StructType** is a collection of **StructFields** that defines column name, column data type, bollena to specify if the field can be nullable or not and metadata
* **StructType**: It Defines the Structure of DataFrame. It is a collection of list of StructField Objects.
* **StructField** : It defines the metadata of the Dataframe column. It defines the columns which includes column name(string), column type(data type), nullable column(Boolean) and metadata(Metadata). Using StructField, we can also add nested struct schema, ArrayType for arrays , and MapType for key-value pairs .

In [0]:
data = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

schema = StructType([ \
       StructField('firstname', StringType(), True), \
       StructField('middlename', StringType(), True), \
       StructField('lastname', StringType(), True), \
       StructField('id', StringType(), True), \
       StructField('gender', StringType(), True), \
       StructField('salary', StringType(), True)
    ])

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



#### Defining Nested StructType object struct

In [0]:
structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]

structureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True), 
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', StringType(), True) 
])

df2 = spark.createDataFrame(data=structureData, schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

+--------------------+-----+------+------+
|name                |id   |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3100  |
|{Michael, Rose, }   |40288|M     |4300  |
|{Robert, , Williams}|42114|M     |1400  |
|{Maria, Anne, Jones}|39192|F     |5500  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+



#### Using SQL ArrayType and MapType
SQL StructType also supports ArrayType and MapType to define the DataFrame columns for array and map collections respectively.

In [0]:
from pyspark.sql.types import MapType
arrayStructureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('hobbies', ArrayType(StringType()), True),
    StructField('properties', MapType(StringType(), StringType()),True)
])

### Convert PySpark DataFrame to Pandas

In [0]:
data = [("James","","Smith","36636","M",60000),
        ("Michael","Rose","","40288","M",70000),
        ("Robert","","Williams","42114","",400000),
        ("Maria","Anne","Jones","39192","F",500000),
        ("Jen","Mary","Brown","","F",0)]

columns = ["first_name","middle_name","last_name","dob","gender","salary"]
pysparkDF = spark.createDataFrame(data = data, schema = columns)
pysparkDF.printSchema()
pysparkDF.show(truncate=False)

pandasDF = pysparkDF.toPandas()
print(pandasDF)


root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|dob  |gender|salary|
+----------+-----------+---------+-----+------+------+
|James     |           |Smith    |36636|M     |60000 |
|Michael   |Rose       |         |40288|M     |70000 |
|Robert    |           |Williams |42114|      |400000|
|Maria     |Anne       |Jones    |39192|F     |500000|
|Jen       |Mary       |Brown    |     |F     |0     |
+----------+-----------+---------+-----+------+------+

  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3      Maria        Anne     Jones 

#### Convert Spark Nested Struct DataFrame to Pandas

In [0]:

# Nested structure elements
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
dataStruct = [(("James","","Smith"),"36636","M","3000"), \
      (("Michael","Rose",""),"40288","M","4000"), \
      (("Robert","","Williams"),"42114","M","4000"), \
      (("Maria","Anne","Jones"),"39192","F","4000"), \
      (("Jen","Mary","Brown"),"","F","-1") \
]

schemaStruct = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
          StructField('dob', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', StringType(), True)
         ])
df = spark.createDataFrame(data=dataStruct, schema = schemaStruct)
df.printSchema()

pandasDF2 = df.toPandas()
print(pandasDF2)


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

                                                name    dob gender salary
0  {'firstname': 'James', 'middlename': '', 'last...  36636      M   3000
1  {'firstname': 'Michael', 'middlename': 'Rose',...  40288      M   4000
2  {'firstname': 'Robert', 'middlename': '', 'las...  42114      M   4000
3  {'firstname': 'Maria', 'middlename': 'Anne', '...  39192      F   4000
4  {'firstname': 'Jen', 'middlename': 'Mary', 'la...             F     -1


### PySpark show() – Display DataFrame Contents in Table

Pyspark Dataframe show() is used to display the content of the Dataframe in a table row and column format. By efault it shows only 20 rows, and the column values are trubcated at 20 characters

syntax :
`
def show(self, n=20, truncate=True, vertical=False):
`

In [0]:
#default - displays 2o rows and 20 characters from column value
df.show()

#display full column content
df.show(truncate=False)

#display 2 rows and full column contents
df.show(2, truncate=False)

#display 2 rows and column values 25 characters
df.show(2, truncate=25)

#display dataframe rows & columns vertically
df.show(n=3, truncate=25, vertical=True)

+--------------------+-----+------+------+
|                name|  dob|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3000|
|   {Michael, Rose, }|40288|     M|  4000|
|{Robert, , Williams}|42114|     M|  4000|
|{Maria, Anne, Jones}|39192|     F|  4000|
|  {Jen, Mary, Brown}|     |     F|    -1|
+--------------------+-----+------+------+

+--------------------+-----+------+------+
|name                |dob  |gender|salary|
+--------------------+-----+------+------+
|{James, , Smith}    |36636|M     |3000  |
|{Michael, Rose, }   |40288|M     |4000  |
|{Robert, , Williams}|42114|M     |4000  |
|{Maria, Anne, Jones}|39192|F     |4000  |
|{Jen, Mary, Brown}  |     |F     |-1    |
+--------------------+-----+------+------+

+-----------------+-----+------+------+
|name             |dob  |gender|salary|
+-----------------+-----+------+------+
|{James, , Smith} |36636|M     |3000  |
|{Michael, Rose, }|40288|M     |4000  |
+-----------------+-----

In [0]:
columns = ["Seqno","Quote"]
data = [("1", "Be the change that you wish to see in the world"),
    ("2", "Everyone thinks of changing the world, but no one thinks of changing himself."),
    ("3", "The purpose of our lives is to be happy."),
    ("4", "Be cool.")]
df = spark.createDataFrame(data,columns)
df.show()
df.show(truncate=False) #Display full column contents
df.show(2, truncate=False) # Display 2 rows & column values 25 characters
df.show(n=3,truncate=25,vertical=True) ## Display DataFrame rows & columns vertically


+-----+--------------------+
|Seqno|               Quote|
+-----+--------------------+
|    1|Be the change tha...|
|    2|Everyone thinks o...|
|    3|The purpose of ou...|
|    4|            Be cool.|
+-----+--------------------+

+-----+-----------------------------------------------------------------------------+
|Seqno|Quote                                                                        |
+-----+-----------------------------------------------------------------------------+
|1    |Be the change that you wish to see in the world                              |
|2    |Everyone thinks of changing the world, but no one thinks of changing himself.|
|3    |The purpose of our lives is to be happy.                                     |
|4    |Be cool.                                                                     |
+-----+-----------------------------------------------------------------------------+

+-----+-----------------------------------------------------------------------

### PySpark Column Class | Operators & Functions

In [0]:

from pyspark.sql.functions import lit
colObj = lit("sparkbyexamples.com")


In [0]:
data = [('James',23),('Ann', 40)]
df = spark.createDataFrame(data).toDF('name.fname','gender')
df.printSchema()

df.select(df.gender).show()
df.select(df["`name.fname`"]).show()


root
 |-- name.fname: string (nullable = true)
 |-- gender: long (nullable = true)

+------+
|gender|
+------+
|    23|
|    40|
+------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+



In [0]:

data=[("James",23),("Ann",40)]
df=spark.createDataFrame(data).toDF("name.fname","gender")
df.printSchema()
df.select(df.gender).show()
df.select(df["gender"]).show()
df.select(df["`name.fname`"]).show()

#Using SQL col() function
from pyspark.sql.functions import col
df.select(col("gender")).show()
#Accessing column name with dot (with backticks)
df.select(col("`name.fname`")).show()


root
 |-- name.fname: string (nullable = true)
 |-- gender: long (nullable = true)

+------+
|gender|
+------+
|    23|
|    40|
+------+

+------+
|gender|
+------+
|    23|
|    40|
+------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+

+------+
|gender|
+------+
|    23|
|    40|
+------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
+----------+



In [0]:

#Create DataFrame with struct using Row class
from pyspark.sql import Row
data=[Row(name="James",prop=Row(hair="black",eye="blue")),
      Row(name="Ann",prop=Row(hair="grey",eye="black"))]
df=spark.createDataFrame(data)
df.printSchema()
df.select(df.prop.hair).show()
df.select(df["prop.hair"]).show()
df.select(col("prop.hair")).show()
df.select(col("prop.*")).show()


root
 |-- name: string (nullable = true)
 |-- prop: struct (nullable = true)
 |    |-- hair: string (nullable = true)
 |    |-- eye: string (nullable = true)

+---------+
|prop.hair|
+---------+
|    black|
|     grey|
+---------+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+
| hair|
+-----+
|black|
| grey|
+-----+

+-----+-----+
| hair|  eye|
+-----+-----+
|black| blue|
| grey|black|
+-----+-----+



### PySpark Column Operators

In [0]:

data=[(100,2,1),(200,3,4),(300,4,4)]
df=spark.createDataFrame(data).toDF("col1","col2","col3")

#Arthmetic operations
df.select(df.col1 + df.col2).show()
df.select(df.col1 - df.col2).show() 
df.select(df.col1 * df.col2).show()
df.select(df.col1 / df.col2).show()
df.select(df.col1 % df.col2).show()

df.select(df.col2 > df.col3).show()
df.select(df.col2 < df.col3).show()
df.select(df.col2 == df.col3).show()


+-------------+
|(col1 + col2)|
+-------------+
|          102|
|          203|
|          304|
+-------------+

+-------------+
|(col1 - col2)|
+-------------+
|           98|
|          197|
|          296|
+-------------+

+-------------+
|(col1 * col2)|
+-------------+
|          200|
|          600|
|         1200|
+-------------+

+-----------------+
|    (col1 / col2)|
+-----------------+
|             50.0|
|66.66666666666667|
|             75.0|
+-----------------+

+-------------+
|(col1 % col2)|
+-------------+
|            0|
|            2|
|            0|
+-------------+

+-------------+
|(col2 > col3)|
+-------------+
|         true|
|        false|
|        false|
+-------------+

+-------------+
|(col2 < col3)|
+-------------+
|        false|
|         true|
|        false|
+-------------+

+-------------+
|(col2 = col3)|
+-------------+
|        false|
|        false|
|         true|
+-------------+



#### alias() – Set’s name to Column

In [0]:

data=[("James","Bond","100",None),
      ("Ann","Varsa","200",'F'),
      ("Tom Cruise","XXX","400",''),
      ("Tom Brand",None,"400",'M')] 
columns=["fname","lname","id","gender"]
df=spark.createDataFrame(data,columns)



#alias
from pyspark.sql.functions import expr
df.select(df.fname.alias("first_name"), \
          df.lname.alias("last_name")
   ).show()

#Another example
df.select(expr(" fname ||','|| lname").alias("fullName") \
   ).show()


+----------+---------+
|first_name|last_name|
+----------+---------+
|     James|     Bond|
|       Ann|    Varsa|
|Tom Cruise|      XXX|
| Tom Brand|     null|
+----------+---------+

+--------------+
|      fullName|
+--------------+
|    James,Bond|
|     Ann,Varsa|
|Tom Cruise,XXX|
|          null|
+--------------+



#### asc() & desc() – Sort the DataFrame columns by Ascending or Descending order.

In [0]:

#asc, desc to sort ascending and descending order repsectively.
df.sort(df.fname.asc()).show()
df.sort(df.fname.desc()).show()


+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  null|
| Tom Brand| null|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
+----------+-----+---+------+



####cast() & astype() – Used to convert the data Type.

In [0]:

#cast
df.select(df.fname,df.id).printSchema()
df.select(df.fname,df.id.cast("int")).printSchema()


root
 |-- fname: string (nullable = true)
 |-- id: string (nullable = true)

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



#### between() – Returns a Boolean expression when a column values in between lower and upper bound.

In [0]:

#between
df.filter(df.id.between(100,300)).show()


+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  null|
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



#### contains() – Checks if a DataFrame column value contains a a value specified in this function.

In [0]:
df.filter(df.fname.contains('Cruise')).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



#### startswith() & endswith() – Checks if the value of the DataFrame Column starts and ends with a String respectively.

In [0]:
df.filter(df.fname.startswith('T')).show()
df.filter(df.fname.endswith('Cruise')).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



d
####isNull & isNotNull() – Checks if the DataFrame column has NULL or non NULL values.

In [0]:
df.filter(df.lname.isNull()).show()
df.filter(df.lname.isNotNull()).show()

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| null|400|     M|
+---------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



#### like() & rlike() – Similar to SQL LIKE expression

In [0]:

#like , rlike
df.select(df.fname,df.lname,df.id) \
  .filter(df.fname.like("%om")) 


Out[16]: DataFrame[fname: string, lname: string, id: string]

#### substr() – Returns a Column after getting sub string from the Column

In [0]:
df.select(df.fname.substr(1,2).alias('SUBSTR')).show()

+------+
|SUBSTR|
+------+
|    Ja|
|    An|
|    To|
|    To|
+------+

