In [1]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not start a second Spark application.
spark = (
    SparkSession.builder
    .appName('dataframe_operations')
    .getOrCreate()
)

24/03/04 18:56:21 WARN Utils: Your hostname, Ds-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.29.89 instead (on interface en0)
24/03/04 18:56:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/04 18:56:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Row Class in Spark

In [6]:
from pyspark.sql import Row

# A Row is a record whose fields can be read by position or by name.
row1 = Row(name='sai', salary=2000)
print(row1[0], row1[1])  # positional access, like a tuple

row2 = Row(name='pawan', salary=4000)
print(row2.name, row2.salary)  # named access, via attributes

# The DataFrame's column names are taken from the Row field names.
data = [row1, row2]
df = spark.createDataFrame(data)
df.show()

sai 2000
pawan 4000
+-----+------+
| name|salary|
+-----+------+
|  sai|  2000|
|pawan|  4000|
+-----+------+



In [7]:
# Row called with only field names gives a reusable template (used like a class);
# calling that template with values fills the fields in order.
Person = Row('name', 'salary')
row1, row2 = Person('sai', 2000), Person('pawan', 4000)

data = [row1, row2]
df = spark.createDataFrame(data)
df.show()

+-----+------+
| name|salary|
+-----+------+
|  sai|  2000|
|pawan|  4000|
+-----+------+



In [9]:
# A Row used as a field value is stored as a nested struct column.
data = [
    Row(
        name=Row(FirstName='sai', LastName='pawan'),
        salary=2000,
    ),
    Row(
        name=Row(FirstName='pawan', LastName='sai'),
        salary=3000,
    ),
]

df = spark.createDataFrame(data)
df.show()         # struct values are rendered as {FirstName, LastName}
df.printSchema()  # lists FirstName / LastName nested under `name`

+------------+------+
|        name|salary|
+------------+------+
|{sai, pawan}|  2000|
|{pawan, sai}|  3000|
+------------+------+

root
 |-- name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- salary: long (nullable = true)



## Column Class in Spark

In [10]:
from pyspark.sql import Column

In [13]:
from pyspark.sql.functions import lit

# Plain lists carry no field names, so the column names are passed separately.
schema = ['Name', 'Salary']
data = [
    ['sai', 2000],
    ['pawan', 4000],
]

df = spark.createDataFrame(data=data, schema=schema)
df.show()

# lit() turns a Python constant into a Column, so every row gets the same value.
df = df.withColumn('Leaves', lit(0))
df.show()

+-----+------+
| Name|Salary|
+-----+------+
|  sai|  2000|
|pawan|  4000|
+-----+------+

+-----+------+------+
| Name|Salary|Leaves|
+-----+------+------+
|  sai|  2000|     0|
|pawan|  4000|     0|
+-----+------+------+



In [14]:
# Different ways to access a column
from pyspark.sql.functions import col

# Attribute access, bracket lookup and col() all reference the same column,
# so each select prints an identical table.
for name_column in (df.Name, df['Name'], col('Name')):
    df.select(name_column).show()

+-----+
| Name|
+-----+
|  sai|
|pawan|
+-----+

+-----+
| Name|
+-----+
|  sai|
|pawan|
+-----+

+-----+
| Name|
+-----+
|  sai|
|pawan|
+-----+



### Functions to apply on Columns

In [2]:
# Small employee frame reused by the column-function examples below.
schema = ['_id', 'name', 'salary']
data = [
    [1, 'sai', 2000],
    [2, 'pawan', 3000],
    [3, 'd', 4000],
]

df = spark.createDataFrame(data=data, schema=schema)
df.show()

                                                                                

+---+-----+------+
|_id| name|salary|
+---+-----+------+
|  1|  sai|  2000|
|  2|pawan|  3000|
|  3|    d|  4000|
+---+-----+------+



In [4]:
# alias() renames a column in the selected result; df is not reassigned here.
df.select(
    df._id.alias('emp_id'),
    df.name.alias('emp_name'),
    df.salary.alias('emp_salary'),
).show()

+------+--------+----------+
|emp_id|emp_name|emp_salary|
+------+--------+----------+
|     1|     sai|      2000|
|     2|   pawan|      3000|
|     3|       d|      4000|
+------+--------+----------+



In [10]:
# Ascending and descending order
name_col = df.name
df.sort(name_col.asc()).show()   # d, pawan, sai
df.sort(name_col.desc()).show()  # sai, pawan, d

+---+-----+------+
|_id| name|salary|
+---+-----+------+
|  3|    d|  4000|
|  2|pawan|  3000|
|  1|  sai|  2000|
+---+-----+------+

+---+-----+------+
|_id| name|salary|
+---+-----+------+
|  1|  sai|  2000|
|  2|pawan|  3000|
|  3|    d|  4000|
+---+-----+------+



In [15]:
# cast() changes a column's type: salary is long in df and integer after the cast.
df.printSchema()  # before the cast

salary_as_int = df.salary.cast('int')
df.select(df._id, df.name, salary_as_int).printSchema()  # after the cast

root
 |-- _id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)

root
 |-- _id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)



In [19]:
# like() applies a SQL LIKE pattern; 's%' matches names that start with "s".
starts_with_s = df.name.like('s%')

# As a filter, the condition keeps only the matching rows.
df.filter(starts_with_s).show()

# As a column expression, it adds a true/false flag to every row.
df.withColumn('startsWithS', starts_with_s).show()

+---+----+------+
|_id|name|salary|
+---+----+------+
|  1| sai|  2000|
+---+----+------+

+---+-----+------+-----------+
|_id| name|salary|startsWithS|
+---+-----+------+-----------+
|  1|  sai|  2000|       true|
|  2|pawan|  3000|      false|
|  3|    d|  4000|      false|
+---+-----+------+-----------+



<h1>Some other functions on columns</h1>

<img src="assets/Column_functions.png" alt="Other functions available on columns" width="500" height="600"/>