In [1]:
from pyspark.sql import SparkSession

# Obtain the session used by every cell below, registered under the
# application name "DataFrameOperations".
spark = (
    SparkSession.builder
    .appName("DataFrameOperations")
    .getOrCreate()
)

24/03/04 19:24:46 WARN Utils: Your hostname, Ds-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.29.89 instead (on interface en0)
24/03/04 19:24:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/04 19:24:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Types in DataFrame

### Structure types and Structure fields

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Nested struct used for the "Name" column: two string fields.
name_structure = (
    StructType()
    .add("FirstName", StringType())
    .add("LastName", StringType())
)

# Top-level row schema; the "Name" column embeds the struct defined above.
data_structure = (
    StructType()
    .add("_id", IntegerType())
    .add("Name", name_structure)
    .add("Salary", FloatType())
)

# Each row is [_id, (FirstName, LastName), Salary].
data = [
    [1, ('sai', 'pawan'), 200.4],
    [2, ('D', 'pawan'), 2231324.0],
]

df = spark.createDataFrame(data=data, schema=data_structure)
df.show()
df.printSchema()


+---+------------+---------+
|_id|        Name|   Salary|
+---+------------+---------+
|  1|{sai, pawan}|    200.4|
|  2|  {D, pawan}|2231324.0|
+---+------------+---------+

root
 |-- _id: integer (nullable = true)
 |-- Name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- Salary: float (nullable = true)



In [4]:
from pyspark.sql.types import ArrayType

# Row schema in which "Scores" is an array of integers.
data_structure = StructType(
    [
        StructField("_id", IntegerType()),
        StructField("Scores", ArrayType(IntegerType())),
        StructField("Salary", FloatType()),
    ]
)

# Each row is [_id, (score, ...), Salary]; the score tuples differ in length.
data = [
    [1, (20, 40, 50), 200.4],
    [2, (10, 51), 2231324.0],
]

df = spark.createDataFrame(data, schema=data_structure)
df.show()
df.printSchema()

+---+------------+---------+
|_id|      Scores|   Salary|
+---+------------+---------+
|  1|[20, 40, 50]|    200.4|
|  2|    [10, 51]|2231324.0|
+---+------------+---------+

root
 |-- _id: integer (nullable = true)
 |-- Scores: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- Salary: float (nullable = true)



### Some operations on df using the withColumn function to create and experiment with arrays.

In [5]:
from pyspark.sql.functions import col, array, array_append

# Pull the first element of the Scores array out into a separate column,
# addressing the column through the DataFrame attribute.
df.withColumn("First Array element", df.Scores.getItem(0)).show()
# The same result, this time addressing the column by name through col().
df.withColumn("First Array element", col("Scores").getItem(0)).show()

# Build a new two-element array column "numbers" from the first score and the salary.
df.withColumn("numbers", array(col("Scores").getItem(0), col("Salary"))).show()

# Append the salary to the end of the Scores array and store the result as "numbers".
df.withColumn("numbers", array_append(col("Scores"), col("Salary"))).show()


+---+------------+---------+-------------------+
|_id|      Scores|   Salary|First Array element|
+---+------------+---------+-------------------+
|  1|[20, 40, 50]|    200.4|                 20|
|  2|    [10, 51]|2231324.0|                 10|
+---+------------+---------+-------------------+

+---+------------+---------+-------------------+
|_id|      Scores|   Salary|First Array element|
+---+------------+---------+-------------------+
|  1|[20, 40, 50]|    200.4|                 20|
|  2|    [10, 51]|2231324.0|                 10|
+---+------------+---------+-------------------+

+---+------------+---------+-----------------+
|_id|      Scores|   Salary|          numbers|
+---+------------+---------+-----------------+
|  1|[20, 40, 50]|    200.4|    [20.0, 200.4]|
|  2|    [10, 51]|2231324.0|[10.0, 2231324.0]|
+---+------------+---------+-----------------+

+---+------------+---------+--------------------+
|_id|      Scores|   Salary|             numbers|
+---+------------+---------

### Exploring ArrayType Functions

In [6]:
# Explode function
from pyspark.sql.functions import explode, col

# explode() produces one output row per element of the array column it is given;
# the remaining columns are repeated on every generated row.
df.withColumn("Score", explode("Scores")).show()

+---+------------+---------+-----+
|_id|      Scores|   Salary|Score|
+---+------------+---------+-----+
|  1|[20, 40, 50]|    200.4|   20|
|  1|[20, 40, 50]|    200.4|   40|
|  1|[20, 40, 50]|    200.4|   50|
|  2|    [10, 51]|2231324.0|   10|
|  2|    [10, 51]|2231324.0|   51|
+---+------------+---------+-----+



In [7]:
# Split function
from pyspark.sql.functions import split, col


# Small frame whose "name" column holds comma-separated names.
df = spark.createDataFrame(
    data=[[1, 'sai,pawan', 99314124], [2, 'pawan,sai', 3268582]],
    schema=['_id', 'name', 'salary'],
)

df.show()

# split() converts each comma-separated name string into an array of names.
df.withColumn("name", split("name", ",")).show()


+---+---------+--------+
|_id|     name|  salary|
+---+---------+--------+
|  1|sai,pawan|99314124|
|  2|pawan,sai| 3268582|
+---+---------+--------+

+---+------------+--------+
|_id|        name|  salary|
+---+------------+--------+
|  1|[sai, pawan]|99314124|
|  2|[pawan, sai]| 3268582|
+---+------------+--------+



In [8]:
# array function
from pyspark.sql.functions import array, col


# Small frame with the first and second name held in separate columns.
df = spark.createDataFrame(
    data=[[1, 'sai', 'pawan', 99314124], [2, 'pawan', 'sai', 3268582]],
    schema=['_id', 'FirstName', 'SecondName', 'salary'],
)

df.show()

# array() combines the two name columns into a single array column.
df.withColumn("name", array("FirstName", "SecondName")).show()

+---+---------+----------+--------+
|_id|FirstName|SecondName|  salary|
+---+---------+----------+--------+
|  1|      sai|     pawan|99314124|
|  2|    pawan|       sai| 3268582|
+---+---------+----------+--------+

+---+---------+----------+--------+------------+
|_id|FirstName|SecondName|  salary|        name|
+---+---------+----------+--------+------------+
|  1|      sai|     pawan|99314124|[sai, pawan]|
|  2|    pawan|       sai| 3268582|[pawan, sai]|
+---+---------+----------+--------+------------+



In [9]:
# Array contains function
from pyspark.sql.functions import array_contains, col

# No explicit schema here: only column names are supplied alongside the rows.
df = spark.createDataFrame(
    data=[
        [1, [20, 40, 50], 200.4],
        [2, [10, 51], 2231324.0],
    ],
    schema=['_id', 'Scores', 'Salary'],
)

df.show()


# array_contains() yields a boolean per row: whether the value 50 is present in Scores.
df.withColumn("vauleExist", array_contains("Scores", 50)).show()


+---+------------+---------+
|_id|      Scores|   Salary|
+---+------------+---------+
|  1|[20, 40, 50]|    200.4|
|  2|    [10, 51]|2231324.0|
+---+------------+---------+

+---+------------+---------+----------+
|_id|      Scores|   Salary|vauleExist|
+---+------------+---------+----------+
|  1|[20, 40, 50]|    200.4|      true|
|  2|    [10, 51]|2231324.0|     false|
+---+------------+---------+----------+



## Map Type columns

### Defining MapType columns

In [10]:
from pyspark.sql.types import StructField, StructType, MapType, StringType, IntegerType
from pyspark.sql.functions import col


# Each row is [name, {subject: mark}].
data = [
    ['sai', {'Maths': 20, 'Science': 40}],
    ['pawan', {'Maths': 30, 'Science': 30}],
]

# Schema for the rows above: a string name plus a string -> integer map of marks.
schema = StructType(
    [
        StructField("name", StringType()),
        StructField("marks", MapType(StringType(), IntegerType())),
    ]
)

df = spark.createDataFrame(data, schema=schema)

# truncate=False prints every cell in full instead of shortening long values.
df.show(truncate=False)
df.printSchema()

+-----+----------------------------+
|name |marks                       |
+-----+----------------------------+
|sai  |{Science -> 40, Maths -> 20}|
|pawan|{Science -> 30, Maths -> 30}|
+-----+----------------------------+

root
 |-- name: string (nullable = true)
 |-- marks: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)



### Accessing the map data

In [11]:
# Look up individual map entries by key and surface each one as its own column.
(
    df
    .withColumn("Science", col("marks").getItem("Science"))
    .withColumn("Maths", col("marks").getItem("Maths"))
    .show()
)


+-----+--------------------+-------+-----+
| name|               marks|Science|Maths|
+-----+--------------------+-------+-----+
|  sai|{Science -> 40, M...|     40|   20|
|pawan|{Science -> 30, M...|     30|   30|
+-----+--------------------+-------+-----+



### Exploring functions to deal with MapType data.

In [12]:
# Explode function.
from pyspark.sql.functions import explode

# Exploding a map column yields one row per entry, split into "key" and "value" columns.
df.select("name", explode("marks")).show(truncate=False)

+-----+-------+-----+
|name |key    |value|
+-----+-------+-----+
|sai  |Science|40   |
|sai  |Maths  |20   |
|pawan|Science|30   |
|pawan|Maths  |30   |
+-----+-------+-----+



In [13]:
# Map keys and map values
from pyspark.sql.functions import map_keys, map_values

# map_keys() / map_values() expose the map's keys and values as array columns.
(
    df
    .withColumn("keys", map_keys("marks"))
    .withColumn("values", map_values("marks"))
    .show(truncate=False)
)


+-----+----------------------------+----------------+--------+
|name |marks                       |keys            |values  |
+-----+----------------------------+----------------+--------+
|sai  |{Science -> 40, Maths -> 20}|[Science, Maths]|[40, 20]|
|pawan|{Science -> 30, Maths -> 30}|[Science, Maths]|[30, 30]|
+-----+----------------------------+----------------+--------+

