In [1]:
from pyspark.sql import SparkSession
spark  = SparkSession.builder.appName("DataFrameOperations").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/05 22:45:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/05 22:45:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Types in DataFrame

### Structure types and Structure fields

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

name_structure = StructType([
        StructField(name="FirstName", dataType=StringType()),
        StructField(name="LastName", dataType=StringType())
    ])

data_structure = StructType([
    StructField(name="_id", dataType=IntegerType()),
    StructField(name="Name", dataType = name_structure),
    StructField(name="Salary", dataType=FloatType())
])

data = [
    [1, ('sai', 'pawan'), 200.4],
    [2, ('D', 'pawan'), 2231324.0]
]

df = spark.createDataFrame(data, data_structure)
df.show()
df.printSchema()


+---+------------+---------+
|_id|        Name|   Salary|
+---+------------+---------+
|  1|{sai, pawan}|    200.4|
|  2|  {D, pawan}|2231324.0|
+---+------------+---------+

root
 |-- _id: integer (nullable = true)
 |-- Name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- Salary: float (nullable = true)



In [3]:
from pyspark.sql.types import ArrayType
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

data_structure = StructType([
    StructField(name="_id", dataType=IntegerType()),
    StructField(name="Scores", dataType=ArrayType(elementType=IntegerType())),
])

data = [
    [1, (20, 40, 50)],
    [2, (10, 51)]
]

df = spark.createDataFrame(data, data_structure)

df.show()

df.printSchema()

+---+------------+
|_id|      Scores|
+---+------------+
|  1|[20, 40, 50]|
|  2|    [10, 51]|
+---+------------+

root
 |-- _id: integer (nullable = true)
 |-- Scores: array (nullable = true)
 |    |-- element: integer (containsNull = true)



### Some operation on df using withColumn function for creating or experimenting with arrays.

In [4]:
from pyspark.sql.functions import col, array, array_append

# get the 1st element in above created array as a seperate column.

df.withColumn('First Array element', col=df.Scores[0]).show()
# This can also be done by
df.withColumn('First Array element', col=col('Scores')[0]).show()

# add the salary to the Socres column's 1st index and name the new column as Numbers
df.withColumn("numbers", col= array(col('Scores')[0], col('Salary'))).show()

# add the salary to the Socres arry and name the new column as Numbers
df.withColumn("numbers", col= array_append(col('Scores'), col('Salary'))).show()


+---+------------+-------------------+
|_id|      Scores|First Array element|
+---+------------+-------------------+
|  1|[20, 40, 50]|                 20|
|  2|    [10, 51]|                 10|
+---+------------+-------------------+

+---+------------+-------------------+
|_id|      Scores|First Array element|
+---+------------+-------------------+
|  1|[20, 40, 50]|                 20|
|  2|    [10, 51]|                 10|
+---+------------+-------------------+



AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Salary` cannot be resolved. Did you mean one of the following? [`_id`, `Scores`].;
'Project [_id#0, Scores#1, array(Scores#1[0], 'Salary) AS numbers#47]
+- LogicalRDD [_id#0, Scores#1], false


### Exploring ArrayType Functions

In [5]:
# Explode function
from pyspark.sql.functions import explode, col

# Explode function will create new rows with the elements of the array specified in the function parameter.
df.withColumn('Score', col=explode(col('Scores'))).show()

+---+------------+-----+
|_id|      Scores|Score|
+---+------------+-----+
|  1|[20, 40, 50]|   20|
|  1|[20, 40, 50]|   40|
|  1|[20, 40, 50]|   50|
|  2|    [10, 51]|   10|
|  2|    [10, 51]|   51|
+---+------------+-----+



In [7]:
# Split function
from pyspark.sql.functions import split, col

df = spark.createDataFrame(
    [[1, 'sai,pawan', 99314124], [2, 'pawan,sai', 3268582]],
    ['_id', 'name', 'salary']
)

df.show()
# This will convert all the ',' seperated names into array of names.
df.withColumn('name', col=split(col('name'), ',')).show()


+---+---------+--------+
|_id|     name|  salary|
+---+---------+--------+
|  1|sai,pawan|99314124|
|  2|pawan,sai| 3268582|
+---+---------+--------+

+---+------------+--------+
|_id|        name|  salary|
+---+------------+--------+
|  1|[sai, pawan]|99314124|
|  2|[pawan, sai]| 3268582|
+---+------------+--------+



In [8]:
# array function
from pyspark.sql.functions import array, col


df = spark.createDataFrame(
    [[1, 'sai', 'pawan', 99314124], [2, 'pawan', 'sai', 3268582]],
    ['_id', 'FirstName', 'SecondName', 'salary']
)

df.show()

# this function will combine two columns into an array. 
df.withColumn('name', col=array(col('FirstName'), col('SecondName'))).show()

+---+---------+----------+--------+
|_id|FirstName|SecondName|  salary|
+---+---------+----------+--------+
|  1|      sai|     pawan|99314124|
|  2|    pawan|       sai| 3268582|
+---+---------+----------+--------+

+---+---------+----------+--------+------------+
|_id|FirstName|SecondName|  salary|        name|
+---+---------+----------+--------+------------+
|  1|      sai|     pawan|99314124|[sai, pawan]|
|  2|    pawan|       sai| 3268582|[pawan, sai]|
+---+---------+----------+--------+------------+



In [6]:
# Array contains function
from pyspark.sql.functions import array_contains, col

df = spark.createDataFrame(
    [[1, [20, 40, 50], 200.4],[2, [10, 51, ], 2231324.0]], 
    ['_id', 'Scores', 'Salary']    
)

df.show()

# Return a boolean value representing if the values is present in the array.
df.withColumn('vauleExist', col = array_contains(col('Scores'), 50)).show()

+---+------------+---------+
|_id|      Scores|   Salary|
+---+------------+---------+
|  1|[20, 40, 50]|    200.4|
|  2|    [10, 51]|2231324.0|
+---+------------+---------+

+---+------------+---------+----------+
|_id|      Scores|   Salary|vauleExist|
+---+------------+---------+----------+
|  1|[20, 40, 50]|    200.4|      true|
|  2|    [10, 51]|2231324.0|     false|
+---+------------+---------+----------+



## Map Type columns

### Defining maptype columns

In [10]:
from pyspark.sql.types import StructField, StructType, MapType, StringType, IntegerType
from pyspark.sql.functions import col


data = [['sai', {'Maths': 20, 'Science': 40}], ['pawan', {'Maths':30, 'Science': 30}]]

# Defining schema
schema = StructType([
    StructField(name='name', dataType=StringType()),
    StructField(name='marks', dataType=MapType(keyType=StringType(), valueType=IntegerType()))
])

df =  spark.createDataFrame(data, schema)

# Truncate will dilplay all the information.
df.show(truncate= False)
df.printSchema()

+-----+----------------------------+
|name |marks                       |
+-----+----------------------------+
|sai  |{Science -> 40, Maths -> 20}|
|pawan|{Science -> 30, Maths -> 30}|
+-----+----------------------------+

root
 |-- name: string (nullable = true)
 |-- marks: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)



### Accessing the map data

In [11]:
df.withColumn('Science', col = col('marks')['Science']).withColumn('Maths', col = col('marks')['Maths']).show()


+-----+--------------------+-------+-----+
| name|               marks|Science|Maths|
+-----+--------------------+-------+-----+
|  sai|{Science -> 40, M...|     40|   20|
|pawan|{Science -> 30, M...|     30|   30|
+-----+--------------------+-------+-----+



### Exploring fuctions to deal with MapType data.

In [12]:
# Explode Fucntion.
from pyspark.sql.functions import explode

# this will create two columns key and values and rows are MapType col's data
df.select('name',explode(col=col('marks'))).show(truncate= False)

+-----+-------+-----+
|name |key    |value|
+-----+-------+-----+
|sai  |Science|40   |
|sai  |Maths  |20   |
|pawan|Science|30   |
|pawan|Maths  |30   |
+-----+-------+-----+



In [13]:
# Map keys and Map values
from pyspark.sql.functions import map_keys, map_values

df.withColumn('keys', map_keys(df.marks)).withColumn('values', map_values(df.marks)).show(truncate = False)

+-----+----------------------------+----------------+--------+
|name |marks                       |keys            |values  |
+-----+----------------------------+----------------+--------+
|sai  |{Science -> 40, Maths -> 20}|[Science, Maths]|[40, 20]|
|pawan|{Science -> 30, Maths -> 30}|[Science, Maths]|[30, 30]|
+-----+----------------------------+----------------+--------+

