In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 37 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=5e4f45fbbf7cb0beac34aca016f0ff69fc41c8739c12f8ecc0ad379efc33cf2a
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


PySpark MapType (also called map type) is a data type to represent Python Dictionary (dict) to store key-value pair, a MapType object comprises three fields, keyType (a DataType), valueType (a DataType) and valueContainsNull (a BooleanType).

PySpark MapType is used to represent map key-value pair similar to python Dictionary (Dict), it extends DataType class which is a superclass of all types in PySpark and takes two mandatory arguments keyType and valueType of type DataType and one optional boolean argument valueContainsNull. keyType and valueType can be any type that extends the DataType class. for e.g StringType, IntegerType, ArrayType, MapType, StructType (struct) e.t.c

1. Create PySpark MapType

In [4]:
from pyspark.sql.types import StringType, MapType
mapCol = MapType(StringType(), StringType(), False)

In [5]:
mapCol

MapType(StringType(), StringType(), False)

In [24]:
from pyspark.sql.types import StructField, StructType, StringType, MapType
schema = StructType([StructField('name', StringType(), True),
            StructField('properties', MapType(StringType(),
                                              StringType()) , True)
])



In [7]:
schema

StructType([StructField('name', StringType(), True), StructField('properties', MapType(StringType(), StringType(), True), True)])

In [25]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('spark').getOrCreate()

datadic = [
           ('james', {'hair':'black', 'eye':'browm'}),
           ('james', {'hair':'black', 'eye':'browm'}),
           ('james', {'hair':'black', 'eye':'browm'}),
           ('james', {'hair':'black', 'eye':'browm'}),
           ('james', {'hair':'black', 'eye':'browm'})
           ]

df  = spark.createDataFrame(data=datadic, schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+-----+-----------------------------+
|name |properties                   |
+-----+-----------------------------+
|james|{eye -> browm, hair -> black}|
|james|{eye -> browm, hair -> black}|
|james|{eye -> browm, hair -> black}|
|james|{eye -> browm, hair -> black}|
|james|{eye -> browm, hair -> black}|
+-----+-----------------------------+



In [26]:
df3 = df.rdd.map(lambda x:(x.name, x.properties['hair'], x.properties['eye'])).toDF(['name', 'hair', 'eye'])
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- hair: string (nullable = true)
 |-- eye: string (nullable = true)

+-----+-----+-----+
| name| hair|  eye|
+-----+-----+-----+
|james|black|browm|
|james|black|browm|
|james|black|browm|
|james|black|browm|
|james|black|browm|
+-----+-----+-----+



In [10]:
from pyspark.sql.types import MapType
mapcol = MapType(StringType(), StringType(), True)

In [13]:
from pyspark.sql.types import StructField, StructType, StringType, MapType

schema = StructType(
    [StructField('game', StringType() , True),
     StructField('properties', MapType(StringType(), StringType()), True),
     StructField('respect', MapType(StringType(), StringType()), True)]
)

spark = SparkSession.builder.appName('king').getOrCreate()

datadic = [
           ('james', {'king':'kong', 'eye':'green'}, {'cant':'can', 'hurt':'me'}),
           ('clar', {'live':'long', 'death':'birth'}, {'peace':'war', 'ignorance':'knowledfe'})
            ]

df  = spark.createDataFrame(data=datadic , schema=schema)
df.printSchema()

df.show(truncate=False)

root
 |-- game: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- respect: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+-----+------------------------------+--------------------------------------+
|game |properties                    |respect                               |
+-----+------------------------------+--------------------------------------+
|james|{eye -> green, king -> kong}  |{cant -> can, hurt -> me}             |
|clar |{death -> birth, live -> long}|{ignorance -> knowledfe, peace -> war}|
+-----+------------------------------+--------------------------------------+



In [34]:
from pyspark.sql.types import StringType , IntegerType, StructType

schema = StructType([StructField('my_house', StringType(), True),
                     StructField('rooms', IntegerType(), True),
                     StructField('area', MapType(StringType(), IntegerType(), True))
                     
                     
                     ])

In [16]:
schema

StructType([StructField('my_house', StringType(), True), StructField('rooms', IntegerType(), True), StructField('area', StringType(), True)])

In [49]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('king').getOrCreate()

data3 = [('amla', 23, {'dold':26, 'rold':34}),
        ('bimla', 23, {'dold':21, 'rold':32}),
        ('kamla', 23, {'dold':25, 'rold':35}),
        
        ] 

In [50]:
df4 = df3.rdd.map(lambda x: (x.my_house, x.rooms)).toDF(['my_house', 'rooms'])
df4.printSchema()

root
 |-- my_house: string (nullable = true)
 |-- rooms: long (nullable = true)



In [51]:
df4.show()

+--------+-----+
|my_house|rooms|
+--------+-----+
|    amla|   23|
|   bimla|   23|
|   kamla|   23|
+--------+-----+



In [58]:
df5 = df3.rdd.map(lambda x:(x.my_house, x.rooms, x.area['dold'],x.area['rold'] )).toDF(['my_house', 'rooms', 'area'])

In [46]:
df5.show()

AttributeError: ignored

In [56]:
data3

[('amla', 23, {'dold': 26, 'rold': 34}),
 ('bimla', 23, {'dold': 21, 'rold': 32}),
 ('kamla', 23, {'dold': 25, 'rold': 35})]

In [57]:
df3 = spark.createDataFrame(data3 , schema=schema)
df3.printSchema()
df3.show(truncate=False)

root
 |-- my_house: string (nullable = true)
 |-- rooms: integer (nullable = true)
 |-- area: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)

+--------+-----+------------------------+
|my_house|rooms|area                    |
+--------+-----+------------------------+
|amla    |23   |{rold -> 34, dold -> 26}|
|bimla   |23   |{rold -> 32, dold -> 21}|
|kamla   |23   |{rold -> 35, dold -> 25}|
+--------+-----+------------------------+



3. Access PySpark MapType Elements
Let’s see how to extract the key and values from the PySpark DataFrame Dictionary column. Here I have used PySpark map transformation to read the values of properties (MapType column)

In [59]:
df4.show()

+--------+-----+
|my_house|rooms|
+--------+-----+
|    amla|   23|
|   bimla|   23|
|   kamla|   23|
+--------+-----+



In [60]:
df5.show()

+--------+-----+----+---+
|my_house|rooms|area| _4|
+--------+-----+----+---+
|    amla|   23|  26| 34|
|   bimla|   23|  21| 32|
|   kamla|   23|  25| 35|
+--------+-----+----+---+



In [62]:
df3.withColumn('rold', df3.area.getItem('rold'))\
.withColumn('dold', df3.area.getItem('dold')).show(truncate=False)

+--------+-----+------------------------+----+----+
|my_house|rooms|area                    |rold|dold|
+--------+-----+------------------------+----+----+
|amla    |23   |{rold -> 34, dold -> 26}|34  |26  |
|bimla   |23   |{rold -> 32, dold -> 21}|32  |21  |
|kamla   |23   |{rold -> 35, dold -> 25}|35  |25  |
+--------+-----+------------------------+----+----+



In [64]:
df3.withColumn('rold', df3.area['rold'])\
.withColumn('dold', df3.area['dold']).show(truncate=False)

+--------+-----+------------------------+----+----+
|my_house|rooms|area                    |rold|dold|
+--------+-----+------------------------+----+----+
|amla    |23   |{rold -> 34, dold -> 26}|34  |26  |
|bimla   |23   |{rold -> 32, dold -> 21}|32  |21  |
|kamla   |23   |{rold -> 35, dold -> 25}|35  |25  |
+--------+-----+------------------------+----+----+



In [65]:
from pyspark.sql.functions import explode
df.select(df.name, explode(df.properties)).show()

+-----+----+-----+
| name| key|value|
+-----+----+-----+
|james| eye|browm|
|james|hair|black|
|james| eye|browm|
|james|hair|black|
|james| eye|browm|
|james|hair|black|
|james| eye|browm|
|james|hair|black|
|james| eye|browm|
|james|hair|black|
+-----+----+-----+



In [68]:
df3.select(df3.my_house, explode(df3.area)).show()

+--------+----+-----+
|my_house| key|value|
+--------+----+-----+
|    amla|rold|   34|
|    amla|dold|   26|
|   bimla|rold|   32|
|   bimla|dold|   21|
|   kamla|rold|   35|
|   kamla|dold|   25|
+--------+----+-----+



In [69]:
from pyspark.sql.functions import map_keys
df.select(df.name, map_keys(df.properties)).show()

+-----+--------------------+
| name|map_keys(properties)|
+-----+--------------------+
|james|         [eye, hair]|
|james|         [eye, hair]|
|james|         [eye, hair]|
|james|         [eye, hair]|
|james|         [eye, hair]|
+-----+--------------------+



In [73]:
df3.select(df3.my_house, map_keys(df3.area)).show()

+--------+--------------+
|my_house|map_keys(area)|
+--------+--------------+
|    amla|  [rold, dold]|
|   bimla|  [rold, dold]|
|   kamla|  [rold, dold]|
+--------+--------------+



In [74]:
from pyspark.sql.functions import map_values
df.select(df.name, map_values(df.properties)).show()

+-----+----------------------+
| name|map_values(properties)|
+-----+----------------------+
|james|        [browm, black]|
|james|        [browm, black]|
|james|        [browm, black]|
|james|        [browm, black]|
|james|        [browm, black]|
+-----+----------------------+



In [76]:
df3.select(df3.my_house, map_values(df3.area)).show()

+--------+----------------+
|my_house|map_values(area)|
+--------+----------------+
|    amla|        [34, 26]|
|   bimla|        [32, 21]|
|   kamla|        [35, 25]|
+--------+----------------+

