In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
data = [('Shrikant', 'Vitthal', 'Shejwal', 21, 'Pune', 15000),
       ('Sanjivanee', 'Vitthal', 'Shejwal', 22, 'Karad', 46000),
       ('Sagar', 'Vitthal', 'Shejwal', 27, 'Tripura', 105000),
       ('Sita', 'Vitthal', 'Shejwal', 49, 'Umbraj', None),
       ('Vitthal', 'Namdev', 'Shejwal', 55, 'Umbraj', 17000)]
df = spark.createDataFrame(data=data, schema=['first_name', 'middle_name', 'last_name', 'age', 'city', 'salary'])
df.display()

first_name,middle_name,last_name,age,city,salary
Shrikant,Vitthal,Shejwal,21,Pune,15000.0
Sanjivanee,Vitthal,Shejwal,22,Karad,46000.0
Sagar,Vitthal,Shejwal,27,Tripura,105000.0
Sita,Vitthal,Shejwal,49,Umbraj,
Vitthal,Namdev,Shejwal,55,Umbraj,17000.0


In [0]:
data = [(('Shrikant', 'Vitthal', 'Shejwal'), 21, 'Pune', 15000),
       (('Sanjivanee', 'Vitthal', 'Shejwal'), 22, 'Karad', 46000),
       (('Sagar', 'Vitthal', 'Shejwal'), 27, 'Tripura', 105000),
       (('Sita', 'Vitthal', 'Shejwal'), 49, 'Umbraj', None),
       (('Vitthal', 'Namdev', 'Shejwal'), 55, 'Umbraj', 17000)]
schema = StructType([StructField('names', StructType([\
                                         StructField('first_name', StringType(), True),
                                         StructField('middle_name', StringType(), True),
                                         StructField('last_name', StringType(), True)])),
                    StructField('age', IntegerType(), True),
                    StructField('city', StringType(), True),
                    StructField('salary', StringType(), True)])
df1 = spark.createDataFrame(data=data, schema=schema)
df1.display()

names,age,city,salary
"List(Shrikant, Vitthal, Shejwal)",21,Pune,15000.0
"List(Sanjivanee, Vitthal, Shejwal)",22,Karad,46000.0
"List(Sagar, Vitthal, Shejwal)",27,Tripura,105000.0
"List(Sita, Vitthal, Shejwal)",49,Umbraj,
"List(Vitthal, Namdev, Shejwal)",55,Umbraj,17000.0


In [0]:
df1.select('names.*','age','city','salary').display()

first_name,middle_name,last_name,age,city,salary
Shrikant,Vitthal,Shejwal,21,Pune,15000.0
Sanjivanee,Vitthal,Shejwal,22,Karad,46000.0
Sagar,Vitthal,Shejwal,27,Tripura,105000.0
Sita,Vitthal,Shejwal,49,Umbraj,
Vitthal,Namdev,Shejwal,55,Umbraj,17000.0


In [0]:
df1.withColumn('Info', struct(col('age').alias('age'),
                             col('city').alias('city'),
                             col('salary').alias('salary'), when(col('salary')<=20000, 'low')\
                             .when(col('salary')>40000, 'Medium')\
                             .otherwise('High').alias('salary_grade'))).drop('age','city','salary').display()

names,Info
"List(Shrikant, Vitthal, Shejwal)","List(21, Pune, 15000, low)"
"List(Sanjivanee, Vitthal, Shejwal)","List(22, Karad, 46000, Medium)"
"List(Sagar, Vitthal, Shejwal)","List(27, Tripura, 105000, Medium)"
"List(Sita, Vitthal, Shejwal)","List(49, Umbraj, null, High)"
"List(Vitthal, Namdev, Shejwal)","List(55, Umbraj, 17000, low)"


In [0]:
df.withColumn('names', struct(col('first_name').alias('first_name'),
                             col('middle_name').alias('middle_name'),
                             col('last_name').alias('last_name'))).display()

first_name,middle_name,last_name,age,city,salary,names
Shrikant,Vitthal,Shejwal,21,Pune,15000.0,"List(Shrikant, Vitthal, Shejwal)"
Sanjivanee,Vitthal,Shejwal,22,Karad,46000.0,"List(Sanjivanee, Vitthal, Shejwal)"
Sagar,Vitthal,Shejwal,27,Tripura,105000.0,"List(Sagar, Vitthal, Shejwal)"
Sita,Vitthal,Shejwal,49,Umbraj,,"List(Sita, Vitthal, Shejwal)"
Vitthal,Namdev,Shejwal,55,Umbraj,17000.0,"List(Vitthal, Namdev, Shejwal)"


In [0]:
df1.display()

names,age,city,salary
"List(Shrikant, Vitthal, Shejwal)",21,Pune,15000.0
"List(Sanjivanee, Vitthal, Shejwal)",22,Karad,46000.0
"List(Sagar, Vitthal, Shejwal)",27,Tripura,105000.0
"List(Sita, Vitthal, Shejwal)",49,Umbraj,
"List(Vitthal, Namdev, Shejwal)",55,Umbraj,17000.0


In [0]:
df1.select('names.*','age','city','salary').display()

first_name,middle_name,last_name,age,city,salary
Shrikant,Vitthal,Shejwal,21,Pune,15000.0
Sanjivanee,Vitthal,Shejwal,22,Karad,46000.0
Sagar,Vitthal,Shejwal,27,Tripura,105000.0
Sita,Vitthal,Shejwal,49,Umbraj,
Vitthal,Namdev,Shejwal,55,Umbraj,17000.0


In [0]:
data = [('Shrikant, Vitthal, Shejwal',['english','marathi','hindi','german'],21, 'Pune', 15000),
       ('Sanjivanee, Vitthal, Shejwal',['english','marathi','hindi'], 22, 'Karad', 46000),
       ('Sagar, Vitthal, Shejwal',['english','marathi','hindi'], 27, 'Tripura', 105000),
       ('Sita, Vitthal, Shejwal',['english','marathi'], 49, 'Umbraj', None),
       ('Vitthal, Namdev, Shejwal',['english','marathi','hindi'], 55, 'Umbraj', 17000)]

schema = StructType([\
                    StructField('names', StringType(), True),
                    StructField('languages', ArrayType(StringType(), True)),
                    StructField('age', IntegerType(), True),
                    StructField('city', StringType(), True),
                    StructField('salary', IntegerType(), True)\
                    ])

df3 = spark.createDataFrame(data=data, schema=schema)
df3.display()

names,languages,age,city,salary
"Shrikant, Vitthal, Shejwal","List(english, marathi, hindi, german)",21,Pune,15000.0
"Sanjivanee, Vitthal, Shejwal","List(english, marathi, hindi)",22,Karad,46000.0
"Sagar, Vitthal, Shejwal","List(english, marathi, hindi)",27,Tripura,105000.0
"Sita, Vitthal, Shejwal","List(english, marathi)",49,Umbraj,
"Vitthal, Namdev, Shejwal","List(english, marathi, hindi)",55,Umbraj,17000.0


In [0]:
df4 = df3.select(split('names',',').alias('Names'), 'languages', 'age', 'city', 'salary')
df4.display()

Names,languages,age,city,salary
"List(Shrikant, Vitthal, Shejwal)","List(english, marathi, hindi, german)",21,Pune,15000.0
"List(Sanjivanee, Vitthal, Shejwal)","List(english, marathi, hindi)",22,Karad,46000.0
"List(Sagar, Vitthal, Shejwal)","List(english, marathi, hindi)",27,Tripura,105000.0
"List(Sita, Vitthal, Shejwal)","List(english, marathi)",49,Umbraj,
"List(Vitthal, Namdev, Shejwal)","List(english, marathi, hindi)",55,Umbraj,17000.0


In [0]:
df5 = df4.withColumn('Info', array(col('age'), col('city'), col('salary'))).drop('age', 'city', 'salary')
df5.display()

Names,languages,Info
"List(Shrikant, Vitthal, Shejwal)","List(english, marathi, hindi, german)","List(21, Pune, 15000)"
"List(Sanjivanee, Vitthal, Shejwal)","List(english, marathi, hindi)","List(22, Karad, 46000)"
"List(Sagar, Vitthal, Shejwal)","List(english, marathi, hindi)","List(27, Tripura, 105000)"
"List(Sita, Vitthal, Shejwal)","List(english, marathi)","List(49, Umbraj, null)"
"List(Vitthal, Namdev, Shejwal)","List(english, marathi, hindi)","List(55, Umbraj, 17000)"


In [0]:
schema = StructType([\
                    StructField('name', StringType(), True),
                   StructField('info', MapType(StringType(), IntegerType()), True),
                   ])
data = [('Shrikant', {'age':21, 'salary':15000}),
       ('Sagar', {'age':27, 'salary':100000})]
df6 = spark.createDataFrame(data=data, schema=schema)
df6.display()

name,info
Shrikant,"Map(salary -> 15000, age -> 21)"
Sagar,"Map(salary -> 100000, age -> 27)"


In [0]:
df6 = df6.select('name', explode('info'))
df6.display()

name,key,value
Shrikant,salary,15000
Shrikant,age,21
Sagar,salary,100000
Sagar,age,27


In [0]:
df6 = df6.groupBy('name').pivot('key').sum('value')
df6.display()

name,age,salary
Shrikant,21,15000
Sagar,27,100000


In [0]:
df6.withColumn('info', array(col('name'), col('age'), col('salary'))).display()

name,age,salary,info
Shrikant,21,15000,"List(Shrikant, 21, 15000)"
Sagar,27,100000,"List(Sagar, 27, 100000)"
