In [1]:
from pyspark.sql import SparkSession

import getpass
# Resolve the current OS account name; it feeds both the warehouse path and
# the application name below.
username = getpass.getuser()

# Build (or reuse, via getOrCreate) a Hive-enabled session with YARN as master.
# NOTE(review): 'spark.ui.port' = '0' presumably lets Spark pick any free UI
# port — confirm against the cluster setup.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Basic Transformations')
    .master('yarn')
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [3]:
# Load the text file through the CSV reader, using "|" as the field delimiter.
raw_df = (
    spark.read
    .option("delimiter", "|")
    .csv("extracolumndata.txt")
)

In [4]:
# Inspect the loaded frame. The rendered output shows a single string column
# `_c0` holding each comma-separated line, with a varying number of values.
raw_df.show()

+--------------+
|           _c0|
+--------------+
|    23, 56, 76|
|75, 17, 76, 97|
|    22, 57, 66|
+--------------+



In [5]:
# Tag every row with an `id` so values exploded from one line can later be
# grouped back together (the pivot cells group by this column).
# NOTE(review): monotonically_increasing_id is documented as unique and
# increasing but not necessarily consecutive — confirm nothing relies on 0, 1, 2.
raw_df = raw_df.withColumn("id", monotonically_increasing_id())

In [6]:
# Confirm the new `id` column now sits next to the raw `_c0` string.
raw_df.show()

+--------------+---+
|           _c0| id|
+--------------+---+
|    23, 56, 76|  0|
|75, 17, 76, 97|  1|
|    22, 57, 66|  2|
+--------------+---+



In [7]:
# Split each line on ", " and explode the array into one row per element,
# keeping the element's position (`pos`) next to its value (`val`).
numbers_arr = split("_c0", ", ")
raw_df.select(
    "id",
    numbers_arr.alias("numbers"),
    posexplode(numbers_arr).alias("pos", "val"),
).show()

+---+----------------+---+---+
| id|         numbers|pos|val|
+---+----------------+---+---+
|  0|    [23, 56, 76]|  0| 23|
|  0|    [23, 56, 76]|  1| 56|
|  0|    [23, 56, 76]|  2| 76|
|  1|[75, 17, 76, 97]|  0| 75|
|  1|[75, 17, 76, 97]|  1| 17|
|  1|[75, 17, 76, 97]|  2| 76|
|  1|[75, 17, 76, 97]|  3| 97|
|  2|    [22, 57, 66]|  0| 22|
|  2|    [22, 57, 66]|  1| 57|
|  2|    [22, 57, 66]|  2| 66|
+---+----------------+---+---+



In [13]:
# Stage 1: one row per (id, position); the exploded `val` is discarded because
# the value is re-derived from the array in stage 2.
numbers_arr = split("_c0", ", ")
exploded_df = raw_df.select(
    "id",
    numbers_arr.alias("numbers"),
    posexplode(numbers_arr).alias("pos", "val"),
).drop("val")

# Stage 2: label each position as "col<pos>" and look the value up by index.
exploded_df.select(
    "id",
    concat(lit("col"), col("pos").cast("string")).alias("column_name"),
    expr("numbers[pos]").alias("val"),
).show()

+---+-----------+---+
| id|column_name|val|
+---+-----------+---+
|  0|       col0| 23|
|  0|       col1| 56|
|  0|       col2| 76|
|  1|       col0| 75|
|  1|       col1| 17|
|  1|       col2| 76|
|  1|       col3| 97|
|  2|       col0| 22|
|  2|       col1| 57|
|  2|       col2| 66|
+---+-----------+---+



In [15]:
# Stage 1: one row per (id, position); the exploded `val` is discarded because
# the value is re-derived from the array in stage 2.
numbers_arr = split("_c0", ", ")
exploded_df = raw_df.select(
    "id",
    numbers_arr.alias("numbers"),
    posexplode(numbers_arr).alias("pos", "val"),
).drop("val")

# Stage 2: long format — (id, "col<pos>", value).
long_df = exploded_df.select(
    "id",
    concat(lit("col"), col("pos").cast("string")).alias("column_name"),
    expr("numbers[pos]").alias("val"),
)

# Stage 3: pivot the labels into columns; rows with fewer values get null in
# the trailing columns (see col3 in the rendered output).
long_df.groupBy("id").pivot("column_name").agg(first("val")).show()

+---+----+----+----+----+
| id|col0|col1|col2|col3|
+---+----+----+----+----+
|  0|  23|  56|  76|null|
|  1|  75|  17|  76|  97|
|  2|  22|  57|  66|null|
+---+----+----+----+----+



In [11]:
# Final result: one output column per position, with the helper `id` removed.
#
# Fixes relative to the earlier version of this cell:
#   * pivot on "column_name" — the select below aliases the label column as
#     "column_name", so the previous pivot("name") referenced a column that
#     does not exist and fails on a fresh Restart & Run All.
#   * order by `id` before dropping it — groupBy gives no row-order guarantee,
#     and once `id` is gone the rows can no longer be matched to input lines.
raw_df.select(
        col("id"),
        split("_c0", ", ").alias("numbers"),
        posexplode(split("_c0", ", ")).alias("pos", "val")
    )\
    .drop("val")\
    .select(
        col("id"),
        # Label each position as "col<pos>"; these labels become the pivoted columns.
        concat(lit("col"), col("pos").cast("string")).alias("column_name"),
        # Re-derive the value by indexing the array with its position.
        expr("numbers[pos]").alias("val")
    )\
    .groupBy("id").pivot("column_name").agg(first("val"))\
    .orderBy("id")\
    .drop("id")\
    .show()

+----+----+----+----+
|col0|col1|col2|col3|
+----+----+----+----+
|  23|  56|  76|null|
|  75|  17|  76|  97|
|  22|  57|  66|null|
+----+----+----+----+

