
### Flatten nested array data into a single column (one row per element)

In [0]:
import pyspark.sql.functions as f

# Alternative sample input (ragged inner lists and an empty matrix),
# kept here for reference only; it is not used below:
# data = [
#     (1, [[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
#     (2, [[10, 11], [12, 13, 14]]),
#     (3, [])
# ]

# Each record is (id, matrix), where matrix is a list of integer lists.
data = [
    (1, [[1, 2, 3], [4, 5, 6]]),
    (2, [[10, 11, 20], [12, 13, 14]]),
    (3, [[7, 8, 9], [5, 555, 999]]),
]

# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object provided by the notebook runtime - confirm.
df1 = spark.createDataFrame(data, schema=["id", "matrix"])
df1.show()


+---+--------------------+
| id|              matrix|
+---+--------------------+
|  1|[[1, 2, 3], [4, 5...|
|  2|[[10, 11, 20], [1...|
|  3|[[7, 8, 9], [5, 5...|
+---+--------------------+



In [0]:
# First explode: emit one row per inner list of `matrix`, keeping `matrix` itself.
df_outer_explode = df1.withColumn("inner_array", f.explode(f.col("matrix")))
df_outer_explode.show()

+---+--------------------+-------------+
| id|              matrix|  inner_array|
+---+--------------------+-------------+
|  1|[[1, 2, 3], [4, 5...|    [1, 2, 3]|
|  1|[[1, 2, 3], [4, 5...|    [4, 5, 6]|
|  2|[[10, 11, 20], [1...| [10, 11, 20]|
|  2|[[10, 11, 20], [1...| [12, 13, 14]|
|  3|[[7, 8, 9], [5, 5...|    [7, 8, 9]|
|  3|[[7, 8, 9], [5, 5...|[5, 555, 999]|
+---+--------------------+-------------+



In [0]:
# Second explode: replace each inner list with its individual elements.
# The column keeps the name "inner_array" even though it now holds scalars.
df_final = df_outer_explode.withColumn(
    "inner_array",
    f.explode(f.col("inner_array")),
)
df_final.show()

+---+--------------------+-----------+
| id|              matrix|inner_array|
+---+--------------------+-----------+
|  1|[[1, 2, 3], [4, 5...|          1|
|  1|[[1, 2, 3], [4, 5...|          2|
|  1|[[1, 2, 3], [4, 5...|          3|
|  1|[[1, 2, 3], [4, 5...|          4|
|  1|[[1, 2, 3], [4, 5...|          5|
|  1|[[1, 2, 3], [4, 5...|          6|
|  2|[[10, 11, 20], [1...|         10|
|  2|[[10, 11, 20], [1...|         11|
|  2|[[10, 11, 20], [1...|         20|
|  2|[[10, 11, 20], [1...|         12|
|  2|[[10, 11, 20], [1...|         13|
|  2|[[10, 11, 20], [1...|         14|
|  3|[[7, 8, 9], [5, 5...|          7|
|  3|[[7, 8, 9], [5, 5...|          8|
|  3|[[7, 8, 9], [5, 5...|          9|
|  3|[[7, 8, 9], [5, 5...|          5|
|  3|[[7, 8, 9], [5, 5...|        555|
|  3|[[7, 8, 9], [5, 5...|        999|
+---+--------------------+-----------+



#### Combine the values that share the same id into a single row

In [0]:
# Keep only the id and the exploded value; the original nested `matrix`
# column is no longer needed.
df_new = df_final.select("id", "inner_array")
df_new.show()

+---+-----------+
| id|inner_array|
+---+-----------+
|  1|          1|
|  1|          2|
|  1|          3|
|  1|          4|
|  1|          5|
|  1|          6|
|  2|         10|
|  2|         11|
|  2|         20|
|  2|         12|
|  2|         13|
|  2|         14|
|  3|          7|
|  3|          8|
|  3|          9|
|  3|          5|
|  3|        555|
|  3|        999|
+---+-----------+



In [0]:
# Rename the exploded value column to the name used by the aggregation below.
df_new1 = df_new.withColumnRenamed(existing="inner_array", new="cgpas")

In [0]:
# Gather all values of each id back into one array column named "cgpa".
# collect_set removes duplicate values and does not guarantee element order.
df_clubbed = (
    df_new1
    .groupBy("id")
    .agg(f.collect_set(f.col("cgpas")).alias("cgpa"))
)
df_clubbed.show()

+---+--------------------+
| id|                cgpa|
+---+--------------------+
|  1|  [1, 5, 2, 6, 3, 4]|
|  3|[999, 9, 5, 7, 55...|
|  2|[12, 13, 20, 10, ...|
+---+--------------------+



In [0]:
# Mark the aggregated frame for caching and register it as the temp view
# "cgpa_list" so it can be queried from SQL cells.
# cache() returns the same DataFrame, so the two calls can be chained.
df_clubbed.cache().createOrReplaceTempView("cgpa_list")

In [0]:
%sql
-- Read the per-id value arrays back from the temp view registered above.
SELECT id, cgpa
FROM cgpa_list

id,cgpa
1,"List(1, 5, 2, 6, 3, 4)"
3,"List(999, 9, 5, 7, 555, 8)"
2,"List(12, 13, 20, 10, 14, 11)"
