In [1]:
from pyspark.sql import SparkSession

import getpass
username = getpass.getuser()

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
    enableHiveSupport(). \
    appName(f'{username} | Python - Spark Metastore'). \
    master('yarn'). \
    getOrCreate()

In [2]:
df_raw = spark.createDataFrame([("Canada", "100;200;300"),
("Canada", "400;500;600"),
("Canada", "700;800;900"),
("India", "150;250;350"),
("India", "450;550;650"),
("India", "750;850;950"),
("USA", "111;222;333")], ["Country", "Values"])

In [3]:
df_raw.show()

+-------+-----------+
|Country|     Values|
+-------+-----------+
| Canada|100;200;300|
| Canada|400;500;600|
| Canada|700;800;900|
|  India|150;250;350|
|  India|450;550;650|
|  India|750;850;950|
|    USA|111;222;333|
+-------+-----------+



In [5]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
splitted_col = split(df_raw["Values"], ";")

country_1 = df_raw.withColumn("c1", splitted_col.getItem(0).cast(IntegerType()))
country_2 = country_1.withColumn("c2", splitted_col.getItem(1).cast(IntegerType()))
country_3 = country_2.withColumn("c3", splitted_col.getItem(2).cast(IntegerType()))

country_3.show()

+-------+-----------+---+---+---+
|Country|     Values| c1| c2| c3|
+-------+-----------+---+---+---+
| Canada|100;200;300|100|200|300|
| Canada|400;500;600|400|500|600|
| Canada|700;800;900|700|800|900|
|  India|150;250;350|150|250|350|
|  India|450;550;650|450|550|650|
|  India|750;850;950|750|850|950|
|    USA|111;222;333|111|222|333|
+-------+-----------+---+---+---+



In [6]:
country_sum = country_3.groupby("Country").sum()

country_sum.show()

+-------+-------+-------+-------+
|Country|sum(c1)|sum(c2)|sum(c3)|
+-------+-------+-------+-------+
|  India|   1350|   1650|   1950|
|    USA|    111|    222|    333|
| Canada|   1200|   1500|   1800|
+-------+-------+-------+-------+



In [7]:
df_res = country_sum.withColumn("Values", concat(col("sum(c1)"), lit(";"), col("sum(c2)"), lit(";"), col("sum(c3)"), lit(";"))). \
    drop("sum(c1)").drop("sum(c2)").drop("sum(c3)")

df_res.show()

+-------+---------------+
|Country|         Values|
+-------+---------------+
|  India|1350;1650;1950;|
|    USA|   111;222;333;|
| Canada|1200;1500;1800;|
+-------+---------------+



### The same above problem using arrayType column

https://kontext.tech/column/spark/316/pyspark-convert-python-arraylist-to-spark-data-frame

https://stackoverflow.com/questions/43444925/how-to-create-dataframe-from-list-in-spark-sql/50969995