In [3]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Attribution 3.0 Unported (CC BY 3.0)
# https://www.kaggle.com/datasets/csanhueza/the-marvel-universe-social-network

spark = SparkSession.builder.appName("df_most_popular").getOrCreate()
csv_file_path = "file:///home/jovyan/work/sample/hero-network.csv"
# read file
df = spark.read\
            .option("header", "true")\
            .option("inferSchema", "true").csv(csv_file_path)

# # pyspark.sql.functions.collect_set(col): Aggregate function: returns a set of objects with duplicate elements eliminated.
data = df.groupBy("hero1")\
            .agg(
                f.collect_set("hero2").alias("connection"))\
            .withColumnRenamed("hero1", "hero")
# data.show()

# pyspark.sql.functions.concat_ws(sep, *cols): Concatenates multiple input string columns together into a single string column, using the given separator.
data = data.withColumn("connection", f.concat_ws(",", f.col("connection")))
data.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|ELSIE DEE,FURY, C...|
|ABSORBING MAN | MUTA|DRAX | MUTANT X-V...|
|ABSORBING MAN/CARL C|SOMMERS, APRIL,HE...|
|    ADAMSON, REBECCA|KABALLA,GOLEM III...|
|   ADVENT/KYLE GROBE|JUSTICE II/VANCE ...|
|      AGAMEMNON III/|ASTER, LUCIAN,HOG...|
|            AGAMOTTO|MUNIPOOR,DORMAMMU...|
|             AGGAMON|DR. STRANGE/STEPHEN |
|              AGINAR|SIF,REJECT/RAN-SA...|
|                AGON|MARISTA,BLACK BOL...|
|               AINET|STORM/ORORO MUNRO...|
|    AKUTAGAWA, OSAMU|HUMAN TORCH/JOHNN...|
|ALDEN, PROF. MEREDIT|CABE, BETHANY,STO...|
|             ALISTRO|ENCHANTRESS/AMORA...|
|       ALVAREZ, PAUL|ATOR, GENERAL,ZAR...|
|   AMERICAN SAMURAI/|PAGE, KAREN,DARED...|
|             AMPERE/|QUICKSILVER/PIETR...|
|           ANCESTOR/|RECORDER II,FOUND...|
|ANCIENT ONE/BARON MO|BLOODSTORM | MUTA...|
|    ANDERSSEN, TANYA|KA-ZAR/KEV

In [4]:
# # DataFrame.coalesce(numPartitions): Returns a new DataFrame that has exactly numPartitions partitions.
data.coalesce(1).write.option("header", True).csv("output")

In [5]:
# # load the file
csv_file_path = "file:///home/jovyan/work/output"
df = spark.read\
            .option("header", "true")\
            .option("inferSchema", "true")\
            .csv(csv_file_path)
df.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|ELSIE DEE,FURY, C...|
|ABSORBING MAN | MUTA|DRAX | MUTANT X-V...|
|ABSORBING MAN/CARL C|SOMMERS, APRIL,HE...|
|    ADAMSON, REBECCA|KABALLA,GOLEM III...|
|   ADVENT/KYLE GROBE|JUSTICE II/VANCE ...|
|      AGAMEMNON III/|ASTER, LUCIAN,HOG...|
|            AGAMOTTO|MUNIPOOR,DORMAMMU...|
|             AGGAMON| DR. STRANGE/STEPHEN|
|              AGINAR|SIF,REJECT/RAN-SA...|
|                AGON|MARISTA,BLACK BOL...|
|               AINET|STORM/ORORO MUNRO...|
|    AKUTAGAWA, OSAMU|HUMAN TORCH/JOHNN...|
|ALDEN, PROF. MEREDIT|CABE, BETHANY,STO...|
|             ALISTRO|ENCHANTRESS/AMORA...|
|       ALVAREZ, PAUL|ATOR, GENERAL,ZAR...|
|   AMERICAN SAMURAI/|PAGE, KAREN,DARED...|
|             AMPERE/|QUICKSILVER/PIETR...|
|           ANCESTOR/|RECORDER II,FOUND...|
|ANCIENT ONE/BARON MO|BLOODSTORM | MUTA...|
|    ANDERSSEN, TANYA|KA-ZAR/KEV

In [6]:
# pyspark.sql.functions.size(col): Collection function: returns the length of the array or map stored in the column.
df = df.withColumn(
        "connection_size",
        f.size(
            f.split(f.col("connection"), ",")))\
        .orderBy(f.desc("connection_size"))
df.show()

most_popular_hero = df.select("hero").first()
print(most_popular_hero.hero)

+--------------------+--------------------+---------------+
|                hero|          connection|connection_size|
+--------------------+--------------------+---------------+
|     CAPTAIN AMERICA|URICH, DORIS,ARMA...|           1795|
|SPIDER-MAN/PETER PAR|MAGMA II/JONATHAN...|           1737|
| IRON MAN/TONY STARK|RED SHIFT,SABRETO...|           1443|
|     WOLVERINE/LOGAN|SABRETOOTH/VICTOR...|           1278|
|THING/BENJAMIN J. GR|CHORD, ANDREW,CAT...|           1262|
| SCARLET WITCH/WANDA|SABRETOOTH/VICTOR...|           1246|
|HUMAN TORCH/JOHNNY S|CAT KING,BUZZ,MAK...|           1202|
|MR. FANTASTIC/REED R|ARMADILLO/ANTONIO...|           1200|
|THOR/DR. DONALD BLAK|PARKER, MAY | TIM...|           1183|
| INVISIBLE WOMAN/SUE|CAPTAIN MARVEL II...|           1143|
|BEAST/HENRY &HANK& P|AMERICAN EAGLE II...|           1140|
|              VISION|PHOSPHORUS,AMERIC...|           1110|
|                HAWK|AMERICAN EAGLE II...|           1086|
|CYCLOPS/SCOTT SUMMER|SABRETOOTH/VICTOR.