In [7]:
from typing import Optional

from pyspark.sql import (
    functions as f,
    Row,
    SparkSession,
    types as t
)

In [None]:
# Reuse the active session if there is one; otherwise start one named "spark_lab".
spark = (
    SparkSession.builder
    .appName("spark_lab")
    .getOrCreate()
)

In [3]:
# functions.explode(col)
# Emits one output row per element of the given array or map column.
sample_row = Row(a=1, intlist=[1, 2, 3], mapfield={"a" : "b"})
df = spark.createDataFrame([sample_row])

# Explode the integer list and expose each element under the name "anInt".
exploded = f.explode(f.col("intlist")).alias("anInt")
df.select(exploded).collect()

[Row(anInt=1), Row(anInt=2), Row(anInt=3)]

In [4]:
# functions.split(str, pattern, limit=-1)
# Splits str around matches of the given regex pattern. With the default
# limit of -1 the pattern is applied as many times as possible.
df = spark.createDataFrame([Row(word="hello world and pyspark")])
tokens = f.split(f.col("word"), " ").alias("word")
df.select(tokens).collect()

[Row(word=['hello', 'world', 'and', 'pyspark'])]

In [8]:
# StructType.add(name, dataType, nullable) appends one StructField per call.
# All three columns are declared nullable.
table_schema = (
    t.StructType()
    .add("country", t.StringType(), True)
    .add("temperature", t.FloatType(), True)
    .add("observed_date", t.StringType(), True)
)

# Read the CSV with the explicit schema instead of inferring one.
csv_file_path = "data/raw/temp_with_date.csv"
df = spark.read.csv(csv_file_path, schema=table_schema)
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- temperature: float (nullable = true)
 |-- observed_date: string (nullable = true)



In [9]:
# Lowest temperature per country; the result column is named "min(temperature)".
data = df.select("country", "temperature", "observed_date")
per_country = data.groupBy("country")
min_temperature = per_country.agg(f.min("temperature"))
min_temperature.show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+----------------+
|             country|min(temperature)|
+--------------------+----------------+
|                Chad|           -24.0|
|            Anguilla|           -40.0|
|            Paraguay|            30.0|
|               Macao|           -34.0|
|Heard Island and ...|           -39.0|
|               Yemen|           -33.0|
|             Senegal|           -21.0|
|              Sweden|           -29.0|
|             Tokelau|           -35.0|
|            Kiribati|           -26.0|
|French Southern T...|           -22.0|
|   Republic of Korea|           -18.0|
|              Guyana|           -28.0|
|             Eritrea|           -40.0|
|         Philippines|           -34.0|
|              Jersey|           -21.0|
|      Norfolk Island|           -28.0|
|               Tonga|           -40.0|
|           Singapore|           -25.0|
|            Malaysia|           -21.0|
+--------------------+----------------+
only showing top 20 rows



                                                                                

In [10]:
# Celsius to Fahrenheit: F = (C * 9 / 5) + 32, e.g. 0 °C -> 32 °F.
# The converted value is exposed under the original column name "temperature".
fahrenheit = (f.col("temperature") * 9 / 5) + 32
f_temperature = data.select("country", fahrenheit.alias("temperature"))
f_temperature.show()

+--------------------+-------------------+
|             country|        temperature|
+--------------------+-------------------+
|                Guam|              -13.0|
|                Guam|              102.2|
|              Serbia|              -31.0|
|       French Guiana|               21.2|
|Falkland Islands ...|              -40.0|
|              Brazil|               59.0|
|             Tunisia|-23.799999999999997|
|            Portugal|               44.6|
|                Iran| -7.600000000000001|
|           Australia|               23.0|
|              Gambia|               69.8|
|               Italy|               87.8|
|          Guadeloupe|              -38.2|
|        South Africa|-11.200000000000003|
|              Malawi|               21.2|
|                Iran|               93.2|
|      Norfolk Island|               23.0|
|      Virgin Islands|               null|
|Lao People's Demo...|               77.0|
|   Republic of Korea|-0.3999999999999986|
+----------

In [11]:
# Total amount spent per customer, biggest spender first.
table_schema = (
    t.StructType()
    .add("customer_name", t.StringType(), True)
    .add("product_id", t.IntegerType(), True)
    .add("price", t.IntegerType(), True)
)

csv_file_path = "data/raw/product.csv"
df = spark.read.csv(csv_file_path, schema=table_schema)

# price is declared as an integer column, so rounding the sum to 2 decimals
# leaves it unchanged here; it only matters if price becomes fractional.
total_cost = f.round(f.sum("price"), 2).alias("cost")
customer_spent = df.groupBy("customer_name").agg(total_cost)

sorted_customer_spent = customer_spent.orderBy(f.desc("cost"))
sorted_customer_spent.show()

[Stage 7:>                                                          (0 + 1) / 1]

+-----------------+----+
|    customer_name|cost|
+-----------------+----+
|     Damion Wolfe|1397|
| Benedict Frazier| 998|
|  Giuseppe Miller| 997|
|    Garret Martin| 997|
|Erminia Robertson| 997|
|     Milan Gibson| 996|
|     Rudy Wheeler| 994|
|   Kathey Baldwin| 994|
|   Williemae Bell| 992|
|Gearldine Aguilar| 988|
|      Jewel Parks| 987|
|     Hyman Castro| 985|
|    Noriko Medina| 984|
|     Garfield Day| 982|
|      Dacia Adams| 981|
|     Taisha Henry| 980|
|    Branda Valdez| 978|
|     Fumiko Weber| 976|
|Geraldo Alexander| 975|
|      Walker Pope| 975|
+-----------------+----+
only showing top 20 rows



                                                                                

In [12]:
# Count rows per occupation_id and print the counts, most frequent first.
table_schema = (
    t.StructType()
    .add("interviewer_id", t.StringType(), False)
    .add("occupation_id", t.StringType(), False)
    .add("rating", t.IntegerType(), False)
)

csv_file_path = "data/raw/like.csv"

df = spark.read.csv(csv_file_path, schema=table_schema)

interviewer_count = (
    df.groupBy("occupation_id")
    .count()
    .orderBy(f.col("count").desc())
)

# "count" is aliased to "cnt" before collecting — presumably because
# `row.count` would resolve to the tuple method on Row rather than the field.
renamed_counts = interviewer_count.select("occupation_id", f.col("count").alias("cnt"))
for row in renamed_counts.collect():
    print(f"{row.occupation_id}: {row.cnt}")
    
    
# But what if we want to know which occupation each occupation_id refers to?
# 1100: engineer
# 2030: developer
# 3801: painter
# 3021: chemistry teacher
# 9382: priest

# Lookup table: occupation_id -> human-readable occupation name.
meta = dict([
    ("1100", "engineer"),
    ("2030", "developer"),
    ("3801", "painter"),
    ("3021", "chemistry teacher"),
    ("9382", "priest"),
])

# Wrap the table in a broadcast variable; readers access it through `.value`.
occupation_dict = spark.sparkContext.broadcast(meta)

def get_occupation_name(occupation_id: str) -> Optional[str]:
    """Look up the occupation name for an occupation id.

    Args:
        occupation_id: Key into the broadcast ``occupation_dict`` mapping.

    Returns:
        The mapped occupation name, or ``None`` when the id is not in the
        mapping (including a ``None`` id). Returning ``None`` instead of
        raising ``KeyError`` keeps a single unmapped row from failing the
        whole job when this function runs as a UDF.
    """
    return occupation_dict.value.get(occupation_id)

# Register the lookup as a UDF. StringType is spelled out here; it is also
# the return type f.udf uses when none is given.
occupation_lookup_udf = f.udf(get_occupation_name, t.StringType())

# Attach the resolved name next to each occupation_id count.
occupation_name = occupation_lookup_udf(f.col("occupation_id"))
occupation_with_name = interviewer_count.withColumn("occupation_name", occupation_name)

occupation_with_name.show(10)

1100: 217
3801: 203
2030: 200
3021: 191
9382: 189


[Stage 25:>                                                         (0 + 1) / 1]

+-------------+-----+-----------------+
|occupation_id|count|  occupation_name|
+-------------+-----+-----------------+
|         1100|  217|         engineer|
|         3801|  203|          painter|
|         2030|  200|        developer|
|         3021|  191|chemistry teacher|
|         9382|  189|           priest|
+-------------+-----+-----------------+



                                                                                

In [13]:
csv_file_path = "data/raw/hero-network.csv"

# Read the file, taking column names from the header row and inferring types.
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# pyspark.sql.functions.collect_set(col): aggregate that gathers the distinct values of a group.
# pyspark.sql.functions.concat_ws(sep, *cols): joins string (or array-of-string) columns
# into one string using the given separator.
data = (
    df.groupBy("hero1")
    .agg(f.collect_set("hero2").alias("connection"))
    .withColumnRenamed("hero1", "hero")
    # Flatten the set of connected heroes into one comma-separated string.
    .withColumn("connection", f.concat_ws(",", f.col("connection")))
)
data.show()



+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|ELSIE DEE,FURY, C...|
|ABOMINATION/EMIL BLO|PO,LOCUST,WATTS,M...|
|             ABSALOM|SHATTERSTAR II/GA...|
|ABSORBING MAN | MUTA|VALKYRIE II | MUT...|
|ABSORBING MAN/CARL C|SOMMERS, APRIL,HE...|
|ADAMS, CONGRESSMAN H|SPIDER-MAN/PETER ...|
| ADAMS, NICOLE NIKKI|JUSTICE II/VANCE ...|
|    ADAMSON, REBECCA|KABALLA,GOLEM III...|
|               ADRIA|DORMAMMU,ANCIENT ...|
|   ADVENT/KYLE GROBE|JUSTICE II/VANCE ...|
|AGAMEMNON II/ANDREI |BLACK WIDOW/NATASHA |
|      AGAMEMNON III/|ASTER, LUCIAN,HOG...|
|            AGAMOTTO|SATANNISH,DORMAMM...|
|         AGENT AXIS/|HUMAN TORCH ANDRO...|
|             AGGAMON|DR. STRANGE/STEPHEN |
|              AGINAR|SIF,REJECT/RAN-SA...|
|                AGON|MARISTA,BLACK BOL...|
|     AGUIRRE, ISOBEL|TERMINUS,HUMAN TO...|
|               AINET|STORM/ORORO MUNRO...|
|    AKUTAGAWA, OSAMU|HUMAN TORC

                                                                                

In [16]:
# DataFrame.coalesce(numPartitions): returns a new DataFrame that has exactly
# numPartitions partitions; with 1 partition the result is written as a single part file.
# mode("overwrite") replaces any existing data/output directory. Without it the
# write fails with "path already exists" whenever this cell is run a second time,
# which breaks Restart & Run All.
data.coalesce(1).write.mode("overwrite").option("header", True).csv("data/output")

                                                                                

In [17]:
# Read back the CSV directory written above: the first row is the header and
# column types are inferred from the data.
csv_file_path = "data/output"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)
df.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|ELSIE DEE,FURY, C...|
|ABOMINATION/EMIL BLO|PO,LOCUST,WATTS,M...|
|             ABSALOM|SHATTERSTAR II/GA...|
|ABSORBING MAN | MUTA|VALKYRIE II | MUT...|
|ABSORBING MAN/CARL C|SOMMERS, APRIL,HE...|
|ADAMS, CONGRESSMAN H|SPIDER-MAN/PETER ...|
| ADAMS, NICOLE NIKKI|JUSTICE II/VANCE ...|
|    ADAMSON, REBECCA|KABALLA,GOLEM III...|
|               ADRIA|DORMAMMU,ANCIENT ...|
|   ADVENT/KYLE GROBE|JUSTICE II/VANCE ...|
| AGAMEMNON II/ANDREI| BLACK WIDOW/NATASHA|
|      AGAMEMNON III/|ASTER, LUCIAN,HOG...|
|            AGAMOTTO|SATANNISH,DORMAMM...|
|         AGENT AXIS/|HUMAN TORCH ANDRO...|
|             AGGAMON| DR. STRANGE/STEPHEN|
|              AGINAR|SIF,REJECT/RAN-SA...|
|                AGON|MARISTA,BLACK BOL...|
|     AGUIRRE, ISOBEL|TERMINUS,HUMAN TO...|
|               AINET|STORM/ORORO MUNRO...|
|    AKUTAGAWA, OSAMU|HUMAN TORC

In [18]:
# pyspark.sql.functions.size(col): collection function returning the length of the
# array or map stored in the column.
# NOTE(review): the displayed data shows hero names that themselves contain commas
# (e.g. "URICH, DORIS"), so splitting on "," likely over-counts connections —
# confirm, and consider a separator that cannot occur in a name.
connection_parts = f.split(f.col("connection"), ",")
df = (
    df.withColumn("connection_size", f.size(connection_parts))
    .orderBy(f.col("connection_size").desc())
)
df.show()

+--------------------+--------------------+---------------+
|                hero|          connection|connection_size|
+--------------------+--------------------+---------------+
|     CAPTAIN AMERICA|URICH, DORIS,ARMA...|           1795|
|SPIDER-MAN/PETER PAR|RED SHIFT,GAMELIN...|           1737|
| IRON MAN/TONY STARK|RED SHIFT,SABRETO...|           1443|
|     WOLVERINE/LOGAN|SABRETOOTH/VICTOR...|           1278|
|THING/BENJAMIN J. GR|CHORD, ANDREW,CAT...|           1262|
| SCARLET WITCH/WANDA|SABRETOOTH/VICTOR...|           1246|
|HUMAN TORCH/JOHNNY S|CAT KING,BUZZ,MAK...|           1202|
|MR. FANTASTIC/REED R|ARMADILLO/ANTONIO...|           1200|
|THOR/DR. DONALD BLAK|PARKER, MAY | TIM...|           1183|
| INVISIBLE WOMAN/SUE|CAPTAIN MARVEL II...|           1143|
|BEAST/HENRY &HANK& P|AMERICAN EAGLE II...|           1140|
|              VISION|PHOSPHORUS,AMERIC...|           1110|
|                HAWK|AMERICAN EAGLE II...|           1086|
|CYCLOPS/SCOTT SUMMER|SABRETOOTH/VICTOR.