# Latihan Apache Spark Transformation

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Contoh Map

### Inisialisasi Spark Session

In [2]:
spark = SparkSession.builder.appName("mapExample").getOrCreate()

In [3]:
spark

## Create new dataframe

In [5]:
data = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
df = spark.createDataFrame(data, ["id", "name"])

## Using map equivalent in DataFrame API (withColumn)

In [7]:
df_new = df.withColumn("id_plus_one", F.col("id") + 1)
df_new.show()

+---+-------+-----------+
| id|   name|id_plus_one|
+---+-------+-----------+
|  1|  Alice|          2|
|  2|    Bob|          3|
|  3|Charlie|          4|
+---+-------+-----------+



## Contoh Filter

In [11]:
filtered_df = df.filter(F.col("id") > 1)
filtered_df.show()

+---+-------+
| id|   name|
+---+-------+
|  2|    Bob|
|  3|Charlie|
+---+-------+



## Contoh Union

### Add new dataframe

In [12]:
data2 = [(4, "David"), (5, "Eve")]
df2 = spark.createDataFrame(data2, ["id", "name"])

In [13]:
union_df = df.union(df2)
union_df.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
|  4|  David|
|  5|    Eve|
+---+-------+



## Contoh Join

### Define new dataframe with different column

In [14]:
data_salary = [
    (1,"45000", "IT"),(2,"145000", "Manager"),(3,"45000", "HR"),(6,"34000", "Sales")
]
column_data_salary = ['id','salary', 'department']
df_salary = spark.createDataFrame(data_salary,column_data_salary)
df_salary.show()

+---+------+----------+
| id|salary|department|
+---+------+----------+
|  1| 45000|        IT|
|  2|145000|   Manager|
|  3| 45000|        HR|
|  6| 34000|     Sales|
+---+------+----------+



### Inner join on two dataframes 

In [16]:
union_df.join(df_salary, union_df.id == df_salary.id, "inner").show()

+---+-------+---+------+----------+
| id|   name| id|salary|department|
+---+-------+---+------+----------+
|  1|  Alice|  1| 45000|        IT|
|  2|    Bob|  2|145000|   Manager|
|  3|Charlie|  3| 45000|        HR|
+---+-------+---+------+----------+



### Left Join on two dataframe

In [17]:
union_df.join(df_salary, union_df.id == df_salary.id, "left").show()

+---+-------+----+------+----------+
| id|   name|  id|salary|department|
+---+-------+----+------+----------+
|  1|  Alice|   1| 45000|        IT|
|  2|    Bob|   2|145000|   Manager|
|  3|Charlie|   3| 45000|        HR|
|  4|  David|null|  null|      null|
|  5|    Eve|null|  null|      null|
+---+-------+----+------+----------+



### Right Join on two dataframe

In [19]:
union_df.join(df_salary, union_df.id == df_salary.id, "left").show()

+---+-------+----+------+----------+
| id|   name|  id|salary|department|
+---+-------+----+------+----------+
|  1|  Alice|   1| 45000|        IT|
|  2|    Bob|   2|145000|   Manager|
|  3|Charlie|   3| 45000|        HR|
|  4|  David|null|  null|      null|
|  5|    Eve|null|  null|      null|
+---+-------+----+------+----------+



### Full Outer Join
This join joins the two dataframes with all matching and non-matching rows, we can perform this join in three ways


Syntax:

outer: dataframe1.join(dataframe2,dataframe1.column_name ==  dataframe2.column_name,”outer”)

full: dataframe1.join(dataframe2,dataframe1.column_name ==  dataframe2.column_name,”full”)

fullouter: dataframe1.join(dataframe2,dataframe1.column_name ==  dataframe2.column_name,”fullouter”)



### Example Outer

In [20]:
union_df.join(df_salary, union_df.id == df_salary.id, "outer").show()

+----+-------+----+------+----------+
|  id|   name|  id|salary|department|
+----+-------+----+------+----------+
|   1|  Alice|   1| 45000|        IT|
|   2|    Bob|   2|145000|   Manager|
|   3|Charlie|   3| 45000|        HR|
|   4|  David|null|  null|      null|
|   5|    Eve|null|  null|      null|
|null|   null|   6| 34000|     Sales|
+----+-------+----+------+----------+



### Example Full

In [21]:
union_df.join(df_salary, union_df.id == df_salary.id, "full").show()

+----+-------+----+------+----------+
|  id|   name|  id|salary|department|
+----+-------+----+------+----------+
|   1|  Alice|   1| 45000|        IT|
|   2|    Bob|   2|145000|   Manager|
|   3|Charlie|   3| 45000|        HR|
|   4|  David|null|  null|      null|
|   5|    Eve|null|  null|      null|
|null|   null|   6| 34000|     Sales|
+----+-------+----+------+----------+



### Example Fullouter

In [22]:
union_df.join(df_salary, union_df.id == df_salary.id, "fullouter").show()

+----+-------+----+------+----------+
|  id|   name|  id|salary|department|
+----+-------+----+------+----------+
|   1|  Alice|   1| 45000|        IT|
|   2|    Bob|   2|145000|   Manager|
|   3|Charlie|   3| 45000|        HR|
|   4|  David|null|  null|      null|
|   5|    Eve|null|  null|      null|
|null|   null|   6| 34000|     Sales|
+----+-------+----+------+----------+



## Contoh aggregate data

In [43]:
from pyspark.sql.functions import avg, max, min, count, sum, desc

In [23]:
data = [
    ("Andi", "Matematika", 85),
    ("Andi", "Fisika", 78),
    ("Budi", "Matematika", 90),
    ("Budi", "Fisika", 82),
    ("Cici", "Matematika", 88),
    ("Cici", "Fisika", 91)
]

In [25]:
# Membuat DataFrame
kolom = ["nama", "mata_pelajaran", "nilai"]
df_nilai = spark.createDataFrame(data, kolom)

In [27]:
# Menampilkan data asli
print("Data Asli:")
df_nilai.show()

Data Asli:
+----+--------------+-----+
|nama|mata_pelajaran|nilai|
+----+--------------+-----+
|Andi|    Matematika|   85|
|Andi|        Fisika|   78|
|Budi|    Matematika|   90|
|Budi|        Fisika|   82|
|Cici|    Matematika|   88|
|Cici|        Fisika|   91|
+----+--------------+-----+



### Contoh aggregate menggunakan fungsi average

In [30]:
# Aggregasi rata-rata nilai per mata pelajaran
print("Rata-rata Nilai per Mata Pelajaran:")
df_avg_subject = df_nilai.groupBy("mata_pelajaran") \
    .agg(avg("nilai").alias("rata_rata")) \
    .orderBy("mata_pelajaran")

df_avg_subject.show()

Rata-rata Nilai per Mata Pelajaran:
+--------------+-----------------+
|mata_pelajaran|        rata_rata|
+--------------+-----------------+
|        Fisika|83.66666666666667|
|    Matematika|87.66666666666667|
+--------------+-----------------+



In [32]:
# Aggregasi rata-rata nilai per siswa
print("Rata-rata Nilai per Siswa:")
df_avg_nama = df_nilai.groupBy("nama") \
    .agg(avg("nilai").alias("rata_rata")) \
    .orderBy("nama")

df_avg_nama.show()

Rata-rata Nilai per Siswa:
+----+---------+
|nama|rata_rata|
+----+---------+
|Andi|     81.5|
|Budi|     86.0|
|Cici|     89.5|
+----+---------+



### Contoh aggregate menggunakan fungsi max

In [37]:
# Aggregasi maksimal nilai per mata pelajaran
print("Nilai Tertinggi per Mata Pelajaran:")
df_max_subject = df_nilai.groupBy("mata_pelajaran") \
    .agg(max("nilai").alias("tertinggi")) \
    .orderBy("mata_pelajaran")

df_max_subject.show()

Nilai Tertinggi per Mata Pelajaran:
+--------------+---------+
|mata_pelajaran|tertinggi|
+--------------+---------+
|        Fisika|       91|
|    Matematika|       90|
+--------------+---------+



In [38]:
# Aggregasi maksimal nilai per siswa
print("Nilai Tertinggi Nilai per Siswa:")
df_max_nama = df_nilai.groupBy("nama") \
    .agg(max("nilai").alias("tertinggi")) \
    .orderBy("nama")

df_max_nama.show()

Nilai Tertinggi Nilai per Siswa:
+----+---------+
|nama|tertinggi|
+----+---------+
|Andi|       85|
|Budi|       90|
|Cici|       91|
+----+---------+



### Contoh aggregate menggunakan fungsi min

In [39]:
# Aggregasi nilai terrendah per mata pelajaran
print("Nilai Terrendah per Mata Pelajaran:")
df_min_subject = df_nilai.groupBy("mata_pelajaran") \
    .agg(min("nilai").alias("terrendah")) \
    .orderBy("mata_pelajaran")

df_min_subject.show()

Nilai Terrendah per Mata Pelajaran:
+--------------+---------+
|mata_pelajaran|terrendah|
+--------------+---------+
|        Fisika|       78|
|    Matematika|       85|
+--------------+---------+



In [45]:
# Aggregasi nilai terrendah per siswa
print("Nilai Terrendah per Siswa:")
df_min_nama = df_nilai.groupBy("nama") \
    .agg(min("nilai").alias("terrendah")) \
    .orderBy("nama")

df_min_nama.show()

Nilai Terrendah per Siswa:
+----+---------+
|nama|terrendah|
+----+---------+
|Andi|       78|
|Budi|       82|
|Cici|       88|
+----+---------+



In [47]:
# Aggregasi nilai terrendah per siswa dengan sorting nama descending
print("Nilai Terrendah per Siswa:")
df_min_nama = df_nilai.groupBy("nama") \
    .agg(min("nilai").alias("terrendah")) \
    .orderBy(desc("nama"))

df_min_nama.show()

Nilai Terrendah per Siswa:
+----+---------+
|nama|terrendah|
+----+---------+
|Cici|       88|
|Budi|       82|
|Andi|       78|
+----+---------+



# Contoh Spark Action Collect

## Informasi Action Collect
collect() adalah Spark action yang akan:
- Mengeksekusi semua transformasi sebelumnya
- Mengembalikan semua data ke driver program dalam bentuk list of Row objects
- Bisa digunakan untuk memproses data di lokal

## Catatan Penting
Gunakan collect() hanya untuk data kecil karena semua data akan dimuat ke memori driver
Untuk data besar, lebih aman menggunakan take() atau limit()
collect() biasanya digunakan untuk:
- Memvalidasi hasil transformasi
- Membuat visualisasi sederhana
- Mengekstrak hasil akhir ke sistem lokal

In [48]:
# Menggunakan action collect()
print("\nMenggunakan Collect untuk Data Mata Pelajaran:")
result_subject = df_avg_subject.collect()
for row in result_subject:
    print(f"{row.mata_pelajaran}: {row.rata_rata:.2f}")


Menggunakan Collect untuk Data Mata Pelajaran:
Fisika: 83.67
Matematika: 87.67


In [49]:
print("\nMenggunakan Collect untuk Data Siswa :")
result_students = df_avg_nama.collect()
for row in result_students:
    print(f"{row.nama}: {row.rata_rata:.2f}")


Menggunakan Collect untuk Data Siswa :
Andi: 81.50
Budi: 86.00
Cici: 89.50


In [50]:
# Tampilkan dalam format DataFrame asli
print("\nTampilan DataFrame Asli setelah Collect:")
df_avg_nama.show()


Tampilan DataFrame Asli setelah Collect:
+----+---------+
|nama|rata_rata|
+----+---------+
|Andi|     81.5|
|Budi|     86.0|
|Cici|     89.5|
+----+---------+



# Contoh Spark Action Count

## Informasi Action Count

1. count() adalah Spark action yang:
    - Menghitung jumlah record dalam DataFrame
    - Men-trigger eksekusi semua transformasi sebelumnya
    - Bisa digunakan di berbagai konteks (global, grouped, filtered)

2. Variasi penggunaan count:
    - Basic count: df.count()
    - Grouped count: df.groupBy().count()
    - Conditional count: df.filter(condition).count()
    - SQL count: Menggunakan sintaks SQL

3. Best practice:
    - Hindari count() pada DataFrame besar tanpa filter
    - Untuk data grouped, gunakan groupBy().count() daripada collect()
    - Gunakan approx_count_distinct() untuk data besar yang membutuhkan perkiraan

## 1. Count dasar untuk seluruh DataFrame

In [52]:
total_records = df_nilai.count()
print(f"\nTotal records dalam DataFrame: {total_records}")


Total records dalam DataFrame: 6


## 2. Count setelah aggregasi

In [53]:
df_avg_subject = df_nilai.groupBy("mata_pelajaran") \
    .agg(avg("nilai").alias("rata_rata"))

jumlah_mata_pelajaran = df_avg_subject.count()
print(f"Jumlah mata pelajaran unik: {jumlah_mata_pelajaran}")

Jumlah mata pelajaran unik: 2


## 3. Count dengan filter

In [54]:
count_fisika = df_nilai.filter(df_nilai.mata_pelajaran == "Fisika").count()
print(f"Jumlah nilai Fisika: {count_fisika}")

Jumlah nilai Fisika: 3


## 4. Count dalam grouped data

In [55]:
print("\nJumlah data per siswa:")
df_nilai.groupBy("nama").count().orderBy(desc("nama")).show()


Jumlah data per siswa:
+----+-----+
|nama|count|
+----+-----+
|Cici|    2|
|Budi|    2|
|Andi|    2|
+----+-----+



## Menghitung jumlah siswa dengan rata-rata di atas 85

In [56]:
df_avg = df_nilai.groupBy("nama").agg(avg("nilai").alias("rata_rata"))
count_top_students = df_avg.filter(df_avg.rata_rata > 85).count()
print(f"Jumlah siswa dengan rata-rata > 85: {count_top_students}")

Jumlah siswa dengan rata-rata > 85: 2


# Contoh Spark Action Save as file

## 1. Simpan sebagai CSV

In [57]:
df_avg.write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv("output/rata_nilai.csv")

print("Data berhasil disimpan dalam format CSV")

Data berhasil disimpan dalam format CSV


## 2. Simpan sebagai Parquet

In [59]:
df_avg.write \
    .mode("overwrite") \
    .parquet("output/rata_nilai.parquet")

print("Data berhasil disimpan dalam format Parquet")

Data berhasil disimpan dalam format Parquet
