## Create Final Table
清理后的数据中，课程信息和注册信息混杂在一起。这不仅带来了冗余，也带来了一些麻烦，例如：
- 未来我们想要实现按教授名索引课程的功能，然而，部分课程由多个教授任课，这给我们分裂不同课程带来了麻烦。
- 未来我们可能会加入关于不同季度课程的评分数据，如果添加进当前表中会使其非常冗杂。

因此，我们决定将当前清理过的表分为四张表：
- Professors
  - 存储教授信息
  - `prof_id`: 主键
  - `prof_last_name`: 教授的姓, 不可为null
  - `prof_first_name`: 教授的名, 可为null
  - `prof_middle_name`: 中间名, 可谓null

- Courses
  - 存储课程信息
  - `course_offering_id`: 主键
  - `department`: 部门
  - `course_id`: 课程编号
  - `year`: 学年
  - `quarter`: 季度
  - `total`: 总座位数

- Course_Professors
  - 链接表，链接课程和教授。为什么不做列？：因为同一门课可能有多个教授。
  - `course_offering_id`: 外键，连接到 Course 表
  - `prof_id`: 外键，链接到 Professors 表

- Enrollment_Snapshots
  - 注册数据快照表
  - `snapshot_id`: 主键
  - `course_offering_id`: 外键，连接到 Course 表
  - `snapshot_date`: 日期
  - `enrolled_count`: 注册人数
  - `waitlist_count`: 候补名单人数

在未来，我们还可能添加 Comments 表，Course_Rating 表, Professor_Rating 表，Department_Requirement 表等

## Create Process

#### Required parameters and test S3 connection

In [0]:
import json
import os
import uuid
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sum, when, lit
from pyspark.sql.functions import sum, avg, max, min, count, countDistinct, first, last, mean, stddev, collect_list, collect_set, approx_count_distinct, expr
from pyspark.sql.functions import col, from_unixtime, to_timestamp, date_format, row_number
from pyspark.sql.functions import split, locate, explode, trim, substring, size


# 必要的参数，链接 AWS S3


In [0]:
spark.conf.set("fs.s3a.access.key", AWS_ACCESS_KEY)
spark.conf.set("fs.s3a.secret.key", AWS_SECRET_KEY)

In [0]:
# 关于S3的基本参数
base_path = f"s3a://{BUCKET_NAME}/ucsd"
path_final_data = f"{base_path}/final/final"
path_final_table = f"{base_path}/final_table"

try:
    df = spark.read.csv(f"{path_final_data}", header=True, inferSchema=True)
    display(df.show(3))
    df.printSchema()
except Exception as e:
    print(f"Table read failed: {e}")

+--------------------+----------+-----+--------+-----------+----+-------+----------+---------+
|                prof|      date|total|waitlist|enrolled_ct|year|quarter|department|course_id|
+--------------------+----------+-----+--------+-----------+----+-------+----------+---------+
|Butler; Elizabeth...|2024-01-05|   68|      11|         68|2024| Winter|       AAS|       10|
|Butler; Elizabeth...|2023-11-25|   68|       0|         47|2024| Winter|       AAS|       10|
|Butler; Elizabeth...|2023-11-28|   68|       9|         66|2024| Winter|       AAS|       10|
+--------------------+----------+-----+--------+-----------+----+-------+----------+---------+
only showing top 3 rows

root
 |-- prof: string (nullable = true)
 |-- date: date (nullable = true)
 |-- total: integer (nullable = true)
 |-- waitlist: integer (nullable = true)
 |-- enrolled_ct: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- quarter: string (nullable = true)
 |-- department: string (nullable = 

#### Tool functions

In [0]:
# uuid 生成函数
uuid_udf = F.udf(lambda: str(uuid.uuid4()), StringType())

# 分裂 prof 为first_name, last_name, middle_name
def split_prof_name(df):

    # 分裂多个教授授课的课程
    # 有一些课程由多个教授授课，这种情况下教授名字被 & 链接
    # 例如 Bafna; Vineet & Zhong; Sheng 
    # 将其分裂为多行
    df = df.withColumn("prof", explode(split(col("prof"), "& "))) \
            .withColumn("prof", trim(col("prof")))

    # 对于professor的名字，其格式为 last_name; first_name middle_name(可能为null)
    # 将其分裂为三列
    # 如果没有中间名，prof_middle_name 列为null
    # 如果教授为 Staff，prof_first_name 和 prof_last_name 都为 Staff

    df = df.withColumn("isStaff", col("prof") == lit("Staff"))

    df = df.withColumn(
        "prof_last_name",
        when(col("isStaff"), "Staff")
        .otherwise(trim(split(col("prof"), "; ").getItem(0)))
    ).withColumn(
        "first_middle_name",
        when(col("isStaff"), "Staff")
        .otherwise(trim(split(col("prof"), "; ").getItem(1)))
    )

    df = df.withColumn(
        "prof_first_name", 
        when(col("isStaff"), "Staff")
        .otherwise(split(col("first_middle_name"), " ", 2).getItem(0))
    ).withColumn(
        "prof_middle_name",
        when(col("isStaff"), lit(None))
        .otherwise(
            # 检查有没有middlename
            when(size(split(col("first_middle_name"), " ")) > 1, split(col("first_middle_name"), " ", 2).getItem(1))
            .otherwise(lit(None))
        )
    )
    
    df = df.drop("isStaff", "first_middle_name")

    return df

# 准备 professors 表的数据
def create_prof_table_data(df_original):
    df = df_original
    df = df.select("prof").distinct()
    
    # 分裂教授名字
    df = split_prof_name(df)
    df = df.drop("prof").distinct()

    # 生成主键列
    df = df.withColumn("prof_id", uuid_udf())

    return df

# 准备 Courses 表的数据
def create_courses_table_data(df):
    df = df.select("department", "course_id", "year", "quarter", "total").distinct()
    # 生成主键
    df = df.withColumn("course_offering_id", uuid_udf())
    return df

# 建立 Courses-Professors 连接表
# 两张表之间暂时的JOIN列为 prof 列
def create_courses_professors_table_data(courses, professors, registrations_original):
    
    # 现在多了 prof_first_name, prof_last_name, prof_middle_name 列
    df = split_prof_name(registrations_original)

    # 获取 prof_id
    df = df.join(professors, on=["prof_first_name", "prof_last_name", "prof_middle_name"], how="inner")

    # 获取 courses_id
    df = df.join(courses, on=["year", "quarter", "department", "course_id", "total"], how="inner")

    # 只取需要的两列
    df = df.select("prof_id", "course_offering_id").distinct()

    return df

def create_enrollment_snapshots_table_data(registrations_original):
    df = registrations_original
    
    # 获取 courses_id
    df = df.join(courses, on=["year", "quarter", "department", "course_id", "total"], how="inner")

    # 只取需要的列
    df = df.select("date", "waitlist", "enrolled_ct", "course_offering_id")

    return df


In [0]:
data_final = df = spark.read.csv(f"{path_final_data}", header=True, inferSchema=True)

In [0]:
df_professors = create_prof_table_data(data_final)
df_professors.show(3)

+--------------+---------------+----------------+--------------------+
|prof_last_name|prof_first_name|prof_middle_name|             prof_id|
+--------------+---------------+----------------+--------------------+
|     Borkowski|        Michael|           Humes|e3b57a0c-f7ee-46f...|
|           Som|        Brandon|               D|8affeedc-fbfc-4b9...|
|    Schurmeier|       Kimberly|            null|d32f18d1-e2e8-48d...|
+--------------+---------------+----------------+--------------------+
only showing top 3 rows



In [0]:
df_courses = create_courses_table_data(data_final)
df_courses.show(3)

+----------+---------+----+-------+-----+--------------------+
|department|course_id|year|quarter|total|  course_offering_id|
+----------+---------+----+-------+-----+--------------------+
|       AAS|       11|2024| Winter|   68|a6ced994-4dbd-4e1...|
|      AESE|     278B|2024| Winter|   35|538f65b9-72e6-404...|
|       AAS|       10|2024| Winter|   68|2d0e2d58-55e7-4c2...|
+----------+---------+----+-------+-----+--------------------+
only showing top 3 rows



In [0]:
df_courses_professors = create_courses_professors_table_data(df_courses, df_professors, data_final)
df_courses_professors.show(10)

+--------------------+--------------------+
|             prof_id|  course_offering_id|
+--------------------+--------------------+
|d604ad46-e95a-4fc...|28ccc907-6990-4b0...|
|60198c99-ced1-447...|baf4e4f2-426f-4ce...|
|35cdec05-a87e-4d7...|28ccc907-6990-4b0...|
|35cdec05-a87e-4d7...|963b500f-e578-41a...|
|b168e66f-1785-494...|b2347f63-5668-4e3...|
|9cf9dde4-e897-4e8...|56047e23-a84c-447...|
|50db2a59-592c-466...|f8275cb2-4b8b-4b2...|
|50db2a59-592c-466...|fcc0c32a-11b4-409...|
|f94d2751-5e7e-464...|ce15ce21-189e-4f3...|
|d604ad46-e95a-4fc...|6563b4c0-cd18-430...|
+--------------------+--------------------+
only showing top 10 rows



In [0]:
df_courses_professors.count()

Out[65]: 1635

In [0]:
data_final.select("department", "course_id", "year", "quarter", "prof").where(col("prof") != lit("Staff")).distinct().count()

Out[67]: 2741