In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "893b937a-206a-4738-9b10-f45bc1216923",
   "metadata": {},
   "source": [
    "# ETL with Spark (Local)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "22df453c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, TimestampType\n",
    "\n",
    "import pyspark.sql.functions as F"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "c8ddf25d",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = \"github_events_01.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "967f8dc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession.builder \\\n",
    "    .appName(\"ETL\") \\\n",
    "    .getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "d63bfa2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = spark.read.option(\"multiline\", \"true\").json(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f0a94871",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- actor: struct (nullable = true)\n",
      " |    |-- avatar_url: string (nullable = true)\n",
      " |    |-- display_login: string (nullable = true)\n",
      " |    |-- gravatar_id: string (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- login: string (nullable = true)\n",
      " |    |-- url: string (nullable = true)\n",
      " |-- created_at: string (nullable = true)\n",
      " |-- id: string (nullable = true)\n",
      " |-- org: struct (nullable = true)\n",
      " |    |-- avatar_url: string (nullable = true)\n",
      " |    |-- gravatar_id: string (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- login: string (nullable = true)\n",
      " |    |-- url: string (nullable = true)\n",
      " |-- payload: struct (nullable = true)\n",
      " |    |-- action: string (nullable = true)\n",
      " |    |-- comment: struct (nullable = true)\n",
      " |    |    |-- author_association: string (nullable = true)\n",
      " |    |    |-- body: string (nullable = true)\n",
      " |    |    |-- created_at: string (nullable = true)\n",
      " |    |    |-- html_url: string (nullable = true)\n",
      " |    |    |-- id: long (nullable = true)\n",
      " |    |    |-- issue_url: string (nullable = true)\n",
      " |    |    |-- node_id: string (nullable = true)\n",
      " |    |    |-- performed_via_github_app: string (nullable = true)\n",
      " |    |    |-- reactions: struct (nullable = true)\n",
      " |    |    |    |-- +1: long (nullable = true)\n",
      " |    |    |    |-- -1: long (nullable = true)\n",
      " |    |    |    |-- confused: long (nullable = true)\n",
      " |    |    |    |-- eyes: long (nullable = true)\n",
      " |    |    |    |-- heart: long (nullable = true)\n",
      " |    |    |    |-- hooray: long (nullable = true)\n",
      " |    |    |    |-- laugh: long (nullable = true)\n",
      " |    |    |    |-- rocket: long (nullable = true)\n",
      " |    |    |    |-- total_count: long (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |    |    |-- updated_at: string (nullable = true)\n",
      " |    |    |-- url: string (nullable = true)\n",
      " |    |    |-- user: struct (nullable = true)\n",
      " |    |    |    |-- avatar_url: string (nullable = true)\n",
      " |    |    |    |-- events_url: string (nullable = true)\n",
      " |    |    |    |-- followers_url: string (nullable = true)\n",
      " |    |    |    |-- following_url: string (nullable = true)\n",
      " |    |    |    |-- gists_url: string (nullable = true)\n",
      " |    |    |    |-- gravatar_id: string (nullable = true)\n",
      " |    |    |    |-- html_url: string (nullable = true)\n",
      " |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |-- login: string (nullable = true)\n",
      " |    |    |    |-- node_id: string (nullable = true)\n",
      " |    |    |    |-- organizations_url: string (nullable = true)\n",
      " |    |    |    |-- received_events_url: string (nullable = true)\n",
      " |    |    |    |-- repos_url: string (nullable = true)\n",
      " |    |    |    |-- site_admin: boolean (nullable = true)\n",
      " |    |    |    |-- starred_url: string (nullable = true)\n",
      " |    |    |    |-- subscriptions_url: string (nullable = true)\n",
      " |    |    |    |-- type: string (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |    |-- issue: struct (nullable = true)\n",
      " |    |    |-- active_lock_reason: string (nullable = true)\n",
      " |    |    |-- assignee: string (nullable = true)\n",
      " |    |    |-- assignees: array (nullable = true)\n",
      " |    |    |    |-- element: string (containsNull = true)\n",
      " |    |    |-- author_association: string (nullable = true)\n",
      " |    |    |-- body: string (nullable = true)\n",
      " |    |    |-- closed_at: string (nullable = true)\n",
      " |    |    |-- comments: long (nullable = true)\n",
      " |    |    |-- comments_url: string (nullable = true)\n",
      " |    |    |-- created_at: string (nullable = true)\n",
      " |    |    |-- events_url: string (nullable = true)\n",
      " |    |    |-- html_url: string (nullable = true)\n",
      " |    |    |-- id: long (nullable = true)\n",
      " |    |    |-- labels: array (nullable = true)\n",
      " |    |    |    |-- element: struct (containsNull = true)\n",
      " |    |    |    |    |-- color: string (nullable = true)\n",
      " |    |    |    |    |-- default: boolean (nullable = true)\n",
      " |    |    |    |    |-- description: string (nullable = true)\n",
      " |    |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |    |-- name: string (nullable = true)\n",
      " |    |    |    |    |-- node_id: string (nullable = true)\n",
      " |    |    |    |    |-- url: string (nullable = true)\n",
      " |    |    |-- labels_url: string (nullable = true)\n",
      " |    |    |-- locked: boolean (nullable = true)\n",
      " |    |    |-- milestone: struct (nullable = true)\n",
      " |    |    |    |-- closed_at: string (nullable = true)\n",
      " |    |    |    |-- closed_issues: long (nullable = true)\n",
      " |    |    |    |-- created_at: string (nullable = true)\n",
      " |    |    |    |-- creator: struct (nullable = true)\n",
      " |    |    |    |    |-- avatar_url: string (nullable = true)\n",
      " |    |    |    |    |-- events_url: string (nullable = true)\n",
      " |    |    |    |    |-- followers_url: string (nullable = true)\n",
      " |    |    |    |    |-- following_url: string (nullable = true)\n",
      " |    |    |    |    |-- gists_url: string (nullable = true)\n",
      " |    |    |    |    |-- gravatar_id: string (nullable = true)\n",
      " |    |    |    |    |-- html_url: string (nullable = true)\n",
      " |    |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |    |-- login: string (nullable = true)\n",
      " |    |    |    |    |-- node_id: string (nullable = true)\n",
      " |    |    |    |    |-- organizations_url: string (nullable = true)\n",
      " |    |    |    |    |-- received_events_url: string (nullable = true)\n",
      " |    |    |    |    |-- repos_url: string (nullable = true)\n",
      " |    |    |    |    |-- site_admin: boolean (nullable = true)\n",
      " |    |    |    |    |-- starred_url: string (nullable = true)\n",
      " |    |    |    |    |-- subscriptions_url: string (nullable = true)\n",
      " |    |    |    |    |-- type: string (nullable = true)\n",
      " |    |    |    |    |-- url: string (nullable = true)\n",
      " |    |    |    |-- description: string (nullable = true)\n",
      " |    |    |    |-- due_on: string (nullable = true)\n",
      " |    |    |    |-- html_url: string (nullable = true)\n",
      " |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |-- labels_url: string (nullable = true)\n",
      " |    |    |    |-- node_id: string (nullable = true)\n",
      " |    |    |    |-- number: long (nullable = true)\n",
      " |    |    |    |-- open_issues: long (nullable = true)\n",
      " |    |    |    |-- state: string (nullable = true)\n",
      " |    |    |    |-- title: string (nullable = true)\n",
      " |    |    |    |-- updated_at: string (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |    |    |-- node_id: string (nullable = true)\n",
      " |    |    |-- number: long (nullable = true)\n",
      " |    |    |-- performed_via_github_app: string (nullable = true)\n",
      " |    |    |-- reactions: struct (nullable = true)\n",
      " |    |    |    |-- +1: long (nullable = true)\n",
      " |    |    |    |-- -1: long (nullable = true)\n",
      " |    |    |    |-- confused: long (nullable = true)\n",
      " |    |    |    |-- eyes: long (nullable = true)\n",
      " |    |    |    |-- heart: long (nullable = true)\n",
      " |    |    |    |-- hooray: long (nullable = true)\n",
      " |    |    |    |-- laugh: long (nullable = true)\n",
      " |    |    |    |-- rocket: long (nullable = true)\n",
      " |    |    |    |-- total_count: long (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |    |    |-- repository_url: string (nullable = true)\n",
      " |    |    |-- state: string (nullable = true)\n",
      " |    |    |-- state_reason: string (nullable = true)\n",
      " |    |    |-- timeline_url: string (nullable = true)\n",
      " |    |    |-- title: string (nullable = true)\n",
      " |    |    |-- updated_at: string (nullable = true)\n",
      " |    |    |-- url: string (nullable = true)\n",
      " |    |    |-- user: struct (nullable = true)\n",
      " |    |    |    |-- avatar_url: string (nullable = true)\n",
      " |    |    |    |-- events_url: string (nullable = true)\n",
      " |    |    |    |-- followers_url: string (nullable = true)\n",
      " |    |    |    |-- following_url: string (nullable = true)\n",
      " |    |    |    |-- gists_url: string (nullable = true)\n",
      " |    |    |    |-- gravatar_id: string (nullable = true)\n",
      " |    |    |    |-- html_url: string (nullable = true)\n",
      " |    |    |    |-- id: long (nullable = true)\n",
      " |    |    |    |-- login: string (nullable = true)\n",
      " |    |    |    |-- node_id: string (nullable = true)\n",
      " |    |    |    |-- organizations_url: string (nullable = true)\n",
      " |    |    |    |-- received_events_url: string (nullable = true)\n",
      " |    |    |    |-- repos_url: string (nullable = true)\n",
      " |    |    |    |-- site_admin: boolean (nullable = true)\n",
      " |    |    |    |-- starred_url: string (nullable = true)\n",
      " |    |    |    |-- subscriptions_url: string (nullable = true)\n",
      " |    |    |    |-- type: string (nullable = true)\n",
      " |    |    |    |-- url: string (nullable = true)\n",
      " |-- public: boolean (nullable = true)\n",
      " |-- repo: struct (nullable = true)\n",
      " |    |-- id: long (nullable = true)\n",
      " |    |-- name: string (nullable = true)\n",
      " |    |-- url: string (nullable = true)\n",
      " |-- type: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Inspect the schema Spark inferred from the JSON file (no explicit schema was supplied).\n",
    "data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0a80aace",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register the DataFrame as a temp view so the following cells can query it via spark.sql.\n",
    "data.createOrReplaceTempView(\"staging_events\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b582ae43-5961-4a40-a9c0-696404ed1dc7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+\n",
      "|               actor|          created_at|         id|                 org|             payload|public|                repo|             type|\n",
      "+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+\n",
      "|{https://avatars....|2022-08-17T15:51:05Z|23487929637|{https://avatars....|{created, {COLLAB...|  true|{75340147, 350org...|IssueCommentEvent|\n",
      "+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the staged events. show() prints the rows and returns None,\n",
    "# so its result is deliberately not bound to a variable.\n",
    "spark.sql(\"\"\"\n",
    "    select\n",
    "        *\n",
    "        \n",
    "    from\n",
    "        staging_events\n",
    "\"\"\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "7347e0c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the event id, type and raw created_at string, and derive a date and a year\n",
    "# from created_at; the year column is what the write cells partition by.\n",
    "table = spark.sql(\"\"\"\n",
    "    select\n",
    "        id\n",
    "        , type\n",
    "        , created_at\n",
    "        , to_date(created_at) as date\n",
    "        , year(created_at) as year\n",
    "        \n",
    "    from\n",
    "        staging_events\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "917cf32e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+-----------------+--------------------+----------+----+\n",
      "|         id|             type|          created_at|      date|year|\n",
      "+-----------+-----------------+--------------------+----------+----+\n",
      "|23487929637|IssueCommentEvent|2022-08-17T15:51:05Z|2022-08-17|2022|\n",
      "+-----------+-----------------+--------------------+----------+----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the transformed rows before writing them out.\n",
    "table.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2b0dd36b-e33e-4f04-8cb4-46c307b0d8b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative output locations for the CSV and Parquet copies written by the cells below.\n",
    "output_csv = \"output_csv\"\n",
    "output_parquet = \"output_parquet\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "74af3629",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the table as CSV, partitioned by year, replacing any previous output.\n",
    "(\n",
    "    table.write\n",
    "    .partitionBy(\"year\")\n",
    "    .mode(\"overwrite\")\n",
    "    .csv(output_csv)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5fcbfd76",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the same table as Parquet, partitioned by year, replacing any previous output.\n",
    "(\n",
    "    table.write\n",
    "    .partitionBy(\"year\")\n",
    "    .mode(\"overwrite\")\n",
    "    .parquet(output_parquet)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "194c1b1c-7515-49a0-b480-9a3ed3b1eac7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}