In [1]:
from pyspark.sql import SparkSession

import getpass
# OS-level login of whoever runs the notebook; reused below for the warehouse
# path, the application name and (in later cells) the database name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session on YARN.
#   spark.ui.port = 0        -> NOTE(review): port 0 normally means "bind to any
#                               free port"; presumably chosen so sessions on a
#                               shared host don't collide -- confirm.
#   spark.sql.warehouse.dir  -> per-user warehouse directory under /user/<username>.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Analyze GitHub Archive Data')
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Load the `ghactivity` table from the current user's `<username>_ghraw_db`
# database as a DataFrame; every later cell queries this `ghdata` frame.
ghdata = spark.read.table(f'{username}_ghraw_db.ghactivity')

In [4]:
# Print the column tree of ghdata to see which (nested) fields are available,
# e.g. the `payload` struct queried in later cells.
ghdata.printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (nul

In [5]:
# Show every distinct value of the top-level `type` column.
# truncate=False keeps long type names from being cut off in the output.
(
    ghdata
    .select('type')
    .distinct()
    .show(truncate=False)
)

+-----------------------------+
|type                         |
+-----------------------------+
|PullRequestReviewEvent       |
|PushEvent                    |
|GollumEvent                  |
|ReleaseEvent                 |
|CommitCommentEvent           |
|CreateEvent                  |
|PullRequestReviewCommentEvent|
|IssueCommentEvent            |
|DeleteEvent                  |
|IssuesEvent                  |
|ForkEvent                    |
|PublicEvent                  |
|MemberEvent                  |
|WatchEvent                   |
|PullRequestEvent             |
+-----------------------------+



In [6]:
# Show every distinct value of the nested `payload.ref_type` field
# (rows without a ref_type appear as null).
(
    ghdata
    .select('payload.ref_type')
    .distinct()
    .show()
)

+----------+
|  ref_type|
+----------+
|      null|
|       tag|
|    branch|
|repository|
+----------+



In [7]:
from pyspark.sql.functions import count, col, lit

# Count rows per `payload.ref_type` and list the groups from most to least
# frequent. count(lit(1)) counts every row in the group, including the group
# whose ref_type is null.
(
    ghdata
    .groupBy('payload.ref_type')
    .agg(count(lit(1)).alias('event_count'))
    .orderBy(col('event_count').desc())
    .show()
)

+----------+-----------+
|  ref_type|event_count|
+----------+-----------+
|      null|    4715550|
|    branch|     664177|
|repository|     234416|
|       tag|      72786|
+----------+-----------+



In [6]:
# Total number of rows in ghdata; the per-ref_type counts from the groupBy
# cell should add up to this figure.
# NOTE(review): this cell shows execution count 6 although it sits after
# cell 7 -- it was run out of order; restart the kernel and run all cells so
# the saved counts are sequential.
ghdata.count()

5686929

In [8]:
from pyspark.sql.functions import substring, count, col, lit

# Number of repositories created per day.
#
# Keeps only CreateEvent rows whose payload.ref_type is 'repository', then
# groups on the first 10 characters of the `created_at` string (used here as
# the date part, matching the YYYY-MM-DD values in the output) and counts the
# rows in each group.
#
# The SQL predicate uses single-quoted string literals: double-quoted text is
# only treated as a string by Spark's default parser settings and is read as a
# column identifier when `spark.sql.ansi.doubleQuotedIdentifiers` is enabled,
# whereas single quotes are string literals under every setting.
(
    ghdata
    .filter("payload.ref_type = 'repository' AND type = 'CreateEvent'")
    .groupBy(substring('created_at', 1, 10).alias('created_dt'))
    .agg(count(lit(1)).alias('repo_count'))
    .orderBy('created_dt')
    .show()
)

+----------+----------+
|created_dt|repo_count|
+----------+----------+
|2021-01-13|    118524|
|2021-01-14|    115892|
+----------+----------+

