In [37]:
from pyspark.sql.functions import *
from pyspark.sql.types import BooleanType

In [2]:
# Load a single GitHub event log file; no schema is supplied, so Spark infers it.
# To read every log file in the directory instead, pass "data-files/ghlog/*.json".
ghlog = spark.read.json(path="data-files/ghlog/2015-03-01-0.json")

In [3]:
# Top-level field names of each event record. Left as the cell's last
# expression so the notebook displays it, instead of routing it through print().
ghlog.columns

['actor', 'created_at', 'id', 'org', 'payload', 'public', 'repo', 'type']


In [None]:
# Print the schema of the loaded events in tree form.
ghlog.printSchema()

In [7]:
# Peek at the first five `actor` values without truncating long cell contents.
ghlog.select('actor').show(n=5, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------+
|actor                                                                                                                             |
+----------------------------------------------------------------------------------------------------------------------------------+
|[https://avatars.githubusercontent.com/u/739622?, , 739622, treydock, https://api.github.com/users/treydock]                      |
|[https://avatars.githubusercontent.com/u/9063348?, , 9063348, bezerrathm, https://api.github.com/users/bezerrathm]                |
|[https://avatars.githubusercontent.com/u/2474382?, , 2474382, demianborba, https://api.github.com/users/demianborba]              |
|[https://avatars.githubusercontent.com/u/9614759?, , 9614759, GoogleCodeExporter, https://api.github.com/users/GoogleCodeExporter]|
|[https://avatars.githubusercontent.com/u/3196287?, , 3196287, tedsan

In [9]:
# Enumerate every distinct event type present in the log.
(
    ghlog
    .select('type')
    .distinct()
    .show()
)

+--------------------+
|                type|
+--------------------+
|           PushEvent|
|         GollumEvent|
|        ReleaseEvent|
|  CommitCommentEvent|
|         CreateEvent|
|PullRequestReview...|
|   IssueCommentEvent|
|         DeleteEvent|
|         IssuesEvent|
|           ForkEvent|
|         PublicEvent|
|         MemberEvent|
|          WatchEvent|
|    PullRequestEvent|
+--------------------+



In [10]:
# Keep only the push events (`where` is an alias of `filter`).
pushes = ghlog.where(col('type') == "PushEvent")

In [11]:
# Row counts as a tuple: push events first, then all events.
tuple(frame.count() for frame in (pushes, ghlog))

(8793, 17786)

In [12]:
# Show the actor login for the first five push events.
(
    pushes
    .select('actor.login')
    .show(n=5)
)

+----------------+
|           login|
+----------------+
|      bezerrathm|
|     demianborba|
|ricardocastaneda|
|          ex3ndr|
|       furutachi|
+----------------+
only showing top 5 rows



In [14]:
# Number of push events per actor login; yields the columns `login` and `count`.
grouped = (
    pushes
    .groupBy('actor.login')
    .count()
)

In [15]:
# Preview five of the per-login push counts.
grouped.show(n=5)

+------------+-----+
|       login|count|
+------------+-----+
|john-griffin|    1|
|   digitized|    3|
| theCodeBear|    1|
|      WillHK|    1|
|  sakuya3834|    1|
+------------+-----+
only showing top 5 rows



In [16]:
# Sort logins by push count, largest first.
ordered = grouped.orderBy('count', ascending=False)

In [17]:
# The five logins with the most push events.
ordered.show(n=5)

+------------------+-----+
|             login|count|
+------------------+-----+
|      greatfirebot|  192|
|diversify-exp-user|  146|
|     KenanSulayman|   72|
|        manuelrp07|   45|
|    mirror-updates|   42|
+------------------+-----+
only showing top 5 rows



In [18]:
# Read the employee login file as an RDD of its lines.
employees = sc.textFile(name='data-files/github-employees.txt')

In [19]:
# Fetch the first five employee logins to the driver for a quick look.
employees.take(num=5)

['aclindsa', 'adamschwartz', 'ahsojar', 'AiMadobe', 'Akkyie']

In [20]:
# Pull the employee logins onto the driver as a plain Python list.
# `employees` is rebound from an RDD to a list here, so the call is guarded:
# without the check, re-running this cell fails because a list has no .collect().
if not isinstance(employees, list):
    employees = employees.collect()

In [22]:
# Check the type of the collected value and preview its first ten entries.
(type(employees), employees[0:10])

(list,
 ['aclindsa',
  'adamschwartz',
  'ahsojar',
  'AiMadobe',
  'Akkyie',
  'albertn198',
  'alexanderdidenko',
  'allelos',
  'andy-armstrong',
  'aprilx2222'])

In [23]:
# Broadcast the employee logins so each worker receives and keeps its own copy.
# A frozenset is broadcast instead of the raw list so that the `in` test run
# for every row is a hash lookup rather than a linear scan over all logins.
bc_employees = sc.broadcast(frozenset(employees))

In [29]:
def isEmployee(id):
    """Return True when `id` is among the broadcast employee logins, else False."""
    known_logins = bc_employees.value
    return id in known_logins

In [30]:
# Register isEmployee as a boolean UDF under the name "isEmployeeUDF" so it is
# usable from Spark SQL; the returned handle is used with DataFrame columns.
isEmployeeUDF = spark.udf.register(
    name="isEmployeeUDF",
    f=isEmployee,
    returnType=BooleanType(),
)

In [31]:
# Keep only the rows whose login passes the employee check.
filtered = ordered.filter(isEmployeeUDF(col('login')))

In [32]:
# Preview the first five employee rows with their push counts.
filtered.show(n=5)

+---------------+-----+
|          login|count|
+---------------+-----+
|  KenanSulayman|   72|
|     manuelrp07|   45|
|        Somasis|   26|
|direwolf-github|   24|
|EmanueleMinotto|   22|
+---------------+-----+
only showing top 5 rows



In [33]:
# Persist the filtered counts as JSON under output/ghlog.
# The default save mode raises when the target path already exists, which
# breaks any second run of the notebook; overwrite mode keeps the cell re-runnable.
filtered.write.format('json').mode('overwrite').save('output/ghlog')

In [34]:
# Number of partitions backing `filtered`.
filtered.rdd.getNumPartitions()

24

In [35]:
# Collapse the result into a single partition, then confirm the partition count.
# NOTE(review): repartition redistributes rows, so the descending-count order of
# `filtered` may not survive — confirm if the saved order matters.
repartitioned = filtered.repartition(numPartitions=1)
repartitioned.rdd.getNumPartitions()

1

In [36]:
# Persist the single-partition result as JSON under output/ghlog2.
# The default save mode raises when the target path already exists, which
# breaks any second run of the notebook; overwrite mode keeps the cell re-runnable.
repartitioned.write.format('json').mode('overwrite').save('output/ghlog2')