### Welcome to source{d} jgit-spark-connector

In [None]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Examples")
    .getOrCreate()
)

# Build the engine over the repositories stored under /repositories;
# "siva" is passed as the repository format argument.
engine = Engine(spark, "/repositories", "siva")

# The row count is halved before printing -- presumably because every
# repository is listed twice (TODO confirm against the data).
print("%d repositories successfully loaded" % (engine.repositories.count() / 2))

### Print the schema to show the metadata available for repositories

In [None]:
# Print the schema (column names and types) of the repositories DataFrame.
engine.repositories.printSchema()

### Show the first 10 repositories in /repositories

Twice the number of repositories may be shown because of remote and local references in git.

In [None]:
# show() prints 20 rows by default; request exactly the first 10 rows.
engine.repositories.show(10)

### Filter for only the remote repositories

Prints the full remote reference (in the case of GitHub, it's the URL) of the first 10 repositories

In [None]:
# Distinct repository ids among the remote references; the second argument
# to show() (False) disables truncation so the full id is printed.
(
    engine.repositories
    .references
    .filter("is_remote = true")
    .select("repository_id")
    .distinct()
    .show(10, False)
)

### Print schema of blobs in all head commits

In [None]:
# printSchema() returns None, so keep the DataFrame itself in head_blobs and
# print its schema in a separate call instead of assigning the result of
# the whole chain.
head_blobs = (
    engine.repositories.filter("is_fork = false")
    .references.filter("is_remote = true")
    .head_ref.commits.tree_entries.blobs
)
head_blobs.printSchema()

### Get the first 50 Python blobs from head commits

In [None]:
# Non-fork repositories -> remote references -> head reference commits ->
# tree entries -> blobs. Keep the non-binary blobs classified as Python,
# extract their UASTs, limit the result to 50 rows and cache it.
head_blobs = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .filter("is_remote = true")
    .head_ref
    .commits
    .tree_entries
    .blobs
    .classify_languages()
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .extract_uasts()
    .limit(50)
    .cache()
)

### Get the schema

In [None]:
# Print the schema of the cached head_blobs DataFrame.
head_blobs.printSchema()

### Print result

In [None]:
# Print up to 50 rows, matching the limit(50) used to build head_blobs.
head_blobs.show(50)

### Or instead do a sampling of 5% of all the blobs in both Python & Java

In [None]:
# Same traversal as for the Python-only blobs, but keep both Python and Java
# blobs and take a sample (without replacement, fraction 0.05, seed 1024)
# before extracting UASTs, then cache the result.
head_blobs = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .filter("is_remote = true")
    .head_ref
    .commits
    .tree_entries
    .blobs
    .classify_languages()
    .filter("is_binary = false")
    .filter(col("lang").isin(["Python", "Java"]))
    .sample(False, 0.05, 1024)
    .extract_uasts()
    .cache()
)

In [None]:
# Number of rows in the sample; kept in `c` because a later cell uses it
# as the row count for show().
c = head_blobs.count()
print(c)

In [None]:
# Print every sampled row: `c` is the row count computed in the previous cell.
head_blobs.show(c)

### Get identifier tokens

In [None]:
# Run an XPath query over the UASTs: select nodes that carry a roleIdentifier
# attribute and no roleIncomplete attribute, and cache the result.
idents = head_blobs.query_uast('//*[@roleIdentifier and not(@roleIncomplete)]').cache()

In [None]:
# Print the schema of the query result.
idents.printSchema()

In [None]:
# Keep only the repository id, file path and query result columns, and drop
# duplicate rows.
idents = idents.select("repository_id","path","result").distinct()

In [None]:
# Extract the tokens from the selected identifier nodes and cache them for
# the cells that follow.
tokens = idents.extract_tokens().cache()

In [None]:
# Print the schema of the tokens DataFrame.
tokens.printSchema()

In [None]:
# Preview the first 10 rows of the tokens DataFrame.
tokens.limit(10).show()

### Filter blobs based on a specific token in the blob

In [None]:
# Filter on the "tokens" column before projecting down to "path", so the
# predicate refers to a column that is still present in the DataFrame it is
# applied to. Prints up to 50 untruncated paths whose tokens contain "print".
(
    tokens
    .where(array_contains("tokens", "print"))
    .select("path")
    .show(50, False)
)

### Top languages per number of blobs

The result only contains the languages kept by the filter applied to the blobs above (Python, or Python and Java if the sampling cell was run).

In [None]:
# Count distinct blobs per language and keep the ten most frequent.
# show() returns None, so the DataFrame is bound to top_ten_langs first and
# displayed in a separate call. A single descending sort replaces the
# original ascending orderBy that was immediately overridden.
top_ten_langs = (
    head_blobs.distinct()
    .groupBy("lang")
    .agg(count("*").alias("count"))
    .sort(desc("count"))
    .limit(10)
)
top_ten_langs.show()

### Decode the blobs into UTF-8 string

Instead of using the UASTs, sometimes you want to use the code as a string.

In [None]:
# Non-binary Python blobs from the head reference commits of non-fork
# repositories, sampled (without replacement, fraction 0.2, seed 4214), with
# the 'content' column replaced by its UTF-8 decoded string. Cached for the
# cells that follow.
decoded_blobs = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .tree_entries
    .blobs
    .classify_languages()
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .sample(False, 0.2, 4214)
    .withColumn('content', decode('content', 'UTF-8'))
    .cache()
)

In [None]:
# Print the schema of the decoded blobs DataFrame.
decoded_blobs.printSchema()

In [None]:
# Print the first rows of the decoded blobs (show() defaults to 20 rows).
decoded_blobs.show()

### Showing commits that have a certain commit message

In [None]:
# Commits reachable through the master reference of the remote references,
# cached because several later cells reuse them.
master_commits = (
    engine.repositories
    .references
    .filter("is_remote = true")
    .master_ref
    .commits
    .cache()
)

In [None]:
# Print the schema of the master commits DataFrame.
master_commits.printSchema()

In [None]:
# Number of distinct rows in master_commits.
master_commits.distinct().count()

In [None]:
# Distinct master commits whose message contains both "fix" and "bug"
# (two chained SQL LIKE filters), cached for the cells that follow.
bug_commits = (
    master_commits
    .filter(col("message").like("%fix%"))
    .filter(col("message").like("%bug%"))
    .distinct()
    .cache()
)

In [None]:
# Number of commits that matched both message filters.
bug_commits.count()

By taking samples instead of using limit, you can change the seed and see different results. The second argument to .show() is the boolean `truncate` flag: when True (the default), long strings are truncated to 20 characters; set it to False to print the full value of each row.

In [None]:
# Sample the matching commits (without replacement, fraction 0.05, seed 3421)
# and print up to 10 of them. truncate=False prints the full commit message;
# with True, strings longer than 20 characters would be cut off.
(
    bug_commits
    .select("repository_id", "message")
    .sample(False, 0.05, 3421)
    .show(10, False)
)

In [None]:
# Distinct master commits whose message contains both "fix" and "bug"
# (two chained SQL LIKE filters), cached for the cells that follow.
bug_commits = (
    master_commits
    .filter(col("message").like("%fix%"))
    .filter(col("message").like("%bug%"))
    .distinct()
    .cache()
)

In [None]:
# Number of commits that matched both message filters.
bug_commits.count()

By taking samples instead of using limit, you can change the seed and see different results. The second argument to .show() is the boolean `truncate` flag: when True (the default), long strings are truncated to 20 characters; set it to False to print the full value of each row.

In [None]:
# Sample the matching commits (without replacement, fraction 0.05, seed 3421)
# and print up to 10 of them. truncate=False prints the full commit message;
# with True, strings longer than 20 characters would be cut off.
(
    bug_commits
    .select("repository_id", "message")
    .sample(False, 0.05, 3421)
    .show(10, False)
)