In [1]:
from sourced.spark import API as SparkAPI
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running on the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Examples")
    .getOrCreate()
)

# Entry point used by every cell below.
# NOTE(review): the second argument is presumably the directory holding the
# repositories to analyse -- confirm against the SparkAPI documentation.
api = SparkAPI(spark, "/repositories")

In [2]:
# Each DataFrame is derived from the previous one by attribute access:
# repositories -> references -> commits -> files.
repositories_df = api.repositories
references_df = repositories_df.references
commits_df = references_df.commits
files_df = commits_df.files

### Get all HEAD references for all the repositories

In [3]:
# `head_ref` is read as an attribute (no call) on the references DataFrame.
heads_df = references_df.head_ref

### Get repositories that are not marked as forks

In [4]:
# Keep the repositories whose `is_fork` flag is unset.
# NOTE(review): `is_fork` is rendered as a boolean (`false`) in the tables
# further down; comparing it with 0 presumably relies on Spark's
# boolean/numeric coercion -- confirm.
no_forks_df = repositories_df.filter(repositories_df.is_fork == 0)

### Get only the HEAD references of original (non-fork) repositories

In [5]:
# HEAD references restricted to the repositories kept in `no_forks_df`.
no_forks_heads_df = (
    heads_df
    .join(no_forks_df, heads_df.repository_id == no_forks_df.id)
    # After the join the repository is already identified by `id`.
    .drop('repository_id')
    .distinct()
)

In [6]:
# Attach commit data to every non-fork HEAD reference.
#
# Matching on the hash alone is not enough: the same commit hash can occur in
# more than one repository (the table printed by `no_forks_heads_df.show()`
# lists two different repositories with an identical HEAD hash) and under more
# than one reference. A hash-only join would then pair a reference with commit
# rows that belong to another repository or reference. The repository and the
# reference name are therefore matched as well.
#
# An explicit join condition (rather than a column-name join) keeps the
# columns of both sides, so the result still carries both `hash` columns.
head_commits_df = no_forks_heads_df.join(
    commits_df,
    (no_forks_heads_df["hash"] == commits_df["hash"])
    & (no_forks_heads_df["id"] == commits_df["repository_id"])
    & (no_forks_heads_df["name"] == commits_df["reference_name"]),
)

In [7]:
# Print the non-fork HEAD references as a text table.
no_forks_heads_df.show()

+---------------+--------------------+--------------------+--------------------+-------+
|           name|                hash|                  id|                urls|is_fork|
+---------------+--------------------+--------------------+--------------------+-------+
|refs/heads/HEAD|3bbb74a39a2d7e55b...|github.com/geekco...|[https://github.c...|  false|
|refs/heads/HEAD|ab0da50d140873e61...|github.com/AlexPa...|[https://github.c...|  false|
|refs/heads/HEAD|3ba1266862a13c471...|github.com/antoni...|[https://github.c...|  false|
|refs/heads/HEAD|55c65f93887341953...|github.com/dotfen...|[https://github.c...|  false|
|refs/heads/HEAD|ab0da50d140873e61...|github.com/jmailh...|[https://github.c...|  false|
|refs/heads/HEAD|290440b64a73f5c7e...|github.com/mingra...|[https://github.c...|  false|
|refs/heads/HEAD|95bec4223682ba38a...|github.com/bluepi...|[https://github.c...|  false|
+---------------+--------------------+--------------------+--------------------+-------+



In [8]:
# Display the joined HEAD commits without the "blobs" and "files" columns.
(
    head_commits_df
    .drop("blobs")
    .drop("files")
    .show()
)

+---------------+--------------------+--------------------+--------------------+-------+--------------------+------------------+-----+--------------------+----------------+--------------------+--------------------+-------------+--------------------+-------------+-------------------+--------------------+--------------+-------------------+
|           name|                hash|                  id|                urls|is_fork|       repository_id|    reference_name|index|                hash|         message|             parents|                tree|parents_count|        author_email|  author_name|        author_date|     committer_email|committer_name|     committer_date|
+---------------+--------------------+--------------------+--------------------+-------+--------------------+------------------+-----+--------------------+----------------+--------------------+--------------------+-------------+--------------------+-------------+-------------------+--------------------+--------------+-

In [9]:
# Cap the files DataFrame at 20 rows before printing it.
files_df.limit(20).show()

+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------------+-----+--------------------+--------------------+--------------------+-------------+-------------------+-----------+-------------------+-------------------+--------------+-------------------+
|           file_hash|             content|         commit_hash|is_binary|                path|       repository_id|     reference_name|index|             message|             parents|               blobs|parents_count|       author_email|author_name|        author_date|    committer_email|committer_name|     committer_date|
+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------------+-----+--------------------+--------------------+--------------------+-------------+-------------------+-----------+-------------------+-------------------+--------------+-------------------+
|c35d5d02dae341e30.

In [10]:
# UASTs of the Python files at HEAD of a single repository.
(
    api.files(
        repository_ids=["github.com/mingrammer/funmath.git"],
        reference_names=["refs/heads/HEAD"],
    )
    # The `lang` column filtered on next is presumably added by this step.
    .classify_languages()
    .where('lang = "Python"')
    # Presumably adds the `uast` column selected below -- confirm.
    .extract_uasts()
    .select('repository_id', 'reference_name', 'path', 'lang', 'uast')
    .show()
)

+--------------------+---------------+--------------------+------+--------------------+
|       repository_id| reference_name|                path|  lang|                uast|
+--------------------+---------------+--------------------+------+--------------------+
|github.com/mingra...|refs/heads/HEAD|fibonacci/fibonac...|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads/HEAD|lcm/lcm_optimal_e...|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads/HEAD|prime/sieve_of_er...|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads/HEAD|prime/is_prime_im...|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads/HEAD|fibonacci/fibonac...|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads/HEAD|factorial/factori...|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads/HEAD|euclidean/distanc...|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads/HEAD|          gcd/gcd.py|Python|[0A 06 4D 6F 64 7...|
|github.com/mingra...|refs/heads

In [11]:
# UASTs of the Java files at HEAD of a single repository.
(
    api.files(
        repository_ids=["github.com/antoniolg/androidmvp.git"],
        reference_names=["refs/heads/HEAD"],
    )
    # The `lang` column filtered on next is presumably added by this step.
    .classify_languages()
    .where('lang = "Java"')
    # Presumably adds the `uast` column selected below -- confirm.
    .extract_uasts()
    .select('repository_id', 'reference_name', 'path', 'lang', 'uast')
    .show()
)

+--------------------+---------------+--------------------+----+--------------------+
|       repository_id| reference_name|                path|lang|                uast|
+--------------------+---------------+--------------------+----+--------------------+
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/java...|Java|[0A 0F 43 6F 6D 7...|
|github.com/antoni...|refs/heads/HEAD|app/src/main/jav