In [None]:
from sourced.spark import API as SparkAPI
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder\
.master("local[*]").appName("Examples")\
.getOrCreate()

api = SparkAPI(spark, "/repositories")

### Count the total number of non-fork repositories

In [None]:
# Drop forks, keep one row per repository id, and count what is left.
# The parenthesized chain is the cell's last expression, so the count is
# displayed as the cell output.
(
    api.repositories
    .filter("is_fork = false")
    .select("id")
    .distinct()
    .count()
)

### Get all the files of all head commits

In [None]:
# Walk repositories -> references -> HEAD reference -> commits -> files,
# keeping only non-fork repositories and commits with index 0
# (presumably the tip commit of each reference -- confirm against the API docs).
# Each file is tagged with its language; binary files and files without a
# detected language are dropped. The (lang, file_hash) pairs are marked for
# caching because several later cells reuse them.
head_files = (
    api.repositories.filter("is_fork = false")
    .references
    .head_ref
    .commits.filter("index = 0")
    .files
    .classify_languages()
    .filter("is_binary = false")
    .select("lang", "file_hash")
    .filter("lang is not null")
    .cache()
)

### Get the schema

In [None]:
# Print the column names and types of head_files.
head_files.printSchema()

### Print result

In [None]:
# Display the first rows of head_files (DataFrame.show prints 20 rows by default).
head_files.show()

### Top languages by number of files

In [None]:
# Rank languages by how many distinct (lang, file_hash) pairs they have,
# i.e. identical file contents are counted once per language.
#
# The DataFrame is assigned first and shown in a separate statement:
# DataFrame.show() returns None, so chaining it onto the assignment would
# leave `top_ten_langs` bound to None instead of the result.
# A single descending sort is enough; an ascending orderBy("count") placed
# before it would simply be overridden.
# NOTE(review): the variable name says "ten" but the limit is 20 -- kept at 20
# to preserve the displayed output; confirm which one is intended.
top_ten_langs = (
    head_files.distinct()
    .groupBy("lang")
    .agg(count("*").alias("count"))
    .orderBy(desc("count"))
    .limit(20)
)
top_ten_langs.show()

### Count the Java files

In [None]:
 # Number of rows in head_files whose language is Java, shown as a
 # one-row (lang, count) table.
 (
     head_files
     .filter("lang='Java'")
     .groupBy("lang")
     .agg(count("*").alias("count"))
     .show()
 )

### Unpersist DataFrame from cache

In [None]:
# Release the cached data held for head_files now that the language
# statistics above have been displayed.
head_files.unpersist()

### Identifiers count using Bblfsh and libUast integrations

In [None]:
# Collect identifier nodes from the non-binary Python files found in the
# index-0 commits of the HEAD reference of every non-fork repository.
#
# The row filters (is_binary, lang) are applied right after language
# classification, *before* extract_uasts(): UAST extraction is the expensive
# step, and there is no point parsing files that are discarded afterwards.
# The selected rows are the same as when filtering at the end.
#
# query_uast is called twice: the first call selects nodes carrying the
# Identifier role; the second is passed 'result' and 'result2', presumably
# the input and output column names, and keeps nodes without the Incomplete
# role -- confirm the argument order against the API docs.
head_files = (
    api.repositories.filter("is_fork = false")
    .references
    .head_ref
    .commits.filter("index = 0")
    .files
    .classify_languages()
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .extract_uasts()
    .query_uast('//*[@roleIdentifier]')
    .query_uast('/*[not(@roleIncomplete)]', 'result', 'result2')
    .select("lang", "file_hash", "result2")
    .distinct()
    .cache()
)

In [None]:
# Display the first rows of the identifier query (lang, file_hash, result2).
head_files.show()