In [None]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse) a Spark session running locally with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Examples")
    .getOrCreate()
)

# Engine bound to the session above.
# NOTE(review): the 2nd/3rd arguments look like the repositories path and
# the repository storage format -- confirm against the Engine API.
engine = Engine(spark, "/repositories", "siva")

### Count the total number of non-fork repositories

In [None]:
# Drop forks, de-duplicate on repository id, and return the number of rows.
(
    engine.repositories
    .filter("is_fork = false")
    .select("id")
    .distinct()
    .count()
)

### Get all the blobs of all head commits

In [None]:
# Walk non-fork repositories -> references -> head ref -> first reference
# commit -> tree entries -> blobs, attach a language to each blob, and keep
# only non-binary blobs whose language was detected. The result is cached
# because several later cells reuse it.
head_blobs = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .tree_entries
    .blobs
    .classify_languages()
    .filter("is_binary = false")
    .select("file_hash", "path", "content", "lang")
    .filter("lang is not null")
    .cache()
)

### Get the schema

In [None]:
# Print the column names and types of the cached head_blobs DataFrame.
head_blobs.printSchema()

### Print result

In [None]:
# Display the first rows of head_blobs (show() uses its default row limit).
head_blobs.show()

### Top ten languages by number of blobs

In [None]:
# Rank languages by the number of distinct head blobs and keep the ten
# largest. The DataFrame is bound to `top_ten_langs` *before* displaying it:
# `show()` returns None, so chaining it into the assignment (as the cell
# previously did) left `top_ten_langs` set to None and unusable afterwards.
# A single descending sort is enough; an ascending orderBy placed before it
# would simply be overridden.
top_ten_langs = (
    head_blobs.distinct()
    .groupBy("lang")
    .agg(count("*").alias("count"))
    .sort(desc("count"))
    .limit(10)
)

top_ten_langs.show()

### Count the Java blobs

In [None]:
 # Tally blobs per language and display only the row for Java.
 (
     head_blobs
     .groupBy("lang")
     .agg(count("*").alias("count"))
     .filter("lang='Java'")
     .show()
 )

In [None]:
# Preview at most ten rows of head_blobs.
head_blobs.limit(10).show()

### Get identifiers of all Python blobs

In [None]:
# Same traversal as for the head blobs (non-fork repositories down to the
# blobs of the first reference commit on the head ref), followed by language
# classification, UAST extraction and an XPath query that selects nodes
# carrying the Identifier role but not the Incomplete role. Only non-binary
# Python blobs are kept, reduced to distinct (file_hash, result) pairs.
# NOTE(review): the is_binary / lang filters run after extract_uasts();
# applying them earlier might avoid parsing unneeded blobs -- confirm with
# the engine API before reordering.
idents = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .tree_entries
    .blobs
    .classify_languages()
    .extract_uasts()
    .query_uast('//*[@roleIdentifier and not(@roleIncomplete)]')
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .select("file_hash", "result")
    .distinct()
)

### Get the tokens from the identifiers

In [None]:
# Run the engine's extract_tokens() over the identifier query results.
# NOTE(review): the output schema is not visible from this notebook -- confirm.
tokens = idents.extract_tokens()

In [None]:
# Preview at most ten rows of the extracted tokens.
tokens.limit(10).show()