In [None]:
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Create (or reuse) a Spark session named "playground" that runs against the
# local master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("playground")
    .getOrCreate()
)

# Wrap the session in a source{d} engine; "repositories" is passed as the
# second argument (presumably the directory holding the repositories --
# confirm against the Engine constructor).
engine = Engine(spark, "repositories")

### Count the total number of non-fork repositories

In [None]:
# Count the distinct repository ids among repositories whose is_fork flag
# is false. Left as a bare expression so the notebook displays the result.
(
    engine.repositories
    .filter("is_fork = false")
    .select("id")
    .distinct()
    .count()
)

### Get all the files of all head commits

In [None]:
# Walk from non-fork repositories to the files of their head reference,
# detect each file's language, and keep only non-binary files for which a
# language was detected. The result is cached because later cells reuse it.
head_files = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .files
    .classify_languages()
    .filter("is_binary = false")
    .select("file_hash", "path", "content", "lang")
    .filter("lang is not null")
    .cache()
)

### Get the schema

In [None]:
# Print the column names and types of head_files.
head_files.printSchema()

### Print result

In [None]:
# Display the first rows of head_files (DataFrame.show() prints up to 20 rows
# by default).
head_files.show()

### Top ten languages by number of files

In [None]:
# Ten most common languages, ranked by the number of distinct head files.
#
# The DataFrame is bound to top_ten_langs *before* calling show(): show()
# returns None, so chaining it into the assignment would leave the variable
# empty. A single descending sort replaces the former orderBy("count")
# followed by sort(desc("count")), where the second ordering simply
# overrode the first.
top_ten_langs = (
    head_files.distinct()
    .groupBy("lang")
    .agg(count("*").alias("count"))
    .sort(desc("count"))
    .limit(10)
)
top_ten_langs.show()

### Count all Java files

In [None]:
 head_files.groupBy("lang").agg(count("*").alias("count")).filter("lang='Java'").show()

In [None]:
# Preview at most ten rows of head_files.
head_files.limit(10).show()

### Get identifiers of all Python files

In [None]:
# Identifier nodes of every non-binary Python file at the head reference of
# non-fork repositories.
#
# The is_binary / lang filters are applied right after language
# classification and *before* extract_uasts(), so UASTs are only extracted
# and queried for the Python files that end up in the result instead of for
# every file of every language. The selected rows are the same; only the
# amount of parsing work changes.
idents = (
    engine.repositories
    .filter("is_fork = false")
    .references
    .head_ref
    .commits
    .first_reference_commit
    .files
    .classify_languages()
    .filter("is_binary = false")
    .filter("lang = 'Python'")
    .extract_uasts()
    # XPath: nodes carrying the Identifier role but not the Incomplete role.
    .query_uast('//*[@roleIdentifier and not(@roleIncomplete)]')
    .select("file_hash", "result")
    .distinct()
)

### Get the tokens from the identifiers

In [None]:
# Derive tokens from the identifier query results in idents.
# NOTE(review): extract_tokens() comes from the engine's DataFrame wrapper;
# its output columns are not visible here -- check with tokens.printSchema().
tokens = idents.extract_tokens()

In [None]:
# Preview at most ten rows of tokens.
tokens.limit(10).show()