In [1]:
from dotenv import load_dotenv
from github import Github, Auth
import os

load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

if not GITHUB_TOKEN:
    raise ValueError("Specify GITHUB_TOKEN in .env file.")

g = Github(auth=Auth.Token(GITHUB_TOKEN))

In [6]:
query = "pyspark in:name,description"
sort = "stars"
order = "desc"
limit = 3

repos = g.search_repositories(query=query, sort=sort, order=order)

popular_repos = []
for i, repo in enumerate(repos):
    if i >= limit:
        break

    popular_repos.append({
        "name": repo.full_name,
        "url": repo.clone_url,
        "stars": repo.stargazers_count,
        "description": repo.description
    })

for repo in popular_repos:
    print(repo)

{'name': 'AlexIoannides/pyspark-example-project', 'url': 'https://github.com/AlexIoannides/pyspark-example-project.git', 'stars': 2013, 'description': 'Implementing best practices for PySpark ETL jobs and applications.'}
{'name': 'uber/petastorm', 'url': 'https://github.com/uber/petastorm.git', 'stars': 1863, 'description': 'Petastorm library enables single machine or distributed training and evaluation of deep learning models from datasets in Apache Parquet format. It supports ML frameworks such as Tensorflow, Pytorch, and PySpark and can be used from pure Python code.'}
{'name': 'jadianes/spark-py-notebooks', 'url': 'https://github.com/jadianes/spark-py-notebooks.git', 'stars': 1666, 'description': 'Apache Spark & Python (pySpark) tutorials for Big Data Analysis and Machine Learning as IPython / Jupyter notebooks'}


In [9]:
import git
import json
from nbconvert import PythonExporter
import nbformat
import tempfile
import subprocess

for repo in popular_repos[2:]:
    clone_url = repo["url"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # clone repo into temporary directory
        repo_dir = os.path.join(tmpdir, "repo")
        git.Repo.clone_from(clone_url, repo_dir, depth=1)

        # find .ipynb files and convert into .py files
        for root, _, files in os.walk(repo_dir):
            for file in files:
                if file.endswith(".ipynb"):
                    ipynb_path = os.path.join(root, file)
                    py_path = os.path.join(root, file.replace(".ipynb", ".py"))

                    with open(ipynb_path, "r", encoding="utf-8") as rf:
                        nb_node = nbformat.read(rf, as_version=4)

                        exporter = PythonExporter()
                        python_code, _ = exporter.from_notebook_node(nb_node)

                        with open(py_path, "w", encoding="utf-8") as wf:
                            wf.write(python_code)

        # use semgrep to detect PySpark sql, df, udf usage
        result = subprocess.run(
            ["semgrep", "scan", "--config", "pyspark-rules.yml", repo_dir, "--json"],
            capture_output=True,
            text=True,
            check=False
        )

        try:
            data = json.loads(result.stdout)
            print(data)
            matches = data.get("results", [])
            print(f"Found {len(matches)} potential UDF matches\n")

            for match in matches:
                file_path = match["path"]
                start_offset = match["start"]["offset"]
                end_offset = match["end"]["offset"]

                with open(file_path, "r") as f:
                    content = f.read()
                
                snippet = content[start_offset:end_offset]

                print("File: ", file_path)
                print(match["extra"]["message"])
                print(snippet)
                print("="*20)

        except json.JSONDecodeError:
            print("Semgrep output not valid JSON.")
            print(result.stdout[:500])



{'version': '1.140.0', 'results': [{'check_id': 'pyspark-df-detection', 'path': '/tmp/tmpngz4j9yj/repo/nb10-sql-dataframes/nb10-sql-dataframes.py', 'start': {'line': 116, 'col': 1, 'offset': 4469}, 'end': {'line': 116, 'col': 65, 'offset': 4533}, 'extra': {'message': 'Detected PySpark DF operation.', 'metadata': {}, 'severity': 'INFO', 'fingerprint': 'requires login', 'lines': 'requires login', 'validation_state': 'NO_VALIDATOR', 'engine_kind': 'OSS'}}, {'check_id': 'pyspark-df-detection', 'path': '/tmp/tmpngz4j9yj/repo/nb10-sql-dataframes/nb10-sql-dataframes.py', 'start': {'line': 116, 'col': 1, 'offset': 4469}, 'end': {'line': 116, 'col': 90, 'offset': 4558}, 'extra': {'message': 'Detected PySpark DF operation.', 'metadata': {}, 'severity': 'INFO', 'fingerprint': 'requires login', 'lines': 'requires login', 'validation_state': 'NO_VALIDATOR', 'engine_kind': 'OSS'}}, {'check_id': 'pyspark-df-detection', 'path': '/tmp/tmpngz4j9yj/repo/nb10-sql-dataframes/nb10-sql-dataframes.py', 'start