In [1]:
from dotenv import load_dotenv
from github import Github, Auth
import os

# Load variables from a local .env file (if present) into the process
# environment, then look up the GitHub access token.
load_dotenv()
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")

# Fail fast: a missing or empty token is rejected before any client is built.
if GITHUB_TOKEN is None or GITHUB_TOKEN == "":
    raise ValueError("Specify GITHUB_TOKEN in .env file.")

# Authenticated GitHub client used by the cells below.
g = Github(auth=Auth.Token(GITHUB_TOKEN))

In [2]:
# Search parameters: repositories mentioning "pyspark" in their name or
# description, ordered by star count (highest first), capped at `limit`.
query = "pyspark in:name,description"
sort = "stars"
order = "desc"
limit = 20

repos = g.search_repositories(query=query, sort=sort, order=order)

# Keep only the fields needed later. Pairing with range(limit) stops the
# iteration after at most `limit` search results.
popular_repos = [
    {
        "name": found.full_name,
        "url": found.clone_url,
        "stars": found.stargazers_count,
        "description": found.description,
    }
    for _, found in zip(range(limit), repos)
]

# Show the selected repositories, one dict per line.
for entry in popular_repos:
    print(entry)

{'name': 'AlexIoannides/pyspark-example-project', 'url': 'https://github.com/AlexIoannides/pyspark-example-project.git', 'stars': 2013, 'description': 'Implementing best practices for PySpark ETL jobs and applications.'}
{'name': 'uber/petastorm', 'url': 'https://github.com/uber/petastorm.git', 'stars': 1865, 'description': 'Petastorm library enables single machine or distributed training and evaluation of deep learning models from datasets in Apache Parquet format. It supports ML frameworks such as Tensorflow, Pytorch, and PySpark and can be used from pure Python code.'}
{'name': 'jadianes/spark-py-notebooks', 'url': 'https://github.com/jadianes/spark-py-notebooks.git', 'stars': 1666, 'description': 'Apache Spark & Python (pySpark) tutorials for Big Data Analysis and Machine Learning as IPython / Jupyter notebooks'}
{'name': 'ptyadana/SQL-Data-Analysis-and-Visualization-Projects', 'url': 'https://github.com/ptyadana/SQL-Data-Analysis-and-Visualization-Projects.git', 'stars': 1578, 'de

In [13]:
import git
import json
from nbconvert import PythonExporter
import nbformat
import tempfile
import subprocess

# Create results jsonl
os.makedirs("results", exist_ok=True)
summary_path = os.path.join("results", "summary.jsonl")


def _convert_notebooks(repo_dir):
    """Export every ``.ipynb`` under ``repo_dir`` to a ``CONVERTED<name>.py`` sibling.

    Conversion is best effort: a notebook that cannot be read, exported or
    written is reported and skipped so one bad notebook does not abort the run.
    Only ``Exception`` is caught, so Ctrl-C / kernel interrupt still works.
    """
    exporter = PythonExporter()
    for root, _dirs, files in os.walk(repo_dir):
        for file in files:
            if not file.endswith(".ipynb"):
                continue

            ipynb_path = os.path.join(root, file)
            py_path = os.path.join(root, "CONVERTED" + os.path.splitext(file)[0] + ".py")
            try:
                with open(ipynb_path, "r", encoding="utf-8") as ipynbf:
                    nb_node = nbformat.read(ipynbf, as_version=4)
                python_code, _resources = exporter.from_notebook_node(nb_node)
                with open(py_path, "w", encoding="utf-8") as pyf:
                    pyf.write(python_code)
            except Exception as exc:
                print(f"Skipping notebook {ipynb_path}: {exc}")


def _collect_matches(matches, repo_dir):
    """Group semgrep matches by file path (relative to ``repo_dir``).

    Returns a dict ``{rel_path: {"udfs": [...], "df_exprs": [...]}}`` where
    each UDF entry holds the rule message as ``name`` and the matched source
    text as ``def``, and each dataframe expression is the matched source text.
    """
    file_dic = {}
    raw_by_path = {}  # each matched file is read once, not once per match
    for match in matches:
        file_path = match["path"]
        rel_path = os.path.relpath(file_path, repo_dir)
        entry = file_dic.setdefault(rel_path, {"udfs": [], "df_exprs": []})

        if file_path not in raw_by_path:
            # Semgrep reports offsets as byte positions, so slice the raw
            # bytes; text mode would shift offsets on non-ASCII or CRLF files
            # and depends on the platform's default encoding.
            with open(file_path, "rb") as f:
                raw_by_path[file_path] = f.read()

        start_offset = match["start"]["offset"]
        end_offset = match["end"]["offset"]
        snippet = raw_by_path[file_path][start_offset:end_offset].decode("utf-8", errors="replace")

        if match["check_id"] == "pyspark-udf-definition":
            entry["udfs"].append({
                "name": match["extra"]["message"],
                "def": snippet
            })
        elif match["check_id"] == "pyspark-df-expression":
            entry["df_exprs"].append(snippet)

    return file_dic


with open(summary_path, "w", encoding="utf-8") as summary_file:
    for repo in popular_repos:
        repo_name = repo["name"]
        clone_url = repo["url"]

        with tempfile.TemporaryDirectory() as tmpdir:
            # clone repo into temporary directory
            repo_dir = os.path.join(tmpdir, "repo")
            try:
                git.Repo.clone_from(clone_url, repo_dir, depth=1)
            except git.exc.GitCommandError as exc:
                # One unreachable repository should not discard the others.
                print(f"Could not clone {repo_name}: {exc}")
                continue

            # find .ipynb files and convert into .py files
            _convert_notebooks(repo_dir)

            # use semgrep to detect udf definitions
            semgrep_result = subprocess.run(
                ["semgrep", "scan", "--config", "pyspark-rules.yml", repo_dir, "--json"],
                capture_output=True,
                encoding="utf-8", #quick fix: to avoid byte serialization issue on Windows laptops.
                text=True,
                check=False
            )

            # parse pyspark dataframe expressions and track udf usage
            try:
                data = json.loads(semgrep_result.stdout)
            except json.JSONDecodeError:
                print("Semgrep output not valid JSON.")
                print(semgrep_result.stdout[:500])
                # stdout is often empty when semgrep itself failed
                print(semgrep_result.stderr[:500])
                continue

            matches = data.get("results", [])
            print(f"Found {len(matches)} potential matches in {repo_name}\n")

            file_dic = _collect_matches(matches, repo_dir)
            repo_results = {
                "repo_name" : repo_name,
                "clone_url" : clone_url,
                "files": [{"path": k, **v} for k, v in file_dic.items()],
            }

            # write to results jsonl
            summary_file.write(json.dumps(repo_results, ensure_ascii=False) + "\n")
            summary_file.flush()



Found 1 potential matches in AlexIoannides/pyspark-example-project

Found 2 potential matches in uber/petastorm

Found 17 potential matches in jadianes/spark-py-notebooks

Found 0 potential matches in ptyadana/SQL-Data-Analysis-and-Visualization-Projects

Found 192 potential matches in hi-primus/optimus

Found 209 potential matches in spark-examples/pyspark-examples

Found 0 potential matches in mahmoudparsian/pyspark-tutorial

Found 0 potential matches in palantir/pyspark-style-guide

Found 0 potential matches in kavgan/nlp-in-practice

Found 0 potential matches in lensacom/sparkit-learn

Found 1 potential matches in pyspark-ai/pyspark-ai

Found 0 potential matches in lyhue1991/eat_pyspark_in_10_days

Found 0 potential matches in WeBankFinTech/Scriptis

Found 4 potential matches in MrPowers/chispa

Found 18 potential matches in mrpowers-io/quinn

Found 23 potential matches in drabastomek/learningPySpark

Found 0 potential matches in kevinschaich/pyspark-cheatsheet

Found 14 potential 