# Scraping GitHub for PySpark Usage

This notebook provides a framework to clone repos and search them for the PySpark patterns defined in `./pyspark-rules.yml`. These patterns include PySpark DataFrame expressions, PySpark UDF definitions, and import usage in function definitions.

## Using GitHub API

In [17]:
from dotenv import load_dotenv
from github import Github, Auth
import os

# Load variables from a local .env file into the process environment,
# then read the personal access token used to authenticate API calls.
load_dotenv()
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")

# Fail early when the token is missing or empty.
if GITHUB_TOKEN is None or GITHUB_TOKEN == "":
    raise ValueError("Specify GITHUB_TOKEN in .env file.")

# Authenticated GitHub client shared by the cells below.
g = Github(auth=Auth.Token(GITHUB_TOKEN))

## Currently looking at most popular repos mentioning PySpark

Feel free to change the list of repos used for pattern searching. The most popular repos include a lot of tutorials and style guides, which may not be representative of true PySpark workloads.

In [18]:
# Search parameters: repositories whose name or description mentions
# "pyspark", ordered by star count (highest first), capped at `limit`.
query = "pyspark in:name,description"
sort = "stars"
order = "desc"
limit = 1

repos = g.search_repositories(query=query, sort=sort, order=order)

# Keep only the fields needed later (clone URL plus some context for display).
popular_repos = []
for i, repo in enumerate(repos):
    if i < limit:
        popular_repos.append(
            dict(
                name=repo.full_name,
                url=repo.clone_url,
                stars=repo.stargazers_count,
                description=repo.description,
            )
        )
    else:
        # Stop iterating the search results once `limit` repos are collected.
        break

# Show the selection, one repo per line.
for repo in popular_repos:
    print(repo)

{'name': 'AlexIoannides/pyspark-example-project', 'url': 'https://github.com/AlexIoannides/pyspark-example-project.git', 'stars': 2015, 'description': 'Implementing best practices for PySpark ETL jobs and applications.'}


## Using Semgrep to search for PySpark patterns

For each repo, the following cell will:
1. Clone repo into a temp directory
2. Convert any notebooks (`.ipynb`) into Python files (`.py`) using `nbconvert`
3. Capture the output of `semgrep scan` in a JSON object using the rules specified in `./pyspark-rules.yml`
4. Process matches for DataFrame expressions and UDF definitions
5. Process matches for imported library usage in functions that are tagged as UDFs
6. Store processed result as JSON (see `./README.md` for schema)

All results are stored in `./results/summary.jsonl`

In [19]:
import git
import json
from nbconvert import PythonExporter
import nbformat
import tempfile
import subprocess
import ast

def _convert_notebooks(repo_dir):
    """Write a ``CONVERTED<name>.py`` next to every ``.ipynb`` under *repo_dir*.

    Conversion is best effort: a notebook that cannot be read, parsed or
    exported is reported and skipped so that one bad file does not abort the
    scan of the whole repository.
    """
    suffix = ".ipynb"
    exporter = PythonExporter()
    for root, _dirs, files in os.walk(repo_dir):
        for file in files:
            if not file.endswith(suffix):
                continue

            ipynb_path = os.path.join(root, file)
            # Swap only the trailing extension (str.replace would also touch
            # an ".ipynb" occurring in the middle of the file name).
            py_path = os.path.join(root, "CONVERTED" + file[: -len(suffix)] + ".py")

            try:
                with open(ipynb_path, "r", encoding="utf-8") as ipynbf:
                    nb_node = nbformat.read(ipynbf, as_version=4)
                python_code, _resources = exporter.from_notebook_node(nb_node)
                with open(py_path, "w", encoding="utf-8") as pyf:
                    pyf.write(python_code)
            except Exception as err:
                # Third-party notebooks fail in many ways (invalid JSON, bad
                # encoding, schema errors); skip them, but say so, and let
                # KeyboardInterrupt/SystemExit propagate.
                print(f"Skipping notebook {ipynb_path}: {err}")


def _run_semgrep(repo_dir):
    """Run ``semgrep scan`` with ``pyspark-rules.yml`` on *repo_dir*.

    Returns the ``subprocess.CompletedProcess``; the JSON report is in its
    ``stdout``. A non-zero exit status does not raise (``check=False``).
    """
    return subprocess.run(
        ["semgrep", "scan", "--config", "pyspark-rules.yml", repo_dir, "--json"],
        capture_output=True,
        # Decode explicitly as UTF-8 instead of the platform default
        # (quick fix for byte serialization issues on Windows laptops).
        # Passing `encoding` already enables text mode.
        encoding="utf-8",
        check=False,
    )


def _read_snippet(file_path, start_offset, end_offset, cache):
    """Return the source text of *file_path* between two Semgrep offsets.

    Semgrep reports ``offset`` as a byte position, so the file is read in
    binary mode and the byte slice is decoded afterwards; slicing decoded text
    would drift on non-ASCII content or translated line endings. File contents
    are kept in *cache* so each file is read once per repository.
    """
    raw = cache.get(file_path)
    if raw is None:
        with open(file_path, "rb") as f:
            raw = f.read()
        cache[file_path] = raw
    return raw[start_offset:end_offset].decode("utf-8", errors="replace")


def _summarize_matches(matches, repo_dir):
    """Group Semgrep *matches* by file into the ``files`` list of a result row.

    Each entry has the keys ``path`` (relative to *repo_dir*), ``udfs`` (list
    of ``{"name", "def", "calls"}``) and ``df_exprs`` (list of snippets).
    """
    file_dic = {}
    source_cache = {}

    # Pass 1: DataFrame expressions and UDF definitions.
    for match in matches:
        check_id = match["check_id"]
        if check_id == "library-usage":
            continue  # handled in pass 2, once all UDFs are known

        file_path = match["path"]
        rel_path = os.path.relpath(file_path, repo_dir)
        file_entry = file_dic.setdefault(rel_path, {"udfs": {}, "df_exprs": []})

        snippet = _read_snippet(
            file_path,
            match["start"]["offset"],
            match["end"]["offset"],
            source_cache,
        )

        if check_id == "pyspark-udf-definition":
            # The rule's message carries the UDF name.
            udf_name = match["extra"]["message"]
            file_entry["udfs"][udf_name] = {"def": snippet, "calls": []}

        elif check_id == "pyspark-df-expression":
            # Keep only snippets that parse as a standalone Python expression.
            try:
                ast.parse(snippet, mode="eval")
            except (SyntaxError, ValueError, RecursionError):
                continue
            file_entry["df_exprs"].append(snippet)

    # Pass 2: attach library calls to the UDFs tagged in pass 1.
    for match in matches:
        if match["check_id"] != "library-usage":
            continue

        # Message format: "<function>:<library>:<call>".
        msg_fields = match["extra"]["message"].split(":", 2)
        if len(msg_fields) != 3:
            continue  # malformed message; nothing to attribute
        func_name, library_name, call_name = msg_fields

        rel_path = os.path.relpath(match["path"], repo_dir)
        udf = file_dic.get(rel_path, {}).get("udfs", {}).get(func_name)
        if udf is not None:
            udf["calls"].append({"library": library_name, "call": call_name})

    # Flatten the name-keyed dicts into lists of records.
    return [
        {
            "path": rel_path,
            "udfs": [{"name": name, **info} for name, info in entry["udfs"].items()],
            "df_exprs": entry["df_exprs"],
        }
        for rel_path, entry in file_dic.items()
    ]


# Create results jsonl
os.makedirs("results", exist_ok=True)
summary_path = os.path.join("results", "summary.jsonl")

with open(summary_path, "w", encoding="utf-8") as summary_file:
    for repo in popular_repos:
        repo_name = repo["name"]
        clone_url = repo["url"]

        with tempfile.TemporaryDirectory() as tmpdir:
            # shallow-clone repo into temporary directory
            repo_dir = os.path.join(tmpdir, "repo")
            git.Repo.clone_from(clone_url, repo_dir, depth=1)

            # find .ipynb files and convert into .py files
            _convert_notebooks(repo_dir)

            # use semgrep to detect pyspark patterns
            semgrep_result = _run_semgrep(repo_dir)

            try:
                data = json.loads(semgrep_result.stdout)
            except json.JSONDecodeError:
                print("Semgrep output not valid JSON.")
                print(semgrep_result.stdout[:500])
                if semgrep_result.stderr:
                    # stdout is usually empty when semgrep itself failed.
                    print(semgrep_result.stderr[:500])
                continue

            matches = data.get("results", [])
            print(f"Found {len(matches)} potential matches in {repo_name}\n")

            # parse pyspark dataframe expressions and track udf usage
            repo_results = {
                "repo_name": repo_name,
                "clone_url": clone_url,
                "files": _summarize_matches(matches, repo_dir),
            }

            # write to results jsonl
            summary_file.write(json.dumps(repo_results, ensure_ascii=False) + "\n")
            summary_file.flush()



Found 11 potential matches in AlexIoannides/pyspark-example-project

