In [None]:
from dotenv import load_dotenv
from github import Github, Auth
import os

load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

if not GITHUB_TOKEN:
    raise ValueError("Specify GITHUB_TOKEN in .env file.")

g = Github(auth=Auth.Token(GITHUB_TOKEN))

In [15]:
query = "pyspark in:name,description"
sort = "stars"
order = "desc"
limit = 1

repos = g.search_repositories(query=query, sort=sort, order=order)

popular_repos = []
for i, repo in enumerate(repos):
    if i >= limit:
        break

    popular_repos.append({
        "name": repo.full_name,
        "url": repo.clone_url,
        "stars": repo.stargazers_count,
        "description": repo.description
    })

for repo in popular_repos:
    print(repo)

{'name': 'AlexIoannides/pyspark-example-project', 'url': 'https://github.com/AlexIoannides/pyspark-example-project.git', 'stars': 2013, 'description': 'Implementing best practices for PySpark ETL jobs and applications.'}


In [21]:
import git
import json
import tempfile
import subprocess

semgrep_rule = """
rules:
  - id: pyspark-udf-detection
    languages: [python]
    message: "Detected PySpark UDF definition or registration."
    severity: INFO
    patterns:
      - pattern-either:
          - pattern: |
              @udf(...)
              def $FUNC(...):
                  ...
          - pattern: |
              def $FUNC(...):
                  ...
      - pattern-either:
          - pattern: spark.udf.register($NAME, $FUNC)
          - pattern: $SPARK.udf.register($NAME, $FUNC)
          - pattern: F.udf(...)
          - pattern: pyspark.sql.functions.udf(...)

  - id: pyspark-df-detection
    languages: [python]
    message: "Detected PySpark DF operation."
    severity: INFO
    patterns:
      - pattern-either:
          - pattern: $DF.select(...)
          - pattern: $DF.filter(...)
          - pattern: $DF.where(...)
          - pattern: $DF.groupBy(...)
          - pattern: $DF.agg(...)
          - pattern: $DF.join(...)
          - pattern: $DF.withColumn(...)
          - pattern: $DF.drop(...)
          - pattern: $DF.distinct(...)
          - pattern: $DF.limit(...)
          - pattern: $DF.union(...)
          - pattern: $DF.orderBy(...)
          - pattern: $DF.sort(...)
          - pattern: $DF.write.$FUNC(...)
          - pattern: $DF.write.format(...).save(...)
          - pattern: $DF.write.mode(...).parquet(...)
          - pattern: $DF.write.mode(...).csv(...)
"""

for repo in popular_repos:
    clone_url = repo["url"]

    with tempfile.TemporaryDirectory() as tmpdir:
        repo_dir = os.path.join(tmpdir, "repo")
        git.Repo.clone_from(clone_url, repo_dir, depth=1)

        rule_path = os.path.join(tmpdir, "pyspark-udf.yaml")
        with open(rule_path, "w") as f:
            f.write(semgrep_rule)

        result = subprocess.run(
            ["semgrep", "scan", "--config", rule_path, repo_dir, "--json"],
            capture_output=True,
            text=True,
            check=False
        )

        try:
            data = json.loads(result.stdout)
            print(data)
            matches = data.get("results", [])
            print(f"Found {len(matches)} potential UDF matches\n")

            for match in matches:
                file_path = match["path"]
                start_offset = match["start"]["offset"]
                end_offset = match["end"]["offset"]

                with open(file_path, "r") as f:
                    content = f.read()
                
                snippet = content[start_offset:end_offset]

                print("File: ", file_path)
                print(match["extra"]["message"])
                print(snippet)
                print("="*20)

        except json.JSONDecodeError:
            print("Semgrep output not valid JSON.")
            print(result.stdout[:500])



{'version': '1.140.0', 'results': [{'check_id': 'tmp.tmpd67nisb5.pyspark-df-detection', 'path': '/tmp/tmpd67nisb5/repo/dependencies/spark.py', 'start': {'line': 77, 'col': 31, 'offset': 2951}, 'end': {'line': 77, 'col': 59, 'offset': 2979}, 'extra': {'message': 'Detected PySpark DF operation.', 'metadata': {}, 'severity': 'INFO', 'fingerprint': 'requires login', 'lines': 'requires login', 'validation_state': 'NO_VALIDATOR', 'engine_kind': 'OSS'}}, {'check_id': 'tmp.tmpd67nisb5.pyspark-df-detection', 'path': '/tmp/tmpd67nisb5/repo/dependencies/spark.py', 'start': {'line': 80, 'col': 23, 'offset': 3076}, 'end': {'line': 80, 'col': 44, 'offset': 3097}, 'extra': {'message': 'Detected PySpark DF operation.', 'metadata': {}, 'severity': 'INFO', 'fingerprint': 'requires login', 'lines': 'requires login', 'validation_state': 'NO_VALIDATOR', 'engine_kind': 'OSS'}}, {'check_id': 'tmp.tmpd67nisb5.pyspark-df-detection', 'path': '/tmp/tmpd67nisb5/repo/dependencies/spark.py', 'start': {'line': 98, '