### Install the necessary tools for this script

In [None]:
!git clone https://github.com/opengrep/opengrep-rules.git
!curl -fsSL https://raw.githubusercontent.com/opengrep/opengrep/main/install.sh | bash
!pip install kagglehub[pandas-datasets]

### Importing necessary modules

In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from pandas import DataFrame

import os
import json
import shutil
import csv

In [6]:
def load_dataset(file_path):
  df: DataFrame  = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "wilfriedkonan/cod-blocks",
    file_path,
  )
  return df

datasets = {
    "c": "c.json",
    "csharp": "csharp.json",
    "html": "html.json",
    "java": "java.json",
    "javascript": "javascript.json",
    "php": "php.json",
    "python": "python.json",
    "sql": "sql.json",
}

Looping through each language in the Kaggle dataset and turning the .json files into actual files, then saving them to files/LANGUAGE/codes

In [7]:
for language in datasets.keys():
    os.makedirs(f"files/{language}/codes/", exist_ok=True)
    os.makedirs(f"files/{language}/rules/", exist_ok=True)
    print(language)
    df = load_dataset(datasets[language])
    for index, data_point in df.iterrows():
        with open(f"files/{language}/codes/{data_point['filename']}", "w", encoding="utf-8") as f:
            f.write(data_point["code"])
    

c


  df: DataFrame  = kagglehub.load_dataset(


csharp


  df: DataFrame  = kagglehub.load_dataset(


html


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=3&file_name=html.json...


100%|██████████| 4.07M/4.07M [00:00<00:00, 8.88MB/s]


java


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=3&file_name=java.json...


100%|██████████| 7.96M/7.96M [00:00<00:00, 9.97MB/s]


javascript


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=3&file_name=javascript.json...


100%|██████████| 32.6M/32.6M [00:02<00:00, 12.0MB/s]


php


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=3&file_name=php.json...


100%|██████████| 2.23M/2.23M [00:00<00:00, 5.11MB/s]


python


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=3&file_name=python.json...


100%|██████████| 63.6M/63.6M [00:08<00:00, 8.19MB/s]


sql


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=3&file_name=sql.json...


100%|██████████| 4.37M/4.37M [00:00<00:00, 8.55MB/s]


### Filtering for security rules in codegrep-rules repository

In [None]:
def copy_security_yaml_rules(src_root: str, dst_root: str):
    """
    Walk src_root, find all .yaml files under any 'security' folder,
    and copy them to dst_root, preserving subdirectory structure.
    """
    for root, dirs, files in os.walk(src_root):
        # only consider paths that have 'security' in their hierarchy
        if 'security' in root.split(os.sep):
            for file in files:
                if file.endswith('.yaml'):
                    # compute relative path under src_root
                    rel_dir = os.path.relpath(root, src_root)
                    dst_dir = os.path.join(dst_root, rel_dir)
                    os.makedirs(dst_dir, exist_ok=True)

                    src_file = os.path.join(root, file)
                    dst_file = os.path.join(dst_dir, file)
                    shutil.copy2(src_file, dst_file)
                    print(f"Copied: {rel_dir}/{file}")

In [None]:
for language in datasets.keys():
    if not os.path.exists(f"opengrep-rules/{language}"):
        continue
    copy_security_yaml_rules(f"opengrep-rules/{language}", f"files/{language}/rules/")

### Runing static analysis tool
Looping through each language and running the codegrep static analysis tool on them, and saving the results in files/language/output.sarif

In [None]:
for language in datasets.keys():
    if os.path.exists(f"opengrep-rules/{language}"):
        !/root/.opengrep/cli/latest/opengrep scan --sarif-output=files/{language}/output.sarif -f files/{language}/rules files/{language}/codes


### Converting Sarif files into CSV
(for ease of use)

In [None]:
for language in datasets.keys():
    if os.path.exists(f"opengrep-rules/{language}"):
        rows = []
        sarif_path = f"files/{language}/output.sarif"
        if not os.path.exists(sarif_path):
            continue
        
        
        with open(sarif_path, "r", encoding="utf-8") as f:
            data = json.loads(f.read())
            for run in data["runs"]:
                for result in run.get("results", []):
                    message = result.get("message", {}).get("text", "")
                    rule_id = result.get("ruleId", "")
                    
                    # Some results may have multiple locations
                    for location in result.get("locations", []):
                        loc = location.get("physicalLocation", {})
                        artifact = loc.get("artifactLocation", {})
                        region = loc.get("region", {})

                        conversation_hash = artifact.get("uri", "").split("/")[-1].split("_")[0]
                        code_index = artifact.get("uri", "").split("/")[-1].split("_")[1].split(".")[0]
                        start_line = region.get("startLine", "")
                        start_column = region.get("startColumn", "")

                        rows.append([conversation_hash, code_index, start_line, start_column, rule_id, message])

        # Write to CSV
        csv_path = f"files/{language}/{language}.csv" 
        with open(csv_path, "w", newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["conversation_hash", "code_index", "error_line", "error_character", "error_id", "error_message"])
            writer.writerows(rows)

        print(f"CSV written to: {csv_path}")