### Install the necessary tools for this script

In [2]:
!git clone https://github.com/opengrep/opengrep-rules.git
!curl -fsSL https://raw.githubusercontent.com/opengrep/opengrep/main/install.sh | bash
!pip install kagglehub[pandas-datasets]

fatal: destination path 'opengrep-rules' already exists and is not an empty directory.


Go to https:/github.com/sigstore/cosign to install it.
Destination binary /home/pooria/.opengrep/cli/v1.8.2/opengrep already exists.
Updated symlink from /home/pooria/.opengrep/cli/latest/opengrep to point to /home/pooria/.opengrep/cli/v1.8.2/opengrep.

To launch Opengrep now, type:
opengrep



### Importing necessary modules

In [3]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from pandas import DataFrame

import os
import json
import shutil
import csv

In [4]:
def load_dataset(file_path):
  df: DataFrame  = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "wilfriedkonan/cod-blocks",
    file_path,
  )
  return df

datasets = {
    "c": "c_cleaned.json",
    "csharp": "csharp.json",
    "java": "java_cleaned.json",
    "javascript": "javascript_cleaned.json",
    "php": "php.json",
    "python": "python_cleaned.json",
    "sql": "sql.json",
}

Looping through each language in the Kaggle dataset and turning the .json files into actual files, then saving them to files/LANGUAGE/codes

In [11]:
for language in datasets.keys():
    os.makedirs(f"files/{language}/codes/", exist_ok=True)
    os.makedirs(f"files/{language}/rules/", exist_ok=True)
    print(language)
    df = load_dataset(datasets[language])
    for index, data_point in df.iterrows():
        with open(f"files/{language}/codes/{data_point['filename']}", "w", encoding="utf-8") as f:
            f.write(data_point["code"])
    

c


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=4&file_name=c_cleaned.json...


100%|██████████| 8.62M/8.62M [00:00<00:00, 9.66MB/s]


csharp


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=4&file_name=csharp.json...


100%|██████████| 6.37M/6.37M [00:00<00:00, 8.31MB/s]


java


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=4&file_name=java_cleaned.json...


100%|██████████| 7.90M/7.90M [00:01<00:00, 8.25MB/s]


javascript


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=4&file_name=javascript_cleaned.json...


100%|██████████| 11.3M/11.3M [00:01<00:00, 10.0MB/s]


php


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=4&file_name=php.json...


100%|██████████| 2.23M/2.23M [00:00<00:00, 5.79MB/s]


python


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=4&file_name=python_cleaned.json...


100%|██████████| 52.5M/52.5M [00:04<00:00, 11.2MB/s]


sql


  df: DataFrame  = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/wilfriedkonan/cod-blocks?dataset_version_number=4&file_name=sql.json...


100%|██████████| 4.37M/4.37M [00:00<00:00, 7.77MB/s]


### Filtering for security rules in codegrep-rules repository

In [12]:
def copy_security_yaml_rules(src_root: str, dst_root: str):
    """
    Walk src_root, find all .yaml files under any 'security' folder,
    and copy them to dst_root, preserving subdirectory structure.
    """
    for root, dirs, files in os.walk(src_root):
        # only consider paths that have 'security' in their hierarchy
        if 'security' in root.split(os.sep):
            for file in files:
                if file.endswith('.yaml'):
                    # compute relative path under src_root
                    rel_dir = os.path.relpath(root, src_root)
                    dst_dir = os.path.join(dst_root, rel_dir)
                    os.makedirs(dst_dir, exist_ok=True)

                    src_file = os.path.join(root, file)
                    dst_file = os.path.join(dst_dir, file)
                    shutil.copy2(src_file, dst_file)
                    print(f"Copied: {rel_dir}/{file}")

In [13]:
for language in datasets.keys():
    if not os.path.exists(f"opengrep-rules/{language}"):
        continue
    copy_security_yaml_rules(f"opengrep-rules/{language}", f"files/{language}/rules/")

Copied: lang/security/insecure-use-scanf-fn.yaml
Copied: lang/security/insecure-use-gets-fn.yaml
Copied: lang/security/double-free.yaml
Copied: lang/security/insecure-use-memset.yaml
Copied: lang/security/insecure-use-strtok-fn.yaml
Copied: lang/security/insecure-use-printf-fn.yaml
Copied: lang/security/use-after-free.yaml
Copied: lang/security/info-leak-on-non-formatted-string.yaml
Copied: lang/security/insecure-use-string-copy-fn.yaml
Copied: lang/security/insecure-use-strcat-fn.yaml
Copied: lang/security/function-use-after-free.yaml
Copied: lang/security/random-fd-exhaustion.yaml
Copied: dotnet/security/use_ecb_mode.yaml
Copied: dotnet/security/mvc-missing-antiforgery.yaml
Copied: dotnet/security/razor-template-injection.yaml
Copied: dotnet/security/net-webconfig-trace-enabled.yaml
Copied: dotnet/security/use_weak_rsa_encryption_padding.yaml
Copied: dotnet/security/use_deprecated_cipher_algorithm.yaml
Copied: dotnet/security/web-config-insecure-cookie-settings.yaml
Copied: dotnet/se

### Runing static analysis tool
Looping through each language and running the codegrep static analysis tool on them, and saving the results in files/language/output.sarif

In [6]:
for language in datasets.keys():
    if os.path.exists(f"opengrep-rules/{language}"):
        !opengrep scan --no-git-ignore --sarif-output=files/{language}/output.sarif -f files/{language}/rules files/{language}/codes



┌──────────────┐
│ Opengrep CLI │
└──────────────┘

[2K                                                                                
Scanning 5620 files with 12 Code rules:
            
  [4mCODE RULES[0m
  Scanning 5620 files with 12 c rules.
          
  [4mPROGRESS[0m
   
[2K  [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [35m100%[0m [33m0:00:03[0m                                                                                m[91m━[0m[91m━[0m[91m━[0m[91m━[0m[91m━[0m[35m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[90m━[0m[35m━[0m[91m━[0m[91m━[0m[91m━[0m[91m━[0m[91m━[0m [35m  0%[0m [33m-:--:--[0m
[?25h                      
                      
┌────────────────────┐
│ 1851 Code Findings │
└────────────────────┘
                                                                     
  [36m[22m[24m  files/c/codes/000de7ac199d40df8d53d93bd8fa687d_3.c[0m
   [33m ❯❱

### Converting Sarif files into CSV
(for ease of use)

In [7]:
for language in datasets.keys():
    if os.path.exists(f"opengrep-rules/{language}"):
        rows = []
        sarif_path = f"files/{language}/output.sarif"
        if not os.path.exists(sarif_path):
            continue
        
        
        with open(sarif_path, "r", encoding="utf-8") as f:
            data = json.loads(f.read())
            for run in data["runs"]:
                for result in run.get("results", []):
                    message = result.get("message", {}).get("text", "")
                    rule_id = result.get("ruleId", "")
                    
                    # Some results may have multiple locations
                    for location in result.get("locations", []):
                        loc = location.get("physicalLocation", {})
                        artifact = loc.get("artifactLocation", {})
                        region = loc.get("region", {})

                        conversation_hash = artifact.get("uri", "").split("/")[-1].split("_")[0]
                        code_index = artifact.get("uri", "").split("/")[-1].split("_")[1].split(".")[0]
                        start_line = region.get("startLine", "")
                        start_column = region.get("startColumn", "")

                        rows.append([conversation_hash, code_index, start_line, start_column, rule_id, message])

        # Write to CSV
        csv_path = f"files/{language}/{language}.csv" 
        with open(csv_path, "w", newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["conversation_hash", "code_index", "error_line", "error_character", "error_id", "error_message"])
            writer.writerows(rows)

        print(f"CSV written to: {csv_path}")

CSV written to: files/c/c.csv
CSV written to: files/csharp/csharp.csv
CSV written to: files/java/java.csv
CSV written to: files/javascript/javascript.csv
CSV written to: files/php/php.csv
CSV written to: files/python/python.csv


## Or all in one CSV file

In [8]:
csv_path = f"codegrep_results.csv" 
with open(csv_path, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["conversation_hash", "language", "code_index", "error_line", "error_character", "error_id", "error_message"])
    for language in datasets.keys():
        if os.path.exists(f"opengrep-rules/{language}"):
            rows = []
            sarif_path = f"files/{language}/output.sarif"
            if not os.path.exists(sarif_path):
                continue
            
            
            with open(sarif_path, "r", encoding="utf-8") as f:
                data = json.loads(f.read())
                for run in data["runs"]:
                    for result in run.get("results", []):
                        message = result.get("message", {}).get("text", "")
                        rule_id = result.get("ruleId", "")
                        
                        # Some results may have multiple locations
                        for location in result.get("locations", []):
                            loc = location.get("physicalLocation", {})
                            artifact = loc.get("artifactLocation", {})
                            region = loc.get("region", {})

                            conversation_hash = artifact.get("uri", "").split("/")[-1].split("_")[0]
                            code_index = artifact.get("uri", "").split("/")[-1].split("_")[1].split(".")[0]
                            start_line = region.get("startLine", "")
                            start_column = region.get("startColumn", "")

                            rows.append([conversation_hash, language, code_index, start_line, start_column, rule_id, message])

            # Write to CSV
            writer.writerows(rows)