In [None]:
import subprocess
import os
import re
from pprint import pprint

In [None]:
# Resolve the parsergen directory once (relative to the current working
# directory) and derive the helper-script and sample-folder paths from it.
_parsergen_dir = os.path.abspath('parsergen')

js_getVARS_loc, php_getVARS_loc, php_samples_loc, js_samples_loc = (
    os.path.join(_parsergen_dir, entry)
    for entry in ("get_vars.js", "get_vars.php", "php_samples", "js_samples")
)

In [None]:
# Show the four resolved locations, one per line.
print(js_getVARS_loc, php_getVARS_loc, php_samples_loc, js_samples_loc, sep='\n')

/home/srn/Desktop/bloom-experiment-lab/dataprocessing/parsergen/get_vars.js
/home/srn/Desktop/bloom-experiment-lab/dataprocessing/parsergen/get_vars.php
/home/srn/Desktop/bloom-experiment-lab/dataprocessing/parsergen/php_samples
/home/srn/Desktop/bloom-experiment-lab/dataprocessing/parsergen/js_samples


In [None]:
def _list_sample_files(directory):
    """Return the full paths of the regular files directly inside ``directory``.

    ``os.listdir`` yields entries in arbitrary order, so the paths are sorted
    to make index-based access such as ``samples[0]`` reproducible across
    runs and machines. Sub-directories are skipped.
    """
    candidates = (os.path.join(directory, name) for name in os.listdir(directory))
    return sorted(path for path in candidates if os.path.isfile(path))


# Get all sample files, each joined with its parent directory.
php_samples = _list_sample_files(php_samples_loc)
js_samples = _list_sample_files(js_samples_loc)

print(js_samples)
print(php_samples)

['/home/srn/Desktop/bloom-experiment-lab/dataprocessing/parsergen/js_samples/sample1.js']
['/home/srn/Desktop/bloom-experiment-lab/dataprocessing/parsergen/php_samples/sample1.php', '/home/srn/Desktop/bloom-experiment-lab/dataprocessing/parsergen/php_samples/sample2.php']


In [None]:
js_get_vars_sourcetype = "module"
node_cmd = ["node", js_getVARS_loc, js_get_vars_sourcetype, js_samples[0]]
php_cmd = ["php", php_getVARS_loc, php_samples[0]]


def _run_var_extractor(cmd):
    """Run an extractor command and return the names it prints on stdout.

    stdout is decoded as UTF-8 and split on commas; surrounding whitespace
    (e.g. a trailing newline) is stripped from every entry and empty entries
    are dropped, so empty output yields ``[]`` instead of ``['']``.

    If the command exits with a non-zero status, its stderr is printed as
    text and an empty list is returned.
    """
    try:
        raw_output = subprocess.check_output(cmd, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        print((e.stderr or b'').decode('utf-8', errors='replace'))
        return []

    names = (name.strip() for name in raw_output.decode('utf-8').split(','))
    return [name for name in names if name]


# Each extractor runs independently, so a failing node run no longer
# prevents the PHP extractor from running (and vice versa).
js_vars = _run_var_extractor(node_cmd)
php_vars = _run_var_extractor(php_cmd)


In [None]:
# Show the extracted JS names, then the extracted PHP names.
print(js_vars, php_vars, sep='\n')

['express', 'crypto', 'app', 'db', 'FLAG', 'PORT', 'users', 'isAdmin', 'newAdmin', 'user', 'pass', 'query', 'id']
['$a', '$b', "$_GET['q']", '$sql', "$_SERVER['REQUEST_METHOD']", '$c', "$_POST['c']", '$input', '$data', '$api_url', '$options', '$context', '$result', "$_POST['login-submit']", "$_POST['username']", "$_POST['password']", '$username', '$password', "$_SERVER['REMOTE_ADDR']"]


In [None]:
def code_cleaner(filename):
    """Read a JS/PHP source file and return its cleaned, non-trivial lines.

    Processing steps, in order:
      1. strip block comments (``/* ... */``) and line comments (``//``, ``#``)
         while leaving the contents of string literals untouched;
      2. remove whitespace/newlines after ``[ ( , .`` and before ``] ) , . ;``
         and drop trailing commas before ``)`` / ``]``;
      3. split into lines, trim them and drop a trailing ``;`` or ``{``;
      4. discard lines that consist only of symbols or of a single word/number;
      5. discard lines starting with a PHP open/close tag and empty lines.

    Args:
        filename: path of the source file to clean (read as UTF-8).

    Returns:
        list[str]: the remaining lines in file order.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        code = f.read()

    # GENERAL:
    # Remove comments in a single left-to-right pass. The first alternative
    # matches a whole single-line string literal ('...', "..." or `...`), which
    # is kept verbatim; the remaining alternatives match a block comment or a
    # line comment, which are dropped. Doing this in one pass means comment
    # markers inside strings (e.g. "/* x */" or `http://host`) are preserved.
    code = re.sub(
        r'(?<!\\)(["\'`])(?:\\.|(?!\1).)*?\1|/\*[\s\S]*?\*/|//.*?$|#.*?$',
        # group(1) is only set when the string-literal alternative matched
        lambda m: m.group(0) if m.group(1) else '',
        code,
        flags=re.MULTILINE,
    )
    # remove all newlines/spaces after a ( ,|.|\(|\[ )
    code = re.sub(r'(\[|\(|,|\.)\s+', r'\1', code)
    # remove all newlines/spaces before a ( ,|.|;|\)|\] )
    code = re.sub(r'\s+(\]|\)|,|\.|;)', r'\1', code)
    # remove all trailing commas before a ( \) | \] )
    code = re.sub(r',(\s*[\]\)])', r'\1', code)
    # split code into lines and remove leading/trailing whitespace
    code = [line.strip() for line in code.split('\n')]
    # remove a semi-colon or an opening brace at the end of a line
    code = [re.sub(r'(;|{)$', '', line).strip() for line in code]

    # Drop lines that are symbols only, or a single word / number only
    # (a purely numeric line is already covered by the single-word check).
    code = [line for line in code
            if not re.match(r'^\W+$', line)
            and not re.match(r'^\w+$', line)]

    # PHP:
    # remove all lines starting with a php open/close tag
    code = [line for line in code
            if not line.startswith('<?php') and not line.startswith('?>')]

    # lastly remove all empty lines
    return [line for line in code if line]


In [None]:
# Clean the first JS sample; the bare expression on the last line displays
# the resulting list of lines as the cell output.
js_code = code_cleaner(js_samples[0])
js_code

["const express = require('express')",
 "const crypto = require('crypto')",
 'const app = express()',
 "const db = require('better-sqlite3')('db.sqlite3')",
 'db.exec(`DROP TABLE IF EXISTS users;`)',
 'db.exec(`CREATE TABLE users(id INTEGER PRIMARY KEY,username TEXT,password TEXT);`)',
 'const FLAG = process.env.FLAG || "dice{test_flag}"',
 'const PORT = process.env.PORT || 3000',
 'const users = [...Array(100_000)].map(() => ({ user: `user-${crypto.randomUUID()}`,pass: crypto.randomBytes(8).toString("hex") }))',
 'db.exec(`INSERT INTO users (id,username,password) VALUES ${users.map((u,i) => `(${i},\'${u.user}\',\'${u.pass}\')`).join(",")}`)',
 'const isAdmin = {}',
 'const newAdmin = users[Math.floor(Math.random() * users.length)]',
 'isAdmin[newAdmin.user] = true',
 'app.use(express.urlencoded({ extended: false }))',
 'app.use(express.static("public"))',
 'app.post("/api/login",(req,res) =>',
 'const { user,pass } = req.body',
 'console.log("[REQUEST BODY] : ",req.body)',
 "const que

In [None]:
# Clean the first PHP sample; the bare expression on the last line displays
# the resulting list of lines as the cell output.
php_code = code_cleaner(php_samples[0])
php_code

["$a = 'Simple string'",
 'function query($a)',
 'echo $a',
 'query($a)',
 "$b = $_GET['q']",
 '$sql = `SELECT * FROM table WHERE id = ${b}`',
 "if($_SERVER['REQUEST_METHOD'] === 'POST')",
 "$c = $_POST['c']",
 '$sql = `SELECT * FROM table WHERE id = ${c}`',
 'function SQLQuery($sql)',
 'echo $sql',
 'error_reporting(0)',
 'function Check_Admin($input)',
 "$input = iconv ('UTF-8','US-ASCII//TRANSLIT',$input)",
 'if(preg_match("/admin/i",$input))',
 'return true',
 'return false',
 'function send_to_api($data)',
 'print_r($data)',
 "$api_url = 'http://127.0.0.1:5000/login'",
 "$options = ['http' => ['method' => 'POST','header' => 'Content-Type: application/x-www-form-urlencoded','content' => $data]]",
 '$context = stream_context_create($options)',
 '$result = file_get_contents($api_url,false,$context)',
 'if ($result !== false)',
 'echo "Response from Flask app: $result"',
 'echo "Failed to communicate with Flask app."',
 "if(isset($_POST['login-submit']))",
 "if(!empty($_POST['username

In [None]:
# Extract the variable references of JS and PHP
def extract_vars_references(vars, code):
    """Pair every variable name with the code lines that mention it.

    A line counts as a mention when the name occurs case-sensitively and is
    not directly preceded or followed by a word character.

    Args:
        vars: iterable of variable names (matched literally).
        code: sequence of code lines to search.

    Returns:
        list[tuple[str, list[str]]]: one ``(name, matching_lines)`` pair per
        entry of ``vars``, in the same order; lines keep their order in ``code``.
    """
    def _lines_mentioning(name):
        # whole-word match: no word character directly before or after the name
        matcher = re.compile(r'(?<!\w)' + re.escape(name) + r'(?!\w)')
        return [stmt for stmt in code if matcher.search(stmt)]

    return [(name, _lines_mentioning(name)) for name in vars]

In [None]:
# Pair each extracted JS variable with the cleaned lines that mention it.
js_vars_references = extract_vars_references(js_vars, js_code)
pprint(js_vars_references)

[('express',
  ["const express = require('express')",
   'const app = express()',
   'app.use(express.urlencoded({ extended: false }))',
   'app.use(express.static("public"))']),
 ('crypto',
  ["const crypto = require('crypto')",
   'const users = [...Array(100_000)].map(() => ({ user: '
   '`user-${crypto.randomUUID()}`,pass: crypto.randomBytes(8).toString("hex") '
   '}))']),
 ('app',
  ['const app = express()',
   'app.use(express.urlencoded({ extended: false }))',
   'app.use(express.static("public"))',
   'app.post("/api/login",(req,res) =>',
   'app.listen(PORT,() => console.log(`web/funnylogin listening on port '
   '${PORT}`))']),
 ('db',
  ["const db = require('better-sqlite3')('db.sqlite3')",
   'db.exec(`DROP TABLE IF EXISTS users;`)',
   'db.exec(`CREATE TABLE users(id INTEGER PRIMARY KEY,username TEXT,password '
   'TEXT);`)',
   'db.exec(`INSERT INTO users (id,username,password) VALUES ${users.map((u,i) '
   '=> `(${i},\'${u.user}\',\'${u.pass}\')`).join(",")}`)',
   'con

In [None]:
# Pair each extracted PHP variable with the cleaned lines that mention it,
# then show the variable list followed by the (variable, lines) pairs.
php_vars_references = extract_vars_references(php_vars, php_code)
print(php_vars)
pprint(php_vars_references)

['$a', '$b', "$_GET['q']", '$sql', "$_SERVER['REQUEST_METHOD']", '$c', "$_POST['c']", '$input', '$data', '$api_url', '$options', '$context', '$result', "$_POST['login-submit']", "$_POST['username']", "$_POST['password']", '$username', '$password', "$_SERVER['REMOTE_ADDR']"]
[('$a', ["$a = 'Simple string'", 'function query($a)', 'echo $a', 'query($a)']),
 ('$b', ["$b = $_GET['q']"]),
 ("$_GET['q']", ["$b = $_GET['q']"]),
 ('$sql',
  ['$sql = `SELECT * FROM table WHERE id = ${b}`',
   '$sql = `SELECT * FROM table WHERE id = ${c}`',
   'function SQLQuery($sql)',
   'echo $sql']),
 ("$_SERVER['REQUEST_METHOD']", ["if($_SERVER['REQUEST_METHOD'] === 'POST')"]),
 ('$c', ["$c = $_POST['c']"]),
 ("$_POST['c']", ["$c = $_POST['c']"]),
 ('$input',
  ['function Check_Admin($input)',
   "$input = iconv ('UTF-8','US-ASCII//TRANSLIT',$input)",
   'if(preg_match("/admin/i",$input))']),
 ('$data',
  ['function send_to_api($data)',
   'print_r($data)',
   "$options = ['http' => ['method' => 'POST','head

In [None]:
def check_variable_usage(code_snippet):
    """Report which user-input patterns occur in ``code_snippet``.

    Args:
        code_snippet: a single line/snippet of code.

    Returns:
        tuple[bool, bool, bool]: ``(php, js, express_js)`` flags, each True
        when the corresponding pattern is found anywhere in the snippet.
    """
    source_patterns = (
        # php: stream wrappers/URLs and (super)global names such as $_GET or POST
        (r'\b(?:php|http)://|(?:(\$_(?:GET|POST|REQUEST|SERVER|COOKIE|ENV|FILES)\b)|\b(?:GET|POST|REQUEST|SERVER|COOKIE|ENV|FILES)\b)\b', 0),
        # pure js: request object properties (case-insensitive)
        (r'(?:req|request)\.(?:body|params|query|headers)', re.IGNORECASE),
        # express js: request object properties and accessors (case-insensitive)
        (r'(?:req|request)\.(?:body|params|query|headers|param|queryparam|get|post|paramfrom)', re.IGNORECASE),
    )

    return tuple(
        re.search(pattern, code_snippet, flags) is not None
        for pattern, flags in source_patterns
    )


In [None]:

def get_tainted_variables(references):
    """Return the variables that have at least one snippet flagged as user input.

    Args:
        references: iterable of ``(variable, snippets)`` pairs.

    Returns:
        list: each tainted variable once, in the order it first appears in
        ``references``. (Building the result from a ``set`` made the order
        vary between interpreter runs, which made the notebook output
        non-reproducible.)
    """
    # dict keys act as an insertion-ordered set
    tainted_variables = {}

    for var, snippets in references:
        if var in tainted_variables:
            continue
        # stop at the first snippet for which any of the patterns matches
        if any(any(check_variable_usage(snippet)) for snippet in snippets):
            tainted_variables[var] = None

    return list(tainted_variables)


In [None]:
# Determine which JS / PHP variables have at least one referencing line that
# matches a user-input pattern, then display both lists.
tainted_js_vars = get_tainted_variables(js_vars_references)
tainted_php_vars = get_tainted_variables(php_vars_references)

print("Tainted JS")
pprint(tainted_js_vars)
print()
print("Tainted PHP")
pprint(tainted_php_vars)


Tainted JS
['user', 'pass']

Tainted PHP
["$_SERVER['REMOTE_ADDR']",
 '$username',
 '$password',
 "$_GET['q']",
 "$_POST['username']",
 "$_SERVER['REQUEST_METHOD']",
 '$api_url',
 '$b',
 '$data',
 '$options',
 "$_POST['login-submit']",
 "$_POST['c']",
 "$_POST['password']",
 '$c']


In [None]:
def extract_tainted_snippets(references, tainted_variables):
    """Collect, for every tainted variable, all distinct snippets mentioning it.

    A snippet mentions a variable when the name occurs (case-insensitively)
    without an ASCII letter, digit or underscore directly before or after it.
    Snippets from every reference group are considered, not only the group
    of the variable itself.

    Previously the search left a reference group after its first matching
    snippet, so later matching snippets of that group were silently dropped;
    all matching snippets are now returned.

    Args:
        references: iterable of ``(variable, snippets)`` pairs.
        tainted_variables: iterable of variable names to look up.

    Returns:
        list[tuple[str, list[str]]]: one ``(tainted_var, snippets)`` pair per
        tainted variable, in input order; snippets are de-duplicated and kept
        in the order they first appear in ``references``.
    """
    # All snippets across every group, de-duplicated in first-seen order
    # (dict keys act as an insertion-ordered set).
    unique_snippets = list(dict.fromkeys(
        snippet for _var, snippets in references for snippet in snippets
    ))

    tainted_var_and_snippets = []
    for tainted_var in tainted_variables:
        # NOTE(review): matching here ignores case, whereas
        # extract_vars_references matches case-sensitively — confirm intended.
        matcher = re.compile(
            r'(?<![a-zA-Z0-9_])' + re.escape(tainted_var) + r'(?![a-zA-Z0-9_])',
            flags=re.IGNORECASE,
        )
        tainted_var_and_snippets.append(
            (tainted_var, [s for s in unique_snippets if matcher.search(s)])
        )

    return tainted_var_and_snippets
            

In [None]:
# For every tainted variable, gather the referencing snippets that mention it
# and display the JS result followed by the PHP result.
tainted_js_var_and_code = extract_tainted_snippets(js_vars_references, tainted_js_vars)
tainted_php_var_and_code = extract_tainted_snippets(php_vars_references, tainted_php_vars)


pprint(tainted_js_var_and_code)
print()
pprint(tainted_php_var_and_code)

[('user',
  ["const query = `SELECT id FROM users WHERE username = '${user}' AND "
   "password = '${pass}';`",
   'db.exec(`INSERT INTO users (id,username,password) VALUES ${users.map((u,i) '
   '=> `(${i},\'${u.user}\',\'${u.pass}\')`).join(",")}`)',
   'const users = [...Array(100_000)].map(() => ({ user: '
   '`user-${crypto.randomUUID()}`,pass: crypto.randomBytes(8).toString("hex") '
   '}))',
   'isAdmin[newAdmin.user] = true']),
 ('pass',
  ["const query = `SELECT id FROM users WHERE username = '${user}' AND "
   "password = '${pass}';`",
   'db.exec(`INSERT INTO users (id,username,password) VALUES ${users.map((u,i) '
   '=> `(${i},\'${u.user}\',\'${u.pass}\')`).join(",")}`)',
   'const users = [...Array(100_000)].map(() => ({ user: '
   '`user-${crypto.randomUUID()}`,pass: crypto.randomBytes(8).toString("hex") '
   '}))'])]

[("$_SERVER['REMOTE_ADDR']",
  ['if(Check_Admin($username) && $_SERVER[\'REMOTE_ADDR\']!=="127.0.0.1")']),
 ('$username',
  ["$username=$_POST['username']"