In [1]:
import subprocess
import os
import re
from pprint import pprint

In [2]:
# Locations of the two variable-extractor scripts and the two sample folders,
# all resolved under ./parsergen relative to the current working directory.
js_getVARS_loc, php_getVARS_loc, php_samples_loc, js_samples_loc = (
    os.path.join(os.path.abspath('parsergen'), leaf)
    for leaf in ("get_vars.js", "get_vars.php", "php_samples", "js_samples")
)

In [3]:
# Echo the four resolved locations, one per line.
print(js_getVARS_loc, php_getVARS_loc, php_samples_loc, js_samples_loc, sep='\n')

/media/shanks/Lab01/CSE/CSE_00003_A/bloom-experiment-lab/dataprocessing/parsergen/get_vars.js
/media/shanks/Lab01/CSE/CSE_00003_A/bloom-experiment-lab/dataprocessing/parsergen/get_vars.php
/media/shanks/Lab01/CSE/CSE_00003_A/bloom-experiment-lab/dataprocessing/parsergen/php_samples
/media/shanks/Lab01/CSE/CSE_00003_A/bloom-experiment-lab/dataprocessing/parsergen/js_samples


In [4]:
# Get all files with parent directory
def _list_sample_files(directory):
    """Return the full paths of the regular files directly inside `directory`.

    os.listdir() returns entries in arbitrary order, so the names are sorted
    first; this keeps index-based access such as `js_samples[0]` stable from
    one run (or machine) to the next. Sub-directories are skipped.
    """
    candidates = (os.path.join(directory, entry) for entry in sorted(os.listdir(directory)))
    return [path for path in candidates if os.path.isfile(path)]


php_samples = _list_sample_files(php_samples_loc)
js_samples = _list_sample_files(js_samples_loc)

print(js_samples)
print(php_samples)

['/media/shanks/Lab01/CSE/CSE_00003_A/bloom-experiment-lab/dataprocessing/parsergen/js_samples/sample1.js']
['/media/shanks/Lab01/CSE/CSE_00003_A/bloom-experiment-lab/dataprocessing/parsergen/php_samples/sample1.php']


In [5]:
js_get_vars_sourcetype = "module"
node_cmd = ["node", js_getVARS_loc, js_get_vars_sourcetype, js_samples[0]]
php_cmd = ["php", php_getVARS_loc, php_samples[0]]


def _run_extractor(cmd):
    """Run an extractor command and return its stdout decoded as UTF-8.

    If the command exits with a non-zero status, its stderr is printed and an
    empty string is returned, so one failing extractor does not prevent the
    other one from running.
    """
    try:
        return subprocess.check_output(cmd, stderr=subprocess.PIPE).decode('utf-8')
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes; decode it so the message prints as text
        # instead of a b'...' repr.
        print(e.stderr.decode('utf-8', errors='replace'))
        return ''


def _split_var_names(output):
    """Split comma-separated extractor output into a list of variable names.

    Surrounding whitespace (e.g. a trailing newline) is stripped and empty
    entries are dropped, so empty output yields [] rather than [''].
    """
    stripped = (name.strip() for name in output.split(','))
    return [name for name in stripped if name]


node_output = _run_extractor(node_cmd)
js_vars = _split_var_names(node_output)

php_output = _run_extractor(php_cmd)
php_vars = _split_var_names(php_output)


In [6]:
# Show the extracted JS names, then the PHP names, each list on its own line.
print(js_vars, php_vars, sep='\n')

['express', 'crypto', 'app', 'db', 'FLAG', 'PORT', 'users', 'isAdmin', 'newAdmin', 'user', 'pass', 'query', 'id']
['$a', '$b', "$_GET['q']", '$sql', "$_SERVER['REQUEST_METHOD']", '$c', "$_POST['c']", '$input', '$data', '$api_url', '$options', '$context', '$result', "$_POST['login-submit']", "$_POST['username']", "$_POST['password']", '$username', '$password', "$_SERVER['REMOTE_ADDR']"]


In [7]:
def code_cleaner(filename):
    """Read a JS/PHP source file and return its code as a list of normalised lines.

    Processing steps, in order:
      1. strip block comments and single-line (// or #) comments, leaving
         quoted string literals untouched;
      2. collapse whitespace after ``[ ( , .`` and before ``] ) , . ;`` and
         drop trailing commas before a closing bracket;
      3. split into lines, trim them, and drop a trailing ``;`` or ``{``;
      4. discard lines that are only symbols or only a single word/number;
      5. discard lines starting with a PHP open/close tag and empty lines.

    Note that step 2 is applied to the whole text, so whitespace inside
    string literals is collapsed as well.

    Args:
        filename: path of the source file to clean.

    Returns:
        list[str]: the remaining non-empty, cleaned lines in file order.
    """
    # Decode explicitly instead of relying on the platform default encoding;
    # undecodable bytes are replaced so an odd sample does not abort the run.
    with open(filename, 'r', encoding='utf-8', errors='replace') as f:
        code = f.read()

    # GENERAL:
    # Remove comments in ONE pass so that string literals are recognised before
    # any comment marker inside them: a quoted literal ("...", '...' or `...`)
    # is kept verbatim, while /* ... */ (may span lines), // ... and # ... are
    # dropped. Doing the block-comment removal separately would delete
    # "/* ... */" sequences that are really part of a string.
    code = re.sub(
        r'(?<!\\)(["\'`])(?:\\.|(?!\1).)*?\1|/\*[\s\S]*?\*/|//.*?$|#.*?$',
        # group(1) is only set when the string-literal alternative matched
        lambda m: m.group(0) if m.group(1) else '',
        code,
        flags=re.MULTILINE,
    )
    # remove all newlines/spaces after a ( ,|.|\(|\[ )
    code = re.sub(r'(\[|\(|,|\.)\s+', r'\1', code)
    # remove all newlines/spaces before a ( ,|.|;|\)|\] )
    code = re.sub(r'\s+(\]|\)|,|\.|;)', r'\1', code)
    # remove all trailing comma before a ( \) | \] )
    code = re.sub(r',(\s*[\]\)])', r'\1', code)
    # split code into lines and remove leading and trailing whitespace
    code = [line.strip() for line in code.split('\n')]
    # remove a semi-colon or an opening brace at the end of a line
    code = [re.sub(r'(;|{)$', '', line).strip() for line in code]

    # Drop lines that carry no statement: symbols only, or a single word.
    # (\w also covers digits, so purely numeric lines are removed by the
    # single-word check as well.)
    code = [line for line in code
            if not re.match(r'^\W+$', line) and not re.match(r'^\w+$', line)]

    # PHP:
    # remove all php tags
    code = [line for line in code
            if not line.startswith('<?php') and not line.startswith('?>')]

    # lastly remove all empty lines
    return [line for line in code if line]


In [8]:
# Clean the first JS sample into a list of normalised code lines; the bare
# name on the last line makes the notebook display the result.
js_code = code_cleaner(js_samples[0])
js_code

["const express = require('express')",
 "const crypto = require('crypto')",
 'const app = express()',
 "const db = require('better-sqlite3')('db.sqlite3')",
 'db.exec(`DROP TABLE IF EXISTS users;`)',
 'db.exec(`CREATE TABLE users(id INTEGER PRIMARY KEY,username TEXT,password TEXT);`)',
 'const FLAG = process.env.FLAG || "dice{test_flag}"',
 'const PORT = process.env.PORT || 3000',
 'const users = [...Array(100_000)].map(() => ({ user: `user-${crypto.randomUUID()}`,pass: crypto.randomBytes(8).toString("hex") }))',
 'db.exec(`INSERT INTO users (id,username,password) VALUES ${users.map((u,i) => `(${i},\'${u.user}\',\'${u.pass}\')`).join(",")}`)',
 'const isAdmin = {}',
 'const newAdmin = users[Math.floor(Math.random() * users.length)]',
 'isAdmin[newAdmin.user] = true',
 'app.use(express.urlencoded({ extended: false }))',
 'app.use(express.static("public"))',
 'app.post("/api/login",(req,res) =>',
 'const { user,pass } = req.body',
 'console.log("[REQUEST BODY] : ",req.body)',
 "const que

In [9]:
# Clean the first PHP sample into a list of normalised code lines; the bare
# name on the last line makes the notebook display the result.
php_code = code_cleaner(php_samples[0])
php_code

["$a = 'Simple string'",
 'function query($a)',
 'echo $a',
 'query($a)',
 "$b = $_GET['q']",
 '$sql = `SELECT * FROM table WHERE id = ${b}`',
 "if($_SERVER['REQUEST_METHOD'] === 'POST')",
 "$c = $_POST['c']",
 '$sql = `SELECT * FROM table WHERE id = ${c}`',
 'function SQLQuery($sql)',
 'echo $sql',
 'error_reporting(0)',
 'function Check_Admin($input)',
 "$input = iconv ('UTF-8','US-ASCII//TRANSLIT',$input)",
 'if(preg_match("/admin/i",$input))',
 'return true',
 'return false',
 'function send_to_api($data)',
 'print_r($data)',
 "$api_url = 'http://127.0.0.1:5000/login'",
 "$options = ['http' => ['method' => 'POST','header' => 'Content-Type: application/x-www-form-urlencoded','content' => $data]]",
 '$context = stream_context_create($options)',
 '$result = file_get_contents($api_url,false,$context)',
 'if ($result !== false)',
 'echo "Response from Flask app: $result"',
 'echo "Failed to communicate with Flask app."',
 "if(isset($_POST['login-submit']))",
 "if(!empty($_POST['username

In [10]:
# Extract the variable references of JS and PHP
def extract_vars_references(vars, code):
    """Pair every variable name with the code lines that reference it.

    A line counts as a reference when it contains the name as a whole token,
    i.e. not directly preceded or followed by a word character. Matching is
    case sensitive and the name is escaped, so names such as ``$_POST['c']``
    are matched literally.

    Args:
        vars: iterable of variable-name strings. Surrounding whitespace is
            stripped; names that are empty after stripping are skipped,
            because an empty name would match almost every line.
        code: iterable of code lines (strings) to search.

    Returns:
        list[tuple[str, list[str]]]: one ``(name, matching_lines)`` pair per
        usable name, in input order; ``matching_lines`` keeps the order of
        ``code`` and is empty when nothing references the name.
    """
    code = list(code)  # allow any iterable; it is scanned once per variable
    references = []
    for var in vars:
        name = var.strip()
        if not name:
            continue
        # Compile once per variable instead of rebuilding the pattern per line.
        var_pattern = re.compile(r'(?<!\w)' + re.escape(name) + r'(?!\w)')
        references.append((name, [line for line in code if var_pattern.search(line)]))

    return references

In [11]:
# Pair each extracted JS variable name with the cleaned JS lines that mention
# it, then print the names followed by the pretty-printed pairs.
js_vars_references = extract_vars_references(js_vars, js_code)
print(js_vars)
pprint(js_vars_references, width=150)

['express', 'crypto', 'app', 'db', 'FLAG', 'PORT', 'users', 'isAdmin', 'newAdmin', 'user', 'pass', 'query', 'id']
[('express',
  ["const express = require('express')",
   'const app = express()',
   'app.use(express.urlencoded({ extended: false }))',
   'app.use(express.static("public"))']),
 ('crypto',
  ["const crypto = require('crypto')",
   'const users = [...Array(100_000)].map(() => ({ user: `user-${crypto.randomUUID()}`,pass: crypto.randomBytes(8).toString("hex") }))']),
 ('app',
  ['const app = express()',
   'app.use(express.urlencoded({ extended: false }))',
   'app.use(express.static("public"))',
   'app.post("/api/login",(req,res) =>',
   'app.listen(PORT,() => console.log(`web/funnylogin listening on port ${PORT}`))']),
 ('db',
  ["const db = require('better-sqlite3')('db.sqlite3')",
   'db.exec(`DROP TABLE IF EXISTS users;`)',
   'db.exec(`CREATE TABLE users(id INTEGER PRIMARY KEY,username TEXT,password TEXT);`)',
   'db.exec(`INSERT INTO users (id,username,password) VALU

In [30]:
# Pair each extracted PHP variable name with the cleaned PHP lines that mention
# it, then print the names followed by one VAR/REFS entry per variable.
php_vars_references = extract_vars_references(php_vars, php_code)
print("VARIABLES:", php_vars)  # label spelling fixed (was "VARAIBLES:")
print("REFERENCES:")
for var, refs in php_vars_references:
    print("VAR:", var)
    print("REFS:", refs)
    print()

VARAIBLES: ['$a', '$b', "$_GET['q']", '$sql', "$_SERVER['REQUEST_METHOD']", '$c', "$_POST['c']", '$input', '$data', '$api_url', '$options', '$context', '$result', "$_POST['login-submit']", "$_POST['username']", "$_POST['password']", '$username', '$password', "$_SERVER['REMOTE_ADDR']"]
REFERENCES:
VAR: $a
REFS: ["$a = 'Simple string'", 'function query($a)', 'echo $a', 'query($a)']

VAR: $b
REFS: ["$b = $_GET['q']"]

VAR: $_GET['q']
REFS: ["$b = $_GET['q']"]

VAR: $sql
REFS: ['$sql = `SELECT * FROM table WHERE id = ${b}`', '$sql = `SELECT * FROM table WHERE id = ${c}`', 'function SQLQuery($sql)', 'echo $sql']

VAR: $_SERVER['REQUEST_METHOD']
REFS: ["if($_SERVER['REQUEST_METHOD'] === 'POST')"]

VAR: $c
REFS: ["$c = $_POST['c']"]

VAR: $_POST['c']
REFS: ["$c = $_POST['c']"]

VAR: $input
REFS: ['function Check_Admin($input)', "$input = iconv ('UTF-8','US-ASCII//TRANSLIT',$input)", 'if(preg_match("/admin/i",$input))']

VAR: $data
REFS: ['function send_to_api($data)', 'print_r($data)', "$opti