In [8]:
import subprocess
import numpy as np
from sklearn.preprocessing import normalize
import os
import re

In [9]:
"""
Pattern for string concatenation:
    case 1 and case 2
    optionalGroup( ( " or ' or ` ) \s* ( + or . ) \s*) anychar_except( " or ' or ` ) optionalGroup( \s* ( + or . ) \s* ( " or ' or ` ) )

    case 3 
    let strings = requiredGroup ( " or ` ) anychar requiredGroup ( " or ` )
    - strings contains $anychar or ${anychar}
    - consider a string like "SELECT * FROM users WHERE user_id = '$userid1_2'"
    - consider a string like `SELECT * FROM users WHERE user_id = ${userid1_2}`
"""
def hasConcatenation(statement):
    """Heuristically decide whether ``statement`` assembles a string dynamically.

    Three shapes are recognised:

    * case 1 - a quote character (", ' or `), then ``+`` or ``.``, then one
      identifier-like character (optionally wrapped in parentheses) followed
      by one more character that is not a quote.
    * case 2 - a non-quote character, one identifier-like character
      (optionally wrapped in parentheses), then ``+`` or ``.``, then a quote.
    * case 3 - a double-quoted or backtick-quoted literal on one line that
      contains ``$name`` or ``${name}``.

    The check is purely textual: the statement is not tokenised, so the
    result is only as precise as the regular expressions below.

    Args:
        statement: one line of source text.

    Returns:
        True when any of the three shapes is found, otherwise False.
    """
    operator_after_quote = re.compile(r'(["\'`]\s*[+\.]\s*?)\(?[a-zA-Z0-9_$]\)?[^"\'`]')
    operator_before_quote = re.compile(r'[^"\'`]\(?[a-zA-Z0-9_$]\)?\s*[+\.]\s*?["\'`]')

    # Cases 1 and 2: an operator sitting right next to a quote character.
    if operator_after_quote.search(statement) or operator_before_quote.search(statement):
        return True

    # Case 3: interpolation markers inside a "..." or `...` literal.
    interpolation = re.compile(r'(\$\w+)|(\$\{\w+\})')
    quoted_literal = re.compile(r'(["`]).*?\1')
    return any(
        interpolation.search(literal.group())
        for literal in quoted_literal.finditer(statement)
    )

# Flags SQL keywords in a statement and whether that statement also concatenates strings.
def matchSqlStament(statement):
    """Return ``[sql_syntax_count, concatenated_string_count]`` for one statement.

    Both entries are 0 or 1. The concatenation flag is only evaluated when a
    SQL keyword (matched case-insensitively as a whole word) is present.
    """
    sql_keyword = re.compile(r'\b(?:SELECT|INSERT|UPDATE|DELETE|FROM|WHERE|AND|OR)\b', re.IGNORECASE)

    # No SQL keyword: neither feature fires.
    if sql_keyword.search(statement) is None:
        return [0, 0]

    return [1, 1 if hasConcatenation(statement) else 0]

def matchHTMLTags(statement):
    """Return ``[html_tag_count, concatenated_string_count]`` for one statement.

    Both entries are 0 or 1.

    * ``echo`` followed (after optional whitespace) by ``$`` sets both flags.
    * Otherwise, an ``<...>`` tag or one of the listed output functions sets
      the first flag, and the second reflects ``hasConcatenation``.

    NOTE(review): in the output-function pattern the trailing ``\\b`` also
    applies to the ``text\\s*=`` alternative, so that alternative only matches
    when ``=`` is immediately followed by a word character. Confirm whether
    that is intended.
    """
    tag = re.compile(r'<.*?>')
    echo_variable = re.compile(r'echo\s*\$')
    output_function = re.compile(r'\b(?:echo|print|printf|sprintf|vprintf|vsprintf|innertext|outerhtml|innerHTML|outerHTML|document.write|document.writeln|document.open|document.close|document.writeIn|document.writeLn|document.writeInLn|text\s*=)\b', re.IGNORECASE)

    # Echoing a variable directly always counts as tag + concatenation.
    if echo_variable.search(statement):
        return [1, 1]

    if tag.search(statement) or output_function.search(statement):
        return [1, 1 if hasConcatenation(statement) else 0]

    return [0, 0]

def matchDangerousFunctions(statement):
    """Return ``[dangerous_function_count, concatenated_string_count]``.

    Both entries are 0 or 1. The first is set when one of the listed
    code-execution style function names appears as a whole word
    (case-insensitive); the second is only evaluated in that case and
    reflects ``hasConcatenation``.
    """
    risky_call = re.compile(r'\b(?:exec|system|shell_exec|passthru|eval|assert|create_function|popen|proc_open|preg_replace|unserialize|Function|ReflectionFunction|setTimeout|setInterval|spawn|execSync|execFile)\b', re.IGNORECASE)

    if risky_call.search(statement) is None:
        return [0, 0]

    return [1, 1 if hasConcatenation(statement) else 0]

def matchImportFunctions(statement):
    """Detect file/module loading calls and URI schemes in one statement.

    Args:
        statement: one line of source text.

    Returns:
        ``[import_function_count, concatenated_string_count,
        uri_scheme_count, uri_scheme_concatenation_count]``; every entry is
        0 or 1. Each concatenation flag is only set when its companion flag
        is set and ``hasConcatenation(statement)`` is true.
    """
    import_function_pattern = re.compile(r'\b(?:require|require_once|include|include_once|import|file_get_contents|fopen|fread|fclose|readfile|parse_ini_file|readFileSync|readFile)\b', re.IGNORECASE)
    # No trailing \b here: after "://" a word boundary only exists when the
    # next character is a word character, which rejected 'file:///...' and,
    # more importantly, '"http://" . $host' - the concatenated case this
    # feature is meant to catch.
    uri_scheme_pattern = re.compile(r'\b(?:http|https|ftp|ftps|sftp|ssh|scp|file|data|php|phar|expect|zip|rar|tar|gzip|bzip2|compress|zlib|ssh2)://', re.IGNORECASE)

    has_import = import_function_pattern.search(statement) is not None
    has_uri = uri_scheme_pattern.search(statement) is not None

    # Evaluate the (shared) concatenation check at most once.
    concatenated = (has_import or has_uri) and bool(hasConcatenation(statement))

    return [
        int(has_import),
        int(has_import and concatenated),
        int(has_uri),
        int(has_uri and concatenated),
    ]

def matchValidations(statement):
    """Detect validation-related constructs in one statement.

    Args:
        statement: one line of source text.

    Returns:
        ``[if_statement_count, validation_function_count,
        operator_check_count]``; every entry is 0 or 1.

    The operator check looks for ``==``, ``===``, ``!=``, ``!==``, ``<=``,
    ``>=``, ``<`` and ``>``. It is textual only, so a ``<`` or ``>`` that
    belongs to markup inside a string literal is counted as well.
    """
    validation_function_pattern = re.compile(r'\b(?:filter_var|filter_input|filter_var_array|filter_input_array|preg_match|preg_match_all|htmlspecialchars|preg_replace|preg_replace_callback|preg_replace_callback_array|preg_split|preg_grep|preg_filter|preg_last_error|test|match|validate|check|verify|sanitize|clean|escape|encode|decode|hash|encrypt|decrypt|secure)\b', re.IGNORECASE)
    # The operators consist of non-word characters only, so wrapping them in
    # \b made them match solely when glued between word characters ("a==b")
    # and never with ordinary spacing ("$a == $b"). Lookarounds are used
    # instead; they also keep "=>", "->", "<?", "?>", "<<" and ">>" from
    # being mistaken for comparisons.
    operator_check_pattern = re.compile(r'(?<![=!<>?\-])(?:===|!==|==|!=|<=|>=|<|>)(?![=<>?])')
    if_statement_pattern = re.compile(r'\bif\s*\(\s*.*\s*\)')

    return [
        1 if if_statement_pattern.search(statement) else 0,
        1 if validation_function_pattern.search(statement) else 0,
        1 if operator_check_pattern.search(statement) else 0,
    ]

def matchObjectPrototype(statement):
    """Flag JavaScript constructs associated with prototype pollution.

    Args:
        statement: one line of source text.

    Returns:
        A list of eight 0/1 flags, in this order: prototype assignment,
        object-literal assignment, ``Object.assign``/``setPrototypeOf`` with
        an object literal, ``JSON.parse`` of an identifier, negated
        ``hasOwnProperty`` check, ``x = y || {};`` default, dynamic property
        assignment (``a[b] = ...``) and ``const x = a[b];`` copy.
    """
    # Order matters: it defines the position of each flag in the result.
    rules = (
        r'Object\.prototype\.[\w$]+\s*=\s*.+',
        r'([\w$]+|Object)\s*=\s*{[\w$]+:\s*.+,',
        r'Object\.(assign|setPrototypeOf)\s*\([\w$]+\s*,\s*{[\w$]+:\s*.+}\s*\)',
        r'JSON\.parse\s*\([\w$]+\s*\)',
        r'\bif\s*\(\s*!\s*[\w$]+\s*\.hasOwnProperty\s*\(\s*[\w$]+\s*\)\s*\)\s*{',
        r'[\w$]+\s*=\s*[\w$]+\s*\|\|\s*{};',
        r'[\w$]+\s*\[\s*[\w$]+\s*\]\s*=\s*.+',
        r'const\s+[\w$]+\s*=\s*[\w$]+\s*\[\s*[\w$]+\s*\];',
    )
    return [1 if re.search(rule, statement) else 0 for rule in rules]

In [10]:
# Helper scripts that print the variable names found in a source file.
JS_VARIABLE_EXTRACTOR = os.path.join("parsergen", "get_vars.js")
PHP_VARIABLE_EXTRACTOR = os.path.join("parsergen", "get_vars.php")

def extract_variables(file, lang):
    """Run the external extractor for ``lang`` on ``file`` and return its names.

    The extractor's stdout is decoded as UTF-8 and treated as a
    comma-separated list. Surrounding whitespace (including the trailing
    newline of the output) is stripped from every name and empty entries are
    dropped, so an extractor that prints nothing yields ``[]`` rather than
    ``['']``.

    Args:
        file: path of the source file to analyse.
        lang: ``"js"`` or ``"php"``.

    Returns:
        list[str]: the variable names, in the order printed by the extractor.

    Raises:
        Exception: if ``lang`` is neither ``"js"`` nor ``"php"``.
        subprocess.CalledProcessError: if the extractor exits with a non-zero
            status (stderr is captured and attached to the error).
    """
    if lang == "js":
        cmd = ["node", JS_VARIABLE_EXTRACTOR, "module", file]
    elif lang == "php":
        cmd = ["php", PHP_VARIABLE_EXTRACTOR, file]
    else:
        raise Exception("Unsupported language")

    raw_output = subprocess.check_output(cmd, stderr=subprocess.PIPE).decode("utf-8")
    # Without stripping, the last name keeps the trailing newline and can
    # never be found in the cleaned source lines.
    names = (name.strip() for name in raw_output.split(','))
    return [name for name in names if name]

In [11]:
def code_cleaner(filename):
    """Read a source file and reduce it to an array of normalised code lines.

    Steps, in order:

    1. Drop ``/* ... */`` comments (may span lines).
    2. Drop ``//`` and ``#`` comments up to the end of the line, leaving
       single-line quoted strings untouched.
    3. Remove whitespace after ``[ ( , .`` and before ``] ) , . ;`` and a
       trailing comma before ``)`` or ``]``. Because newlines are
       whitespace, this also joins statements split across lines there.
    4. Split on newlines, strip each line, and drop one trailing ``;`` or
       ``{``.
    5. Discard lines made only of symbols, lines that are a single word or
       number, lines starting with ``<?php`` or ``?>``, and empty lines.

    Args:
        filename: path of the file to read (opened in text mode).

    Returns:
        numpy.ndarray of str: the remaining lines, in file order.
    """
    with open(filename, 'r') as source:
        text = source.read()

    # Block comments first, so their content cannot confuse later steps.
    text = re.sub(r'/\*(.*?)\*/', '', text, flags=re.DOTALL)

    def _drop_unless_string(found):
        # The pattern matches either a quoted string or a line comment;
        # strings are put back unchanged, comments are replaced by nothing.
        token = found.group(0)
        return token if token.startswith(('"', "'")) else ''

    text = re.sub(r'(?<!\\)(["\'])(?:\\.|(?!\1).)*?\1|//.*?$|#.*?$',
                  _drop_unless_string, text, flags=re.MULTILINE)

    # Whitespace normalisation around brackets and separators, then removal
    # of a trailing comma before a closing bracket. Order is significant.
    for rule in (r'(\[|\(|,|\.)\s+', r'\s+(\]|\)|,|\.|;)', r',(\s*[\]\)])'):
        text = re.sub(rule, r'\1', text)

    lines = np.char.strip(np.array(text.split('\n')))

    # Remove one trailing ';' or '{' per line.
    lines = np.array([re.sub(r'(;|{)$', '', row).strip() for row in lines.tolist()])

    # A line is noise when it is symbols only, or a single word / number.
    noise_rules = (r'^\W+$', r'^\w+$', r'^\d+$')
    keep = np.array(
        [not any(re.match(rule, row) for rule in noise_rules) for row in lines.tolist()],
        dtype=bool,
    )
    lines = lines[keep]

    # PHP open/close tags carry no information for the later matching.
    is_php_tag = np.char.startswith(lines, '<?php') | np.char.startswith(lines, '?>')
    lines = lines[~is_php_tag]

    return lines[lines != '']

# Collects, for every JS/PHP variable name, the code lines that mention it.
def vars_references(vars, code):
    """Pair each variable with the lines of ``code`` that reference it.

    Matching is case sensitive and whole-word: the name must not be directly
    preceded or followed by a word character.

    Args:
        vars: iterable of variable names.
        code: iterable of source lines.

    Returns:
        list of ``(name, [matching lines])`` tuples, one per entry of
        ``vars`` and in the same order.
    """
    def _lines_mentioning(name):
        whole_word = re.compile(r'(?<!\w)' + re.escape(name) + r'(?!\w)')
        return [line for line in code if whole_word.search(line)]

    return [(name, _lines_mentioning(name)) for name in vars]

def check_variable_usage(code_snippet):
    """Test a snippet against three user-input source patterns.

    Args:
        code_snippet: one line of source text.

    Returns:
        tuple of three bools ``(php, js, express_js)``:

        * php - a ``$_GET``-style superglobal, or one of the bare upper-case
          words GET/POST/REQUEST/SERVER/COOKIE/ENV/FILES (case sensitive).
        * js - ``<word>.body|params|query|headers`` (case-insensitive).
        * express_js - the js list plus
          ``param|queryparam|get|post|paramfrom`` (case-insensitive).
    """
    source_rules = (
        # PHP superglobals / request words
        re.compile(r'(?:(\$_(?:GET|POST|REQUEST|SERVER|COOKIE|ENV|FILES)\b)|\b(?:GET|POST|REQUEST|SERVER|COOKIE|ENV|FILES)\b)\b'),
        # plain JS request properties
        re.compile(r'(?:\w+)\.(?:body|params|query|headers)', re.IGNORECASE),
        # Express-style request accessors
        re.compile(r'(?:\w+)\.(?:body|params|query|headers|param|queryparam|get|post|paramfrom)', re.IGNORECASE),
    )
    return tuple(rule.search(code_snippet) is not None for rule in source_rules)

def tainted_variables(references):
    """List the variables that appear on a line reading user input.

    Args:
        references: iterable of ``(name, snippets)`` pairs as produced by
            ``vars_references``.

    Returns:
        list of names for which at least one snippet triggers any pattern of
        ``check_variable_usage``; each name appears once, in order of first
        appearance.
    """
    flagged = {}  # dict used as an insertion-ordered set
    for name, usages in references:
        if any(any(check_variable_usage(usage)) for usage in usages):
            flagged.setdefault(name, None)
    return list(flagged)

def tainted_vars_snippets(references, tainted_variables, variables):
    """Extend the tainted set with co-occurring variables and gather snippets.

    For every snippet in ``references``, each name from ``variables`` that
    occurs in it as a whole word (same rule as ``vars_references``: not
    directly preceded or followed by a word character) and differs from the
    snippet's own variable is added to ``tainted_variables``. A plain
    substring test is deliberately not used, because it would make ``$a``
    "occur" in a line that only mentions ``$abc``.

    NOTE(review): propagation starts from every entry of ``references``, not
    only from variables that are already tainted - confirm this is intended.

    Side effects: ``tainted_variables`` is extended in place, and
    ``references`` gains a ``(name, [])`` entry for any newly tainted name it
    did not contain yet.

    Args:
        references: list of ``(name, snippets)`` pairs.
        tainted_variables: list of names already considered tainted.
        variables: iterable of all candidate variable names; empty names are
            ignored.

    Returns:
        list of ``(tainted_name, snippets)`` tuples, one per entry of
        ``tainted_variables`` (after extension), where ``snippets`` collects
        the snippets of every ``references`` entry with that name.
    """
    # One whole-word matcher per candidate, compiled once instead of per snippet.
    matchers = [
        (variable, re.compile(r'(?<!\w)' + re.escape(variable) + r'(?!\w)'))
        for variable in variables
        if variable
    ]
    known = {name for name, _ in references}

    # Iterate over a snapshot: entries appended below carry no snippets, so
    # skipping them changes nothing and avoids mutating a list mid-iteration.
    for var, snippets in list(references):
        for snippet in snippets:
            for variable, matcher in matchers:
                if variable == var or not matcher.search(snippet):
                    continue
                if variable not in known:
                    references.append((variable, []))
                    known.add(variable)
                if variable not in tainted_variables:
                    tainted_variables.append(variable)

    # Group snippets by variable name once, then emit them in tainted order.
    snippets_by_name = {}
    for name, snippets in references:
        snippets_by_name.setdefault(name, []).extend(snippets)

    return [
        (tainted_var, list(snippets_by_name.get(tainted_var, [])))
        for tainted_var in tainted_variables
    ]


def begin_preprocessing(variables, file):
    """Run the cleaning and taint pipeline for one file.

    The file is cleaned into lines, each variable is paired with the lines
    that reference it, the variables touching user input are identified, and
    the result of ``tainted_vars_snippets`` is returned.

    Args:
        variables: variable names extracted from ``file``.
        file: path of the source file.

    Returns:
        Whatever ``tainted_vars_snippets`` returns for these inputs.
    """
    cleaned_lines = code_cleaner(file)
    refs = vars_references(variables, cleaned_lines)
    return tainted_vars_snippets(refs, tainted_variables(refs), variables)

def grab_pattern(tainted_varsnippets):
    """Build the L1-normalised 6x8 feature matrix for the tainted snippets.

    Every snippet of every tainted variable is passed to the six matchers
    (SQL, HTML, dangerous functions, imports, validations, object
    prototype). Their 0/1 results are summed per matcher over all snippets;
    each matcher owns one row, left-aligned and zero-padded to eight
    columns. Rows are then L1-normalised with ``normalize``.

    Args:
        tainted_varsnippets: iterable of ``(variable, snippets)`` pairs. The
            variable name itself does not influence the result.

    Returns:
        numpy.ndarray of shape (6, 8) and float dtype. An all-zero matrix is
        returned when there is no snippet at all - including the case where
        variables are present but none of them has a snippet, which
        previously produced a (6, 0) matrix that was handed to
        ``normalize``.
    """
    n_features = 8  # widest matcher result (matchObjectPrototype)

    snippets = [
        snippet
        for _var, var_snippets in tainted_varsnippets
        for snippet in var_snippets
    ]
    if not snippets:
        return np.zeros((6, n_features), dtype=float)

    # Row order of the resulting matrix.
    matchers = (
        matchSqlStament,
        matchHTMLTags,
        matchDangerousFunctions,
        matchImportFunctions,
        matchValidations,
        matchObjectPrototype,
    )

    pattern = np.zeros((len(matchers), n_features), dtype=int)
    for snippet in snippets:
        for row, matcher in enumerate(matchers):
            counts = matcher(snippet)
            # Shorter results fill the leading columns; the rest stay zero.
            pattern[row, :len(counts)] += counts

    # Normalize the matrix row-wise
    return normalize(pattern, axis=1, norm='l1')

def preprocess(file, lang):
    """Turn one source file into its 6x8 feature matrix.

    Args:
        file: path of the source file.
        lang: ``"js"`` or ``"php"``.

    Returns:
        The matrix produced by ``grab_pattern``, or a 6x8 float matrix of
        zeros when no variables were extracted.

    Raises:
        Exception: if ``lang`` is neither ``"js"`` nor ``"php"``.
    """
    if lang not in ("js", "php"):
        raise Exception("Unsupported language")

    variables = extract_variables(file, lang)

    # Nothing to analyse: return the neutral matrix.
    if len(variables) == 0:
        return np.zeros((6, 8), dtype=float)

    return grab_pattern(begin_preprocessing(variables, file))


In [None]:
# Directories holding the sample sources, one per language.
php_samples_loc = 'parsergen/php_samples'
js_samples_loc = 'parsergen/js_samples'

# Full paths of the regular files directly inside each directory
# (sub-directories are skipped).
php_samples = [
    path
    for path in (os.path.join(php_samples_loc, entry) for entry in os.listdir(php_samples_loc))
    if os.path.isfile(path)
]
js_samples = [
    path
    for path in (os.path.join(js_samples_loc, entry) for entry in os.listdir(js_samples_loc))
    if os.path.isfile(path)
]