In [1]:
import numpy as np
from scipy import stats

In [2]:
# Make sure the input data only includes 1s and 0s
# Assume any lines with other chars are human-generated (human error)

def clean_data_test(line):
    """Return True when ``line`` contains any character other than '0' or '1'.

    Parameters
    ----------
    line : str
        The candidate string (already stripped of surrounding whitespace).

    Returns
    -------
    bool
        True if at least one character is not '0' or '1'; False otherwise,
        including for the empty string.
    """
    allowed = {'0', '1'}
    # A strict-superset check (set(line) > allowed) would miss lines such as
    # '1x' that lack one of the two digits, so test for "not a subset" instead.
    return not set(line) <= allowed

In [3]:
# Simple two-sided binomial test: flag the sample when its share of 1s differs significantly from p=0.5

def binomial_test(line):
    """Return True when the share of '1's in ``line`` is unlikely under p=0.5.

    Runs a two-sided binomial test of the number of '1' characters against
    the length of the line and rejects at the 5% level.

    Parameters
    ----------
    line : str
        String of '0' and '1' characters.

    Returns
    -------
    bool
        True if the p-value is below 0.05. An empty line carries no evidence
        and returns False.
    """
    trials = len(line)
    if trials == 0:
        # binomtest requires at least one trial.
        return False

    ones = line.count('1')
    # stats.binom_test was removed in SciPy 1.12; prefer its replacement and
    # fall back only on SciPy versions that predate binomtest.
    if hasattr(stats, 'binomtest'):
        p = stats.binomtest(ones, trials).pvalue
    else:
        p = stats.binom_test(ones, trials)
    return bool(p < 0.05)

In [4]:
# Calculate the transition occurrences matrix, which counts occurrences of a pattern 
# n is the window size: the n-1 previous chars plus the current char
# The columns represent the previous state, rows represent the current state
# ie for n=2 (previous -> current) [0 -> 0, 1 -> 0]
#                                  [0 -> 1, 1 -> 1]
# Dividing each by the sum of its column will get the probability of that pattern transition
# Flattening the matrix will get the number of occurrences of each permutation at size n

def get_transition_matrix(line, n):
    """Count how often each (previous pattern, current char) pair occurs.

    Parameters
    ----------
    line : str
        String of '0' and '1' characters.
    n : int
        Window size: the n-1 preceding characters plus the current one.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (2, 2**(n-1)). The row is the current
        character (0 or 1); the column is the preceding n-1 characters read
        as a binary number. All zeros when n < 2.
    """
    context_len = n - 1
    counts = np.zeros((2, 2**context_len), dtype=int)

    if n >= 2:
        for end in range(context_len, len(line)):
            # The n-1 characters immediately before position `end`.
            history = line[end - context_len:end]
            counts[int(line[end]), int(history, 2)] += 1

    return counts

In [5]:
# Test the hypothesis that p=0.5 (iid) for each transition

def markov_test(line, n):
    """Return True when any transition of window size ``n`` deviates from p=0.5.

    For each previous pattern (one column of the transition matrix), runs a
    two-sided binomial test on the counts of a following '0' versus a
    following '1'.

    Parameters
    ----------
    line : str
        String of '0' and '1' characters.
    n : int
        Window size passed to ``get_transition_matrix``.

    Returns
    -------
    bool
        True as soon as one column has a p-value below 0.05, otherwise False.
    """
    transition_matrix = get_transition_matrix(line, n)

    for column in transition_matrix.T:
        zeros, ones = int(column[0]), int(column[1])
        total = zeros + ones
        if total == 0:
            # The pattern never occurred: there is nothing to test, and
            # binomtest rejects zero trials.
            continue

        # stats.binom_test was removed in SciPy 1.12; prefer its replacement
        # and fall back only on SciPy versions that predate binomtest.
        if hasattr(stats, 'binomtest'):
            p = stats.binomtest(zeros, total).pvalue
        else:
            p = stats.binom_test([zeros, ones])
        if p < 0.05:
            return True
    return False

def run_markov_tests(line):
    """Run ``markov_test`` for window sizes 2 and 3.

    Returns True at the first window size that flags ``line``, and False
    when neither does.
    """
    return any(markov_test(line, window) for window in range(2, 4))

In [6]:
# Calculate average random standard deviations for transition matrices of size n

def get_random_st_devs():
    """Average standard deviation of pattern counts over random strings.

    Generates 1000 random lines with ``generate_train_data`` and, for each
    window size n from 2 to 5, averages the standard deviation of the
    flattened transition matrix across those lines.

    Returns
    -------
    dict
        Maps each window size n (2..5) to its average standard deviation.
    """
    sample_size = 1000
    random_data = generate_train_data(sample_size)

    def average_st_dev(n):
        # Sum sequentially from 0, then divide by the sample size.
        total = sum(
            np.std(get_transition_matrix(sample, n).flatten())
            for sample in random_data
        )
        return total / sample_size

    return {n: average_st_dev(n) for n in range(2, 6)}

In [7]:
# Identify if there are any patterns used significantly more than average throughout a sample

def pattern_matching(line, n, st_dev):
    """Return True when any pattern of window size ``n`` is heavily over-used.

    A pattern is flagged when its count exceeds the expected mean count by
    more than three times ``st_dev``.

    Parameters
    ----------
    line : str
        String of '0' and '1' characters.
    n : int
        Window size passed to ``get_transition_matrix``.
    st_dev : float
        Reference standard deviation for this window size.

    Returns
    -------
    bool
        True if at least one pattern count is above the threshold.
    """
    counts = get_transition_matrix(line, n).flatten()
    # NOTE(review): the matrix only counts len(line) - n + 1 windows, so using
    # len(line) slightly overstates the mean count; confirm this is intended.
    expected = len(line) / len(counts)
    threshold = expected + st_dev * 3

    return any(count > threshold for count in counts)

def run_pattern_matching(line, st_devs):
    """Run ``pattern_matching`` for window sizes 3, 4 and 5.

    ``st_devs`` maps a window size to its reference standard deviation.
    Returns True at the first window size that flags ``line``, else False.
    """
    return any(
        pattern_matching(line, window, st_devs.get(window))
        for window in range(3, 6)
    )

In [8]:
# Run different tests in order and mark results[index] as true if it detects human-generated behaviour

def process_data(data):
    """Classify every line of ``data`` as human-generated or not.

    Each line is stripped and passed through the tests in order: character
    check, binomial test, Markov tests, pattern matching. The first test
    that fires marks the line True and the remaining tests are skipped.

    Parameters
    ----------
    data : list of str
        Raw input lines.

    Returns
    -------
    dict
        Maps each line index to True (flagged as human-generated) or False.
    """
    st_devs = get_random_st_devs()

    def looks_human(raw_line):
        line = raw_line.strip()
        # `or` short-circuits, so later tests run only if earlier ones pass.
        return bool(
            clean_data_test(line)
            or binomial_test(line)
            or run_markov_tests(line)
            or run_pattern_matching(line, st_devs)
        )

    return {index: looks_human(raw_line) for index, raw_line in enumerate(data)}

In [9]:
# Generate random training data or get it from strings.txt

from random import randint, seed

def generate_line(length=150):
    """Return a random string of '0' and '1' characters.

    Parameters
    ----------
    length : int, optional
        Number of characters to generate. Defaults to 150.

    Returns
    -------
    str
        ``length`` characters, each drawn with ``randint(0, 1)``.
    """
    # join builds the string once instead of re-allocating it on every append.
    return "".join(str(randint(0, 1)) for _ in range(length))

def generate_train_data(length):
    """Return ``length`` random lines produced by ``generate_line``.

    The random module is re-seeded with 11 on every call, so repeated calls
    with the same ``length`` return the same lines.
    """
    seed(11)
    return [generate_line() for _ in range(length)]

def get_data(test):
    """Return the lines to classify.

    Parameters
    ----------
    test : bool
        When truthy, read the lines of ``strings.txt`` from the current
        working directory; otherwise generate 1000 random lines.

    Returns
    -------
    list of str
        The lines read from the file (trailing newlines kept) or the
        generated lines.
    """
    if test:
        # The context manager closes the file handle after reading.
        with open('strings.txt', 'r') as strings_file:
            return strings_file.readlines()

    return generate_train_data(1000)

In [11]:
# Test matching on all randomly generated strings

# get_data(False) returns randomly generated lines, so every flagged line is a false positive.
data = get_data(False)
training_results = process_data(data)
# The result values are booleans, so their sum is the number of flagged lines.
print('False Positive Rate: %f%%' % float(sum(training_results.values()) / len(training_results) * 100))

False Positive Rate: 29.800000%


In [12]:
# Test on human-generated strings

# Read the human-written lines; the context manager closes the file afterwards.
with open('human_strings.txt', 'r') as strings_file:
    human_strings = strings_file.readlines()
human_results = process_data(human_strings)

# The result values are booleans, so their sum is the number of flagged lines.
print('True Positive Rate: %f%%' % float(sum(human_results.values()) / len(human_strings) * 100))

True Positive Rate: 96.666667%


In [13]:
# Run tests on strings.txt to get results

# get_data(True) reads the lines of strings.txt.
data = get_data(True)
results = process_data(data)

# results maps each line index to True (flagged as human-generated) or False.
print(results)
print('Found %s human-generated strings' % sum(results.values()))

{0: True, 1: True, 2: True, 3: False, 4: True, 5: False, 6: False, 7: False, 8: False, 9: False, 10: False, 11: False, 12: True, 13: False, 14: True, 15: True, 16: False, 17: False, 18: False, 19: True, 20: True, 21: True, 22: False, 23: False, 24: True, 25: False, 26: False, 27: False, 28: True, 29: False, 30: False, 31: False, 32: False, 33: False, 34: False, 35: True, 36: False, 37: True, 38: True, 39: True, 40: True, 41: False, 42: True, 43: False, 44: True, 45: False, 46: False, 47: False, 48: True, 49: False, 50: True, 51: False, 52: False, 53: True, 54: True, 55: True, 56: False, 57: True, 58: True, 59: False, 60: True, 61: False, 62: True, 63: True, 64: False, 65: True, 66: False, 67: False, 68: False, 69: False, 70: False, 71: True, 72: False, 73: False, 74: False, 75: False, 76: False, 77: True, 78: False, 79: True, 80: False, 81: False, 82: True, 83: True, 84: False, 85: False, 86: True, 87: True, 88: True, 89: True, 90: False, 91: True, 92: True, 93: True, 94: False, 95: Fa