# Unit tests and speed comparison of the various distance functions

In [1]:
%%time

from antifraud import read_adj_dict_from_file, read_into_list_of_tuples
from graph_algorithms import Graph

# Input file locations. The paths get their own names so that `batch` and
# `stream` only ever refer to the loaded data, never to a path string.
BATCH_PATH = '../paymo_input/batch_payment.txt'
STREAM_PATH = '../paymo_input/stream_payment.txt'

# Read files.
# Turn on verbosity to see the lines that did not conform to the csv format
# (Note: it's the same 5 or so lines over and over again).
batch0 = read_adj_dict_from_file(BATCH_PATH, verbose=False)
stream = read_into_list_of_tuples(STREAM_PATH, verbose=False)

# Wrap what read_adj_dict_from_file returned in a Graph; every later cell
# queries this object.
batch = Graph(batch0)

CPU times: user 31.3 s, sys: 1.27 s, total: 32.5 s
Wall time: 32.6 s


In [2]:
# Sizes of the two adjacency structures next to the graph's node count.
(len(batch.adj), len(batch.adj2), batch.num_nodes)

(77360, 77360, 77360)

In [3]:
def pass_fail(logic, message=''):
    """Print `message` on one line, prefixed with a pass/fail tag.

    Args:
        logic: Outcome of the check; any truthy value is reported as a pass,
            any falsy value as a fail.
        message: Text printed immediately after the tag (empty by default).
    """
    tag = '[PASS]: ' if logic else '[FAIL]: '
    print(tag, end='')
    print(message)

# Test if the adjacency lists (of degrees 1 & 2) are self-consistent

This is accomplished by making sure that each adjacency list is symmetric: for adjacency list 'adj_list' and any nodes x, y of G, y is in adj_list [x] iff x is in adj_list [y].

In [4]:
%%time
# Symmetry check on the graph's adjacency lists; expected to evaluate to True.
batch.is_self_consistent()

CPU times: user 48.4 s, sys: 7.39 ms, total: 48.4 s
Wall time: 48.4 s


True

# Check if adj2 is truly inclusive

In [5]:
%%time
# Inclusiveness check on adj2 (see the heading above); expected to evaluate to True.
batch.is_inclusive()

CPU times: user 184 ms, sys: 125 µs, total: 184 ms
Wall time: 183 ms


True

# Check for consistency among distance functions
Checking for inconsistency among the functions ***self.distance***, ***self.distance_lte***, and ***self.if_lte_deg4***.

In [6]:
%%time

# Cross-check distance, distance_lte and if_lte_deg4 against one another.
# Only the first 1,000 stream pairs are examined: distance and distance_lte
# are too expensive to run on more.
mismatches = 0  # number of pairs on which the functions disagree

for pair in stream[:1000]:
    # distance returns an integer; a result above -1 is taken as
    # "within 4" (presumably -1 marks "not found within n" -- confirm in Graph)
    within4 = batch.distance(pair, n=4) > -1

    # Each of the other two answers must equal the one derived from distance
    disagrees = (batch.distance_lte(pair, n=4) != within4
                 or batch.if_lte_deg4(pair) != within4)
    if disagrees:
        mismatches += 1
        print(pair)

print('Number of inconsistent pairs: ', mismatches, end='\n\n')

Number of inconsistent pairs:  0

CPU times: user 15.1 s, sys: 7.9 ms, total: 15.1 s
Wall time: 15.1 s


# Further tests
Assuming the function ***self.distance_lte*** is accurate, use it to check if the results of ***self.if_lte_deg2*** and ***self.if_lte_deg4*** are consistent with its results.

## deg 2 or fewer

In [7]:
%%time

# Compare if_lte_deg2 with distance_lte (n = 2) on the first 100,000 stream
# pairs, printing every pair on which the two disagree.
mismatches = 0

for pair in stream[:100000]:
    deg2_answer = batch.if_lte_deg2(pair)
    reference_answer = batch.distance_lte(pair, n=2)
    if deg2_answer != reference_answer:
        mismatches += 1  # one more inconsistent pair
        print(pair)

print('Number of inconsistent pairs: ', mismatches, end='\n\n')

Number of inconsistent pairs:  0

CPU times: user 5.92 s, sys: 264 ms, total: 6.18 s
Wall time: 6.18 s


## deg 4 or fewer

In [8]:
%%time

# Compare if_lte_deg4 with distance_lte (n = 4) on the first 1,000 stream
# pairs, printing every pair on which the two disagree.
mismatches = 0

for pair in stream[:1000]:
    deg4_answer = batch.if_lte_deg4(pair)
    reference_answer = batch.distance_lte(pair, n=4)
    if deg4_answer != reference_answer:
        mismatches += 1  # one more inconsistent pair
        print(pair)

print('Number of inconsistent pairs: ', mismatches, end='\n\n')

Number of inconsistent pairs:  0

CPU times: user 5.72 s, sys: 15.9 ms, total: 5.74 s
Wall time: 5.74 s


# Comparing the time the various distance functions take

## ***self.distance***

In [9]:
%%time

# Count the pairs, among the first 1,000 in the stream, for which distance
# (n = 4) returns something above -1. The %%time report for this cell is the
# figure of interest; the count is printed so it can be compared across cells.
within4_count = sum(
    1 for pair in stream[:1000] if batch.distance(pair, n=4) > -1
)

print(within4_count, end='\n\n')

934

CPU times: user 9.58 s, sys: 6.94 ms, total: 9.59 s
Wall time: 9.59 s


## ***self.distance_lte***

In [10]:
%%time

# Count the pairs, among the first 1,000 in the stream, for which
# distance_lte (n = 4) is truthy. The %%time report for this cell is the
# figure of interest; the count is printed so it can be compared across cells.
within4_count = sum(
    1 for pair in stream[:1000] if batch.distance_lte(pair, n=4)
)

print(within4_count, end='\n\n')

934

CPU times: user 5.38 s, sys: 0 ns, total: 5.38 s
Wall time: 5.38 s


## ***self.if_lte_deg4***

In [11]:
%%time

# Count the pairs, among the first 1,000 in the stream, for which
# if_lte_deg4 is truthy. The %%time report for this cell is the figure of
# interest; the count is printed so it can be compared across cells.
within4_count = sum(
    1 for pair in stream[:1000] if batch.if_lte_deg4(pair)
)

print(within4_count, end='\n\n')

934

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 15.9 ms


## Speedup

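From the above, the time each function takes to decide whether a pair is within 4 degrees of separation, measured over the first 1,000 stream pairs, is

|function|time|times slower than ***if_lte_deg4***|
|---|---|---|
|***distance***|10.1 s |359.4|
|***distance_lte***|5.35 s|190.4|
|***if_lte_deg4***|28.1 ms|1|

Note: the figures in this table come from a different run than the cell outputs shown above, so the absolute times do not match those outputs exactly.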

# Timing the functions using only pairs that are 4 & 5 degrees apart

In [12]:
from time import time

In [13]:
# Build lists of stream pairs that batch.distance (with n = 5) reports as
# exactly 4 and exactly 5 apart, and store them in dist4 & dist5, resp.
# Each list holds at most PAIRS_PER_BUCKET pairs: the first ones encountered
# in stream order.
PAIRS_PER_BUCKET = 100

dist4, dist5 = [], []
buckets = {4: dist4, 5: dist5}  # distance value -> list collecting its pairs

for pair in stream:
    bucket = buckets.get(batch.distance(pair, n=5))

    # Ignore pairs at any other distance, and never grow a list past the cap
    # (the lengths double as the counters, so no separate tallies are kept).
    if bucket is not None and len(bucket) < PAIRS_PER_BUCKET:
        bucket.append(pair)

    # Stop scanning the stream as soon as both lists are full
    if len(dist4) >= PAIRS_PER_BUCKET and len(dist5) >= PAIRS_PER_BUCKET:
        break

## Get worst-case scenario comparison

In [14]:
# worst-case scenario (uses dist5)
#
# Method 1 is batch.if_lte_deg4, method 2 is batch.distance_lte with n = 4.
# Each is timed over the same dist5 pairs; per-pair averages are reported.

# Averages are taken over the actual number of pairs, which can be below 100
# if the stream did not contain enough pairs at this distance.
assert dist5, 'dist5 is empty: there are no pairs to time'
num_pairs = len(dist5)

t1 = time()
for pair in dist5:
    batch.if_lte_deg4(pair)

t2 = time()

for pair in dist5:
    batch.distance_lte(pair, 4)

t3 = time()

dt_method_1 = t2 - t1
dt_method_2 = t3 - t2
print('elapsed time method 1: {} us'.format(dt_method_1 * 1e6 / num_pairs))
print('elapsed time method 2: {} ms'.format(dt_method_2 * 1e3 / num_pairs))
print('dt_method_2 / dt_method_1: ', dt_method_2 / dt_method_1, end='\n\n')

elapsed time method 1: 171.45872116088867 us
elapsed time method 2: 43.9630126953125 ms
dt_method_2 / dt_method_1:  256.40581241743723



## average case scenario

In [15]:
# average case scenario (uses dist4)
#
# Method 1 is batch.if_lte_deg4, method 2 is batch.distance_lte with n = 4.
# Each is timed over the same dist4 pairs; per-pair averages are reported.

# Averages are taken over the actual number of pairs, which can be below 100
# if the stream did not contain enough pairs at this distance.
assert dist4, 'dist4 is empty: there are no pairs to time'
num_pairs = len(dist4)

t1 = time()
for pair in dist4:
    batch.if_lte_deg4(pair)

t2 = time()

for pair in dist4:
    batch.distance_lte(pair, 4)

t3 = time()

dt_method_1 = t2 - t1
dt_method_2 = t3 - t2
print('elapsed time method 1: {} us'.format(dt_method_1 * 1e6 / num_pairs))
print('elapsed time method 2: {} ms'.format(dt_method_2 * 1e3 / num_pairs))
print('dt_method_2 / dt_method_1: ', dt_method_2 / dt_method_1, end='\n\n')

elapsed time method 1: 55.07230758666992 us
elapsed time method 2: 27.256712913513184 ms
dt_method_2 / dt_method_1:  494.9259275293303

