# Unit tests and speed comparison of the various distance functions

In [3]:
%%time

from antifraud import read_adj_dict_from_file, read_into_list_of_tuples
from graph_algorithms import Graph

# Input locations. The paths get their own names so that `batch` and `stream`
# only ever refer to the loaded data, never to a file name.
batch_path  = '../paymo_input/batch_payment.txt'
stream_path = '../paymo_input/stream_payment.txt'

# Read files
# Turn on verbosity to see the lines that did not conform to the csv format (Note:
# it's the same 5 or so lines over and over again)
batch0 = read_adj_dict_from_file (batch_path, verbose = False)
stream = read_into_list_of_tuples (stream_path, verbose = False)

# Wrap the result of read_adj_dict_from_file in a Graph; the cells below query it through `batch`
batch = Graph (batch0)

CPU times: user 14.6 s, sys: 292 ms, total: 14.9 s
Wall time: 14.8 s


In [2]:
# sanity check: compare the number of entries in the adjacency structure with the graph's node count
len (batch.adj), batch.num_nodes

(77360, 77360)

In [3]:
def pass_fail (logic, message = ''):
    """Print a one-line test verdict.

    The line starts with '[PASS]: ' when `logic` is truthy and with
    '[FAIL]: ' otherwise, followed by `message`.
    """
    tag = '[PASS]: ' if logic else '[FAIL]: '
    # the tag is printed without a newline so the message lands on the same line
    print (tag, end = '')
    print (message)

# Test if the adjacency list is self-consistent

This is accomplished by making sure that it is symmetric: for adjacency list 'adj' and any nodes x, y of G, y is in adj [x] iff x is in adj [y].

In [4]:
# run the symmetry check described above on the adjacency list; %time reports how long it takes
%time batch.is_self_consistent ()

CPU times: user 431 ms, sys: 0 ns, total: 431 ms
Wall time: 429 ms


True

# Check for consistency among distance functions
Checking for inconsistency among the functions ***self.distance*** and ***self.distance_lte***

In [5]:
%%time

# Cross-check distance against distance_lte: count the pairs on which they disagree

total = 0 # number of disagreeing pairs seen so far

# both searches are too expensive to run on the whole stream, so only the first 1,000 pairs are used
for pair in stream [:1000]:

    # distance returns an integer; a value above -1 is read as "within 4 degrees"
    by_distance = batch.distance (pair, n = 4) > -1
    by_lte = batch.distance_lte (pair, n = 4)

    # report every pair for which the two answers differ
    if by_lte != by_distance:
        print (pair)
        total += 1


print ('Number of inconsistent pairs: ', total, end = '\n\n')

Number of inconsistent pairs:  0

CPU times: user 15.2 s, sys: 10 ms, total: 15.2 s
Wall time: 15.2 s


# Further tests
Assuming the function ***self.distance_lte*** is accurate, use it to check if the results of ***self.if_lte_deg2*** and ***self.if_lte_deg4*** are consistent with its results.

## deg 2 or fewer

In [6]:
%%time

# Compare if_lte_deg2 with distance_lte (n = 2) over the first 100,000 stream pairs

total = 0  # number of pairs on which the two functions disagree
for pair in stream [:100000]:
    fast_answer = batch.if_lte_deg2 (pair)
    reference_answer = batch.distance_lte (pair, n = 2)
    if fast_answer != reference_answer:
        print (pair)  # show the offending pair
        total += 1

print ('Number of inconsistent pairs: ', total, end = '\n\n')

Number of inconsistent pairs:  0

CPU times: user 5.61 s, sys: 3.95 ms, total: 5.61 s
Wall time: 5.58 s


In [7]:
# hand-picked pair that the next few cells use to spot-check the distance functions
pair = (34762, 9166)

In [8]:
# distance reported for the pair, next to distance_lte's answer for a limit of 2
batch.distance (pair), batch.distance_lte (pair, 2)

(3, False)

In [9]:
# evaluate the degree-1, degree-2 and degree-4 checks on the same pair
batch.if_lte_deg1 (pair), batch.if_lte_deg2 (pair), batch.if_lte_deg4 (pair)

(False, False, True)

In [10]:
# look at the adjacency entries directly: is each node listed under the other,
# and how large is the intersection of their two adjacency entries?
a, b = pair
a in batch.adj [b], b in batch.adj [a], len (batch.adj [a] & batch.adj [b])

(False, False, 0)

## deg 4 or fewer

In [11]:
%%time

# Compare if_lte_deg4 with distance_lte (n = 4) over the first 1,000 stream pairs

total = 0  # number of pairs on which the two functions disagree
for pair in stream [:1000]:
    fast_answer = batch.if_lte_deg4 (pair)
    reference_answer = batch.distance_lte (pair, n = 4)
    if fast_answer != reference_answer:
        print (pair)  # show the offending pair
        total += 1

print ('Number of inconsistent pairs: ', total, end = '\n\n')

Number of inconsistent pairs:  0

CPU times: user 5.57 s, sys: 1.57 ms, total: 5.57 s
Wall time: 5.4 s


# Comparing the time the various distance functions take

## ***self.distance***

In [12]:
%%time

# count the pairs (out of the first 1,000) for which distance with n = 4 returns a value above -1
total = 0
for pair in stream [:1000]:
    result = batch.distance (pair, n = 4)
    if result > -1:
        total += 1

print (total, end = '\n\n')

934

CPU times: user 9.69 s, sys: 4.21 ms, total: 9.7 s
Wall time: 9.58 s


## ***self.distance_lte***

In [13]:
%%time

# count the pairs (out of the first 1,000) for which distance_lte with n = 4 is truthy
total = 0
for pair in stream [:1000]:
    within_limit = batch.distance_lte (pair, n = 4)
    if within_limit:
        total += 1

print (total, end = '\n\n')

934

CPU times: user 5.52 s, sys: 3.78 ms, total: 5.52 s
Wall time: 5.5 s


## ***self.if_lte_deg4***

In [20]:
%%timeit

# count the pairs (out of the first 1,000) for which if_lte_deg4 is truthy
total = 0
for pair in stream [:1000]:
    within_limit = batch.if_lte_deg4 (pair)
    if within_limit:
        total += 1

# the total is not printed here, presumably because %%timeit executes the cell many times
#print (total, end = '\n\n')

61.8 ms ± 2.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Speedup

From the above, the time each function takes to process the first 1,000 stream pairs (checking the degree of separation up to deg 4) is

|function|time|times slower than ***if_lte_deg4***|
|---|---|---|
|***distance***|9.44 s |143.7|
|***distance_lte***|5.34 s|81.3|
|***if_lte_deg4***|65.7 ms|1|

# Timing the functions using only pairs that are 4 or 5 degrees apart

In [15]:
from time import time

In [16]:
# Build lists of stream pairs whose distance is exactly 4 (dist4) or exactly 5 (dist5).
# Scanning stops as soon as both lists hold NUM_PAIRS entries; if the stream runs out
# first, the lists are simply shorter.

NUM_PAIRS = 100  # how many pairs of each distance to keep

dist4, dist5 = [], []

for pair in stream:
    dist = batch.distance (pair, n = 5)

    if dist == 5:
        dist5.append (pair)
    elif dist == 4:
        dist4.append (pair)

    # the list lengths are the counts, so no separate counters are needed
    if len (dist4) >= NUM_PAIRS and len (dist5) >= NUM_PAIRS:
        break

# one list can overshoot while the other is still filling up; keep only the first NUM_PAIRS of each
dist4 = dist4 [:NUM_PAIRS]
dist5 = dist5 [:NUM_PAIRS]

## Get worst-case scenario comparison

In [17]:
# worst-case scenario (uses dist5)
# method 1 is if_lte_deg4, method 2 is distance_lte with a limit of 4

# average over the pairs actually collected; max (..., 1) avoids dividing by zero on an empty list
num_pairs = max (len (dist5), 1)

t1 = time ()
for pair in dist5:
    batch.if_lte_deg4 (pair)

t2 = time ()

# only the call itself belongs in the timed loop
for pair in dist5:
    batch.distance_lte (pair, 4)

t3 = time ()

dt_method_1 = t2 - t1
dt_method_2 = t3 - t2

# per-pair averages: method 1 in microseconds, method 2 in milliseconds
print ('elapsed time method 1: {} us'.format (dt_method_1 * 1e6 / num_pairs))
print ('elapsed time method 2: {} ms'.format (dt_method_2 * 1e3 / num_pairs))
print ('dt_method_2 / dt_method_1: ', dt_method_2 / dt_method_1, end = '\n\n')

elapsed time method 1: 83.21285247802734 us
elapsed time method 2: 42.61211395263672 ms
dt_method_2 / dt_method_1:  512.0857257463756



## average case scenario

In [18]:
# average case scenario (uses dist4)
# method 1 is if_lte_deg4, method 2 is distance_lte with a limit of 4

# average over the pairs actually collected; max (..., 1) avoids dividing by zero on an empty list
num_pairs = max (len (dist4), 1)

t1 = time ()
for pair in dist4:
    batch.if_lte_deg4 (pair)

t2 = time ()

for pair in dist4:
    batch.distance_lte (pair, 4)

t3 = time ()

dt_method_1 = t2 - t1
dt_method_2 = t3 - t2

# per-pair averages: method 1 in microseconds, method 2 in milliseconds
print ('elapsed time method 1: {} us'.format (dt_method_1 * 1e6 / num_pairs))
print ('elapsed time method 2: {} ms'.format (dt_method_2 * 1e3 / num_pairs))
print ('dt_method_2 / dt_method_1: ', dt_method_2 / dt_method_1, end = '\n\n')

elapsed time method 1: 241.5609359741211 us
elapsed time method 2: 26.98106050491333 ms
dt_method_2 / dt_method_1:  111.69463471446338

