In [1]:
%%time

from antifraud import read_adj_dict_from_file, read_into_list_of_tuples
from graph_algorithms import Graph

# Input file locations (kept under their own names so that `batch` and `stream`
# only ever refer to the loaded data, never to a path string)
batch_path  = '../paymo_input/batch_payment.txt'
stream_path = '../paymo_input/stream_payment.txt'

# Read files
# Turn on verbosity to see the lines that did not conform to the csv format (Note:
# it's the same 5 or so lines over and over again)
batch0 = read_adj_dict_from_file (batch_path, verbose = False)
stream = read_into_list_of_tuples (stream_path, verbose = False)

# Wrap the batch data in a Graph; later cells call methods on `batch` and
# iterate over the pairs in `stream`
batch = Graph (batch0)

CPU times: user 13.3 s, sys: 244 ms, total: 13.6 s
Wall time: 13.6 s


In [2]:
%%time
# NOTE(review): presumably precomputes the adjacency data (e.g. the adj2 referred to
# further down) that the later checks and distance queries rely on — confirm in
# graph_algorithms.Graph
batch.build_adj ()

CPU times: user 17 s, sys: 1.11 s, total: 18.1 s
Wall time: 18.1 s


In [3]:
%%time
# Sanity check on the graph; the cell displays the method's return value
# (True in the recorded run, after roughly 48 s).
# NOTE(review): the exact invariant being checked is defined in graph_algorithms.Graph — confirm there
batch.is_self_consistent ()

CPU times: user 48.5 s, sys: 7.94 ms, total: 48.5 s
Wall time: 48.6 s


True

# Check if adj2 is truly inclusive

In [4]:
%%time
# Displays the method's return value (True in the recorded run).
# NOTE(review): per the heading above this verifies that adj2 is "inclusive"; the precise
# meaning of that property is defined in graph_algorithms.Graph — confirm there
batch.is_inclusive ()

CPU times: user 178 ms, sys: 4.03 ms, total: 182 ms
Wall time: 181 ms


True

# Check for consistency among distance functions
Checking for inconsistencies among the functions ***self.distance***, ***self.distance_lte***, and ***self.if_lte_deg4***.

In [5]:
%%time

# Cross-check self.distance, self.distance_lte and self.if_lte_deg4 against each other (n = 4)

total = 0 # running count of pairs on which the functions disagree

# self.distance and self.distance_lte are expensive, so only the first 1,000 pairs are examined
for pair in stream [:1000]:

    # self.distance returns an integer; a result above -1 is taken as the reference answer
    within4 = batch.distance (*pair, n = 4) > -1

    # a pair is inconsistent as soon as either of the other two functions
    # disagrees with the reference (if_lte_deg4 is only consulted when needed)
    lte_disagrees = batch.distance_lte (*pair, n = 4) != within4
    if lte_disagrees or batch.if_lte_deg4 (pair) != within4:
        total += 1
        print (pair)

print ('Number of inconsistent pairs: ', total, end = '\n\n')

Number of inconsistent pairs:  0

CPU times: user 16.4 s, sys: 3.91 ms, total: 16.4 s
Wall time: 16.4 s


# Further tests
Assuming the function ***self.distance_lte*** is accurate, use it to check if the results of ***self.if_lte_deg2*** and ***self.if_lte_deg4*** are consistent with its results.

## deg 2 or fewer

In [6]:
%%time

# Compare if_lte_deg2 with distance_lte (n = 2) on the first 100,000 stream pairs

total = 0
for pair in stream [:100000]:
    fast_answer = batch.if_lte_deg2 (pair)
    reference   = batch.distance_lte (*pair, n = 2)

    if fast_answer != reference:
        total += 1    # one more disagreement
        print (pair)  # show the offending pair

print ('Number of inconsistent pairs: ', total, end = '\n\n')

Number of inconsistent pairs:  0

CPU times: user 5.43 s, sys: 0 ns, total: 5.43 s
Wall time: 5.43 s


## deg 4 or fewer

In [7]:
%%time

# Compare if_lte_deg4 with distance_lte (n = 4) on the first 1,000 stream pairs

total = 0
for pair in stream [:1000]:
    disagree = batch.if_lte_deg4 (pair) != batch.distance_lte (*pair, n = 4)
    if disagree:
        print (pair)
        total += 1

print ('Number of inconsistent pairs: ', total, end = '\n\n')

Number of inconsistent pairs:  0

CPU times: user 5.45 s, sys: 0 ns, total: 5.45 s
Wall time: 5.45 s


# Comparing the time the various distance functions take

## ***self.distance***

In [8]:
%%time

# Count the pairs (out of the first 1,000) for which self.distance returns a value above -1
total = sum (1 for pair in stream [:1000] if batch.distance (*pair, n = 4) > -1)

print (total, end = '\n\n')

934

CPU times: user 9.73 s, sys: 4 ms, total: 9.74 s
Wall time: 9.74 s


## ***self.distance_lte***

In [9]:
%%time

# Count the pairs (out of the first 1,000) for which self.distance_lte returns a truthy result
total = sum (1 for pair in stream [:1000] if batch.distance_lte (*pair, n = 4))

print (total, end = '\n\n')

934

CPU times: user 5.54 s, sys: 0 ns, total: 5.54 s
Wall time: 5.54 s


## ***self.if_lte_deg4***

In [21]:
%%time

# Count the pairs (out of the first 1,000) for which self.if_lte_deg4 returns a truthy result
total = sum (1 for pair in stream [:1000] if batch.if_lte_deg4 (pair))

print (total, end = '\n\n')

934

CPU times: user 16.7 ms, sys: 7 µs, total: 16.7 ms
Wall time: 16.6 ms


## Speedup

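From the above, the time each function takes to check the first 1,000 stream pairs (up to degree 4) is summarised below. Note that the figures in the table were recorded on a separate run, so they differ from the cell outputs shown above.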

|function|time|times slower than ***if_lte_deg4***|
|---|---|---|
|***distance***|10.1 s |359.4|
|***distance_lte***|5.35 s|190.4|
|***if_lte_deg4***|28.1 ms|1|

# Timing the functions using only pairs of deg 4 & 5 apart

In [11]:
from time import time

In [12]:
# Build lists of stream pairs whose separation, as reported by batch.distance (*pair, n = 5),
# is exactly 4 (stored in dist4) or exactly 5 (stored in dist5).

n_wanted = 100  # number of pairs to keep in each list

dist4, dist5 = [], []

for pair in stream:
    dist = batch.distance (*pair, n = 5)

    if dist == 5:
        dist5.append (pair)
    elif dist == 4:
        dist4.append (pair)

    # stop scanning the stream as soon as both lists hold enough pairs
    if len (dist4) >= n_wanted and len (dist5) >= n_wanted:
        break

# one list may have kept growing while the other was still filling up,
# so trim both down to the first n_wanted pairs
dist4 = dist4 [:n_wanted]
dist5 = dist5 [:n_wanted]

## Get worst-case scenario comparison

In [16]:
# worst-case scenario (uses dist5): the pairs in dist5 were selected as being 5 apart,
# so neither function should find a connection within 4 degrees.
# Method 1 is batch.if_lte_deg4, method 2 is batch.distance_lte with n = 4.

# average over the actual number of pairs timed; `or 1` avoids a division by zero
# if dist5 happens to be empty
n_pairs = len (dist5) or 1

t1 = time ()
for pair in dist5:
    batch.if_lte_deg4 (pair)

t2 = time ()

# only the call itself belongs inside the timed loop
for pair in dist5:
    batch.distance_lte (*pair, 4)

t3 = time ()

dt_method_1 = t2 - t1
dt_method_2 = t3 - t2

# per-pair averages: method 1 in microseconds, method 2 in milliseconds
print ('elapsed time method 1: {} us'.format (dt_method_1 * 1e6 / n_pairs))
print ('elapsed time method 2: {} ms'.format (dt_method_2 * 1e3 / n_pairs))
print ('dt_method_2 / dt_method_1: ', dt_method_2 / dt_method_1, end = '\n\n')

elapsed time method 1: 218.28413009643555 us
elapsed time method 2: 44.0523886680603 ms
dt_method_2 / dt_method_1:  201.81214570476763



## average case scenario

In [19]:
# average case scenario (uses dist4)
# Method 1 is batch.if_lte_deg4, method 2 is batch.distance_lte with n = 4.

started = time ()
for pair in dist4:
    batch.if_lte_deg4 (pair)

switched = time ()   # end of method 1, start of method 2

for pair in dist4:
    batch.distance_lte (*pair, 4)

finished = time ()

dt_method_1, dt_method_2 = switched - started, finished - switched

# per-pair averages over the 100 pairs: microseconds for method 1, milliseconds for method 2
per_pair_us = dt_method_1 * 1e6 / 100
per_pair_ms = dt_method_2 * 1e3 / 100

print ('elapsed time method 1: {} us'.format (per_pair_us))
print ('elapsed time method 2: {} ms'.format (per_pair_ms))
print ('dt_method_2 / dt_method_1: ', dt_method_2 / dt_method_1, end = '\n\n')

elapsed time method 1: 33.84113311767578 us
elapsed time method 2: 29.497816562652588 ms
dt_method_2 / dt_method_1:  871.6556995913767

