In [1]:
%%time

from antifraud import read_adj_dict_from_file, read_into_list_of_tuples
from graph_algorithms import Graph

# set file names
batch  = '../paymo_input/batch_payment.txt'
stream = '../paymo_input/stream_payment.txt'

# Read files
# Turn on verboisity to see the lines that did not conform to the csv format (Note: 
# it's the same 5 or so lines over and over again)
batch0 = read_adj_dict_from_file (batch, verbose = False)
stream = read_into_list_of_tuples (stream, verbose = False) 

# attach
batch = Graph (batch0)

CPU times: user 13.6 s, sys: 391 ms, total: 14 s
Wall time: 14 s


In [8]:
%%time

# 1st order adjacent lists
adj_ex = batch0.copy ()
# inclusive version
adj = adj_ex.copy ()
for key in adj.keys ():
    adj [key].add (key)   # adds itself (separated by deg 0)
    
    
    
# building degree 2 adjacent lists
adj2 = {}      # degs 0, 1, & 2
for key in adj_ex.keys ():
    tmp = set ()
    for node in adj_ex [key]:
        tmp.update (adj_ex [node])
    adj2 [key] = tmp


CPU times: user 16.4 s, sys: 27.9 ms, total: 16.4 s
Wall time: 16.4 s


# Check if adj2 is truly inclusive

In [10]:
def is_inclusive (adj, adj2):
    for key in adj.keys ():
        if key not in adj2 [key]:
            print ('key {} is not found in adj [key]'.format (key))
            return False
        if not adj [key].issubset (adj2 [key]):
            print ('adj [{}] is not a subset of adj [key]'.format (key))
            return False
    return True
is_inclusive (adj, adj2)

True

In [3]:
len (stream)

2999997

In [14]:
%%time

k = 1
total = 0
for pair in stream:
    if batch.distance (*pair, n = 4) > -1:
        total += 1
    if k == 1000:
        break
    k += 1
    
print (total, end = '\n\n')    

934

CPU times: user 9.85 s, sys: 2.82 ms, total: 9.85 s
Wall time: 9.85 s


In [15]:
%%time

k = 1
total = 0
for pair in stream:
    if batch.distance_lt_n (*pair, n = 4) == True:
        total += 1
    if k == 1000:
        break
    k += 1
    
print (total, end = '\n\n')    

934

CPU times: user 5.76 s, sys: 49 µs, total: 5.76 s
Wall time: 5.76 s


In [17]:
%%time

# Check if distance & distance_lt_n give inconsistent answers

k = 1
total = 0
for pair in stream:
    lt4 = batch.distance (*pair, n = 4) > -1
    if batch.distance_lt_n (*pair, n = 4) != lt4:
        total += 1
        print (pair)
    if k == 1000:
        break
    k += 1
    
print ('Number of inconsistent pairs: ', total, end = '\n\n')    

Number of inconsistent pairs:  0

CPU times: user 15.5 s, sys: 52 µs, total: 15.5 s
Wall time: 15.5 s


In [22]:
elem = 52575
find = '1b' # elem
print (elem in batch.adj.keys ())
batch.next_degree_friends_sc ({elem}, find) # == next_degree_friends (batch0, {elem})

True


{8683, 52575, 75059}

# Building new distance functions

## deg 2 or less

In [57]:
def if_lte_deg2 (adj, adj2, pair):
    a, b = pair
    
    if a == b:
        return True
    elif a not in adj.keys () or b not in adj.keys ():
        return False
    elif a in adj [b]:
        return True
    else:
        return a in adj2 [b]

In [65]:
%%time

# Check if if_lte_deg2 & distance_lt_n (n=2) give inconsistent answers

k = 1
total = 0
for pair in stream:
    if if_lte_deg2 (adj, adj2, pair) != batch.distance_lt_n (*pair, n = 2):
        total += 1
        print (pair)
    if k == 100000:
        break
    k += 1
    
print ('Number of inconsistent pairs: ', total, end = '\n\n')    

Number of inconsistent pairs:  0

CPU times: user 5.54 s, sys: 0 ns, total: 5.54 s
Wall time: 5.54 s


In [38]:
%%time

k = 1
total = 0
for pair in stream:
    if batch.distance_lt_n (*pair, n = 2) == True:
        total += 1
    if k == 1000:
        break
    k += 1
    
print (total, end = '\n\n')    

787

CPU times: user 73.1 ms, sys: 130 µs, total: 73.3 ms
Wall time: 74.6 ms


In [45]:
%%time

k = 1
total = 0
for pair in stream:
    if if_lte_deg2 (adj, adj2, pair):
        total += 1
    if k == 1000:
        break
    k += 1
    
print (total, end = '\n\n')    

787

CPU times: user 1.67 ms, sys: 24 µs, total: 1.7 ms
Wall time: 1.68 ms


## deg 4 or less

In [20]:
def if_lte_deg4 (adj, adj2, pair):
    a, b = pair
    
    if a == b:     # deg 0
        return True
    elif a not in adj.keys () or b not in adj.keys ():  # >= one of (a, b) is not a node in graph 'adj'
        return False
    elif a in adj [b]:   # a & b are deg 1 apart
        return True
    elif a in adj2 [b]:  # a & b are deg 2 apart
        return True
    else:
        for key in adj2 [b]:
            if a in adj2 [key]:
                return True
        return False

In [22]:
%%time

# Check if if_lte_deg4 & distance_lt_n (n=4) give inconsistent answers

k = 1
total = 0
for pair in stream:
    if if_lte_deg4 (adj, adj2, pair) != batch.distance_lt_n (*pair, n = 4):
        total += 1
        print (pair)
        break
    if k == 10000:
        break
    k += 1
    
print ('Number of inconsistent pairs: ', total, end = '\n\n')    

Number of inconsistent pairs:  0

CPU times: user 47.2 s, sys: 3.86 ms, total: 47.2 s
Wall time: 47.2 s


In [17]:
%%time

k = 1
total = 0
for pair in stream:
    if batch.distance (*pair, n = 4) > -1:
        total += 1
    if k == 1000:
        break
    k += 1
    
print (total, end = '\n\n')    

934

CPU times: user 10.1 s, sys: 0 ns, total: 10.1 s
Wall time: 10.1 s


In [33]:
10 / 27e-3

370.3703703703704

In [15]:
%%time

k = 1
total = 0
for pair in stream:
    if batch.distance_lt_n (*pair, n = 4):
        total += 1
    if k == 1000:
        break
    k += 1
    
print (total, end = '\n\n')    

934

CPU times: user 5.82 s, sys: 0 ns, total: 5.82 s
Wall time: 5.82 s


In [32]:
%%time

k = 1
total = 0
for pair in stream:
    if if_lte_deg4 (adj, adj2, pair):
        total += 1
    if k == 1000:
        break
    k += 1
    
print (total, 'out of ', k, ', ', total * 100.0 / k, end = '%\n\n')

934 out of  1000 ,  93.4%

CPU times: user 27.5 ms, sys: 0 ns, total: 27.5 ms
Wall time: 27.6 ms


In [29]:
%%time

k = 1
total = 0
for pair in stream:
    if if_lte_deg4 (adj, adj2, pair):
        total += 1
    k += 1
    
print (total, 'out of ', k, end = '\n\n')

2775423 out of  2999998

CPU times: user 34.8 s, sys: 12 ms, total: 34.8 s
Wall time: 34.8 s


In [30]:
2775423 * 100.0 / 2999998

92.51416167610779