In [1]:
import numpy as np
from scipy.sparse import lil_matrix, dok_matrix
import re
import networkx as nx
from collections import OrderedDict, Counter, defaultdict
from statistics import mode
import csv

In [2]:
# Build the graph G from the edge list stored in 'network.tsv'.
# The helper functions below cast their node argument to str before querying G
# (presumably because node labels are kept as strings when read_edgelist is
# called without a nodetype — confirm against the networkx docs).
G = nx.read_edgelist('network.tsv')

In [3]:
def check_t8(access):
    """Return the T8 value from a tokenised label line, or -1 when it is absent.

    `access` is the list of digit runs extracted from one line of a labelled
    vertices file. A list of exactly five tokens is treated as carrying no T8
    entry; for any other length the token at index 6 is converted to int.
    """
    has_no_t8 = len(access) == 5
    return -1 if has_no_t8 else int(access[6])

In [5]:
# Map each training vertex ID (kept as the string read from the file) to a
# 3-tuple of ints: tokens 2 and 4 of the line, plus the T8 value from check_t8
# (-1 when the line has no T8 entry).
dictval = OrderedDict()

with open('labeled-vertices.train.tsv') as training:
    for i, line in enumerate(training):
        if i % 1000000 == 0:
            print(i)  # coarse progress indicator
        # Raw string: '\d' inside a plain literal is an invalid escape sequence
        # and raises a warning on current Python versions.
        # presumably a line looks like "<id> T0:<a> T1:<b> [T8:<c>]", so the
        # digit runs are [id, 0, a, 1, b, (8, c)] — confirm against the data file
        access = re.findall(r'\d+', line)
        dictval[access[0]] = (int(access[2]), int(access[4]), check_t8(access))

0
1000000
2000000
3000000
4000000
5000000


In [6]:
# Most frequent label tuple among the training vertices. naive() and
# jaccweight() return it as the fallback prediction when they find no labelled
# node near the query node.
mostcommon_val = Counter(dictval.values()).most_common(1)[0][0]

In [7]:
def check(node):
    """Print the training label of `node` and label tallies for its surroundings.

    Three things are printed, in order: the label tuple stored for the node in
    dictval, a Counter of the labels of its direct neighbours, and a Counter of
    the labels of nodes whose shortest-path distance from it is exactly 2.
    Nodes without an entry in dictval are left out of both tallies.

    Raises KeyError if `node` itself has no entry in dictval.
    """
    node = str(node)
    print('Identity: {}'.format(dictval[node]))

    # Labels of the immediate neighbours.
    neighbor_labels = Counter(
        dictval[nbr] for nbr in G.neighbors(node) if nbr in dictval
    )
    print('Neighbors')
    print(neighbor_labels)

    # Labels of the nodes sitting exactly two hops away.
    hop_counts = nx.single_source_shortest_path_length(G, node, cutoff=2)
    second_ring_labels = Counter(
        dictval[other]
        for other, hops in hop_counts.items()
        if hops == 2 and other in dictval
    )
    print('Ego')
    print(second_ring_labels)

In [None]:
# Inspect vertex 88: prints its label tuple and the label counts among its
# neighbours and among the nodes exactly two hops away.
check(88)

In [13]:
def naive(node):
    """Predict a label tuple for `node` by majority vote over nearby labelled nodes.

    Every labelled direct neighbour and every labelled node at shortest-path
    distance exactly 2 casts one vote for its own label tuple. The tuple with
    the most votes is returned; when nobody votes, mostcommon_val is returned.
    """
    node = str(node)
    votes = Counter()

    # Direct neighbours vote first, then the nodes two hops away.
    for nbr in G.neighbors(node):
        if nbr in dictval:
            votes[dictval[nbr]] += 1
    for other, hops in nx.single_source_shortest_path_length(G, node, cutoff=2).items():
        if hops == 2 and other in dictval:
            votes[dictval[other]] += 1

    winner = votes.most_common(1)
    return winner[0][0] if winner else mostcommon_val

In [8]:
def jaccweight(node):
    """Predict a label tuple for `node` by Jaccard-weighted voting.

    Each labelled node in the radius-2 ego graph of `node` (the node itself
    excluded) votes for its own label tuple with a weight equal to its Jaccard
    coefficient with `node`. The tuple with the largest summed weight is
    returned; when there are no voters, mostcommon_val is returned.
    """
    node = str(node)
    ego_nodes = nx.ego_graph(G, node, radius=2).nodes()
    candidate_pairs = [
        (node, other) for other in ego_nodes if other != node and other in dictval
    ]

    weight_by_label = defaultdict(int)
    for _, other, score in nx.jaccard_coefficient(G, candidate_pairs):
        weight_by_label[dictval[other]] += score

    if not weight_by_label:
        return mostcommon_val
    return max(weight_by_label, key=weight_by_label.get)

In [9]:
def t8write(combo):
    """Return the ' T8:<value>' output suffix for a label tuple.

    `combo[2]` holds the T8 value; the sentinel -1 means "no T8", in which case
    an empty string is returned so nothing is appended to the output line.
    """
    t8_value = combo[2]
    return '' if t8_value == -1 else ' T8:{}'.format(t8_value)

In [None]:
# Write the naive() prediction for every vertex listed in the test file as
# (id, attr) rows of naive_result.csv.
# newline='' is what the csv module requires of a file handed to csv.writer;
# without it, platforms that translate '\n' on write emit '\r\r\n' row endings.
with open('naive_result.csv', 'w', newline='') as csvfile:
    csvwrite = csv.writer(csvfile, delimiter=',')
    csvwrite.writerow(['id', 'attr'])

    with open('unlabeled-vertices.test.txt') as file:
        for x in file:
            clean = x.strip('\n')  # one vertex ID per line
            naive_result = naive(clean)
            # attr looks like "T0:<a> T1:<b>" with " T8:<c>" appended when T8 is present
            line = 'T0:{} T1:{}'.format(naive_result[0], naive_result[1]) + t8write(naive_result)
            csvwrite.writerow([clean, line])
            print(clean, naive_result)

In [None]:
# Write the jaccweight() prediction for every vertex listed in the test file as
# (id, attr) rows of jacc_result.csv.
# newline='' is what the csv module requires of a file handed to csv.writer;
# without it, platforms that translate '\n' on write emit '\r\r\n' row endings.
with open('jacc_result.csv', 'w', newline='') as csvfile:
    csvwrite = csv.writer(csvfile, delimiter=',')
    csvwrite.writerow(['id', 'attr'])

    with open('unlabeled-vertices.test.txt') as file:
        for i, x in enumerate(file):
            if i % 100000 == 0:
                print(i)  # coarse progress indicator
            clean = x.strip('\n')  # one vertex ID per line
            jacc_result = jaccweight(clean)
            # attr looks like "T0:<a> T1:<b>" with " T8:<c>" appended when T8 is present
            line = 'T0:{} T1:{}'.format(jacc_result[0], jacc_result[1]) + t8write(jacc_result)
            csvwrite.writerow([clean, line])

### Failed

In [None]:
# Inspect vertex 421294: prints its label tuple and the label counts among its
# neighbours and among the nodes exactly two hops away.
check(421294)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [None]:
# Base estimator: a 25-tree random forest with a fixed seed so fits are repeatable.
rf = RandomForestClassifier(n_estimators=25, random_state=385)
# Multi-output wrapper around the forest defined above. The original passed
# `forest`, a name that is never defined in this notebook (NameError).
multi_target_forest = MultiOutputClassifier(rf, n_jobs=-1)

In [None]:
# NOTE: X_train and y_train are assigned in cells that appear further down in
# this notebook, so those cells must be executed before this one.
rf_model = multi_target_forest.fit(X_train, y_train)
# Predictions are made on the same rows the model was fitted on, so they show
# training fit only, not performance on unseen vertices.
predictions = rf_model.predict(X_train)

predictions

In [None]:
# Label matrix: one row per training vertex, in dictval's insertion order; the
# three columns hold the three entries of the vertex's label tuple
# (the last one is -1 when the vertex has no T8 value).
# The row count comes from dictval instead of a hard-coded 5301403, so the
# matrix always matches the data that was actually loaded.
# (Feature columns are built separately in the X_train cell.)
y_train = dok_matrix((len(dictval), 3))

for i, labels in enumerate(dictval.values()):
    if i % 100000 == 0:
        print(i)  # coarse progress indicator

    for col, value in enumerate(labels):
        y_train[i, col] = value

In [None]:
# Feature matrix: one row per training vertex, in dictval's insertion order.
#   col 0: degree of the vertex in G
#   col 1: number of nodes in its radius-2 ego graph
#   col 2: all_combos code of the label tuple with the largest summed Jaccard
#          coefficient among labelled nodes of that ego graph
#   col 3: that label's share of the total Jaccard weight
# The row count comes from dictval instead of a hard-coded 5301403.
# NOTE(review): all_combos is not defined anywhere in the visible notebook;
# presumably it maps a label tuple to an integer code — define it before
# running this cell.
X_train = dok_matrix((len(dictval), 4))
for i, x in enumerate(dictval):
    if i % 100000 == 0:
        print(i)  # coarse progress indicator
    X_train[i, 0] = G.degree(x)

    ego = nx.ego_graph(G, x, radius=2)
    X_train[i, 1] = ego.number_of_nodes()

    a = nx.jaccard_coefficient(
        G, [(x, node) for node in ego.nodes() if node != x and node in dictval]
    )
    jacmeasure = Counter()
    jactotal = 0
    for b, c, d in a:
        jacmeasure[dictval[c]] += d
        jactotal += d

    if not jacmeasure:
        # No labelled vertex within two hops: max() on the empty Counter used
        # to raise ValueError. Fall back to the globally most common label, as
        # jaccweight() does, and leave the share column at its default 0.
        X_train[i, 2] = all_combos[mostcommon_val]
        continue

    best_label = max(jacmeasure, key=jacmeasure.get)
    X_train[i, 2] = all_combos[best_label]
    # Every coefficient can be 0 (no shared neighbours); dividing by the zero
    # total used to raise ZeroDivisionError, so the share stays 0 in that case.
    if jactotal:
        X_train[i, 3] = jacmeasure[best_label] / jactotal

#### Error analysis

In [11]:
# Map each dev vertex ID (kept as the string read from the file) to a 3-tuple
# of ints: tokens 2 and 4 of the line, plus the T8 value from check_t8
# (-1 when the line has no T8 entry).
devval = OrderedDict()

with open('labeled-vertices.dev.tsv') as dev:
    for i, line in enumerate(dev):
        if i % 100000 == 0:
            print(i)  # coarse progress indicator
        # Raw string: '\d' inside a plain literal is an invalid escape sequence
        # and raises a warning on current Python versions.
        # presumably a line looks like "<id> T0:<a> T1:<b> [T8:<c>]", so the
        # digit runs are [id, 0, a, 1, b, (8, c)] — confirm against the data file
        access = re.findall(r'\d+', line)
        devval[access[0]] = (int(access[2]), int(access[4]), check_t8(access))
    

0
100000
200000
300000
400000
500000
600000


In [None]:
# Count, per label field, how many dev vertices the naive() predictor gets wrong.
errors = {'t0': 0, 't1': 0, 't8': 0}

# enumerate supplies the progress counter; the original read an `i` that this
# cell never initialised, so it depended on whichever cell had run before.
for i, (x, y) in enumerate(devval.items()):
    if i % 100000 == 0:
        print(i)  # coarse progress indicator
    pred = naive(x)
    # pred and y are both 3-tuples; compare them position by position.
    for field, predicted, actual in zip(('t0', 't1', 't8'), pred, y):
        if predicted != actual:
            errors[field] += 1