In [21]:
import json
import pandas as pd
import numpy as np

In [None]:

with open("../data/MC1_preprocessed.json", "r") as f:
    data = json.load(f)

df = pd.DataFrame(data['nodes'])

In [13]:
df.head()

Unnamed: 0,type,dataset,country,id
0,company,MC1,Nalakond,spanish shrimp carriers|0
1,organization,MC1,,12744|1
2,organization,MC1,,143129355|2
3,organization,MC1,,7775|3
4,organization,MC1,,1017141|4


## Entity types

In [35]:
for type in df['type'].unique():
    if not pd.isna(type):
        print(f"{type}")
        print(df[df['type']==type]['id'][:5].tolist())
        print()

company
['spanish shrimp  carriers|0', 'faroe islands shrimp shark|7', 'saltwater solitude gmbh & co. kg|24', 'viento azul harbor llc investment|115', 'turkish sword buoy incorporated forwading|183']

organization
['12744|1', '143129355|2', '7775|3', '1017141|4', '2591586|5']

person
['william bradley|13', 'joseph griffin|14', 'jacob caldwell|21', 'ann daniels|31', 'stephanie mccormick md|32']

location
['4369922c-ce1d-40ff-904c-1f6a3000bf6b|23', 'da0a7bbb-5bd1-42aa-bcc0-b8923d5c15d4|36', '445a6124-4650-4e5a-83bb-a0e0b4040ee9|41', '77b46bc7-7e31-41a0-973c-e633011d21b0|60', 'd0123474-8425-4ab0-90d5-175920158638|92']

political_organization
['6406040|63', '72371354|67', '766649|69', '85117949|76', '348079|78']

vessel
['marine|102', 'viking fishing|116', '77|1693', 'uss hershel|193', 'uscgc spencer|200']

movement
['hours|123', 'months|227', 'weeks|390', 'conjunction|554', 'normal|712']

event
['act|212', 'months years|223', 'cookies|237', '|278', 'arts|283']



## Entities with missing type

In [38]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df[df['type'].isna()]['id'])

10                                              victim|10
11                                         perpetrator|11
15                                                    |15
25                                                 prk|25
26                                              viking|26
29                                             regular|29
81                                                   8|81
82                                               refer|82
83                                                  10|83
87                                             notices|87
103                                            255273|103
141                                                st|141
166                                     armed robbery|166
173                                       unregulated|173
206                                              sept|206
222                                        convention|222
225                                            months|225
284           

## Add type 'unknown' for nodes without type

In [41]:
for node in data['nodes']:
    node['type'] = node.get('type', 'unknown')

## Illgality classification

### Create feature vectors

Create a vector for each node. The vector should contain:
* the node category, as a one-hot vector
* for each edge type, the weighted fraction (outgoing edges of that type / all outgoing edges)
* for each edge type, the weighted fraction (incoming edges of that type / all incoming edges)
* the number of incoming edges
* the number of outgoing edges
* the betweenness centrality of the node

In [145]:
import json
import networkx as nx
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Load your data
# data = json.loads(your_json_string)

# Convert data to the format required by networkx
G = nx.MultiDiGraph()
for node in data['nodes']:
    G.add_node(node['id'], attr_dict=node)
for edge in data['edges']:
    G.add_edge(edge['source'], edge['target'], attr_dict=edge)
    
    
# Remove nodes from disconnected components
import operator
## Compute all weakly connected components
components = list(nx.weakly_connected_components(G))
## Find the largest component
largest_component = max(components, key=len)
## Create a new graph that only contains the largest component
G = G.subgraph(largest_component).copy()

# Define categories for one-hot encoding
categories = np.array(sorted(list(set([node['type'] for node in data['nodes'] if node['type'] != 'unknown'])))).reshape(-1, 1)
one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder.fit(categories)

betweenness_centralities = nx.betweenness_centrality(G)
pagerank_scores = nx.pagerank(G)

# Compute feature vectors
feature_vectors = dict()
for node in G.nodes(data=True):
    feature_vector = []
    
    if node[1]['attr_dict']['type'] == 'unknown':
        feature_vector.extend(np.array([1/len(categories)] * len(categories)))
    else:
        ohv = one_hot_encoder.transform([[node[1]['attr_dict']['type']]])
        feature_vector.extend(ohv[0])
        
    # Weighted fractions of outgoing and incoming edges
    for edge_type in ['family_relationship', 'partnership', 'membership', 'ownership']: # Update this list with your edge types
        outgoing_edges = [edge[2]['attr_dict']['weight'] for edge in G.out_edges(node[0], data=True) if edge[2]['attr_dict']['type'] == edge_type]
        incoming_edges = [edge[2]['attr_dict']['weight'] for edge in G.in_edges(node[0], data=True) if edge[2]['attr_dict']['type'] == edge_type]
        feature_vector.append(sum(outgoing_edges) / sum([edge[2]['attr_dict']['weight'] for edge in G.out_edges(node[0], data=True)]) if G.out_edges(node[0], data=True) else 0)
        feature_vector.append(sum(incoming_edges) / sum([edge[2]['attr_dict']['weight'] for edge in G.in_edges(node[0], data=True)]) if G.in_edges(node[0], data=True) else 0)
    
    # Number of incoming and outgoing edges
    feature_vector.append(G.in_degree(node[0]))
    feature_vector.append(G.out_degree(node[0]))
    feature_vector.append(betweenness_centralities[node[0]])
    feature_vector.append(pagerank_scores[node[0]])
    feature_vectors[node[0]] = feature_vector


### Fit model

In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# List of illegal node IDs
illegal_nodes = [
  "armed robbery|166",
  "game thief|1371",
  "illegal|2568",
  "deepwater horizon|2680",
  "shabu|2987",
  "shark fin|3043",
  "illegal|3155",
  "cartel emergent weaponry use|3207",
  "dark web vendor illegal narcotics|3328",
  "heroin cocaine exchange bitcoin|3329",
  "officer pleads guilty|3368",
  "bribes exchange smuggling contraband|3369"
]

# Create a list of (node_id, label) pairs
node_labels = [(node_id, 1 if node_id in illegal_nodes else 0) for node_id in feature_vectors.keys()]

# Sort the list by node_id to ensure consistent order
node_labels.sort(key=lambda x: x[0])

# Separate the node IDs and labels into two lists
node_ids, labels = zip(*node_labels)

# Use the same order to generate the feature vectors
feature_vector_list = [feature_vectors[node_id] for node_id in node_ids]


# List of feature descriptions
feature_descriptions = ['Type: ' + cat for cat in categories[:,0]] + \
                       ['Outgoing fraction: family_relationship', 'Incoming fraction: family_relationship',
                        'Outgoing fraction: partnership', 'Incoming fraction: partnership',
                        'Number of incoming edges', 'Number of outgoing edges', 
                        'Betweenness centrality', 'Pagerank'] 


In [148]:

# Scale feature vectors
scaler = StandardScaler()
feature_vector_list_scaled = scaler.fit_transform(feature_vector_list)

# Define logistic regression model with L1 regularization
model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, C=1.0)

# Fit the model
model.fit(feature_vector_list_scaled, labels)

# Get the coefficients
coefs = model.coef_[0]
intercept = model.intercept_

# Print the intercept and coefficients
print("Intercept: ", intercept)
for feature_description, coef in zip(feature_descriptions, coefs):
    print(f"{feature_description}: {coef}")


Intercept:  [-6.30916357]
Type: company: 0.23971944192235503
Type: event: 0.21208654936861596
Type: location: 0.0
Type: movement: 0.24814192753586592
Type: organization: -0.17143348973739228
Type: person: -0.023198581533754228
Type: political_organization: 0.22017107649605233
Type: vessel: 0.30851775762546485
Outgoing fraction: family_relationship: -0.4628879244447966
Incoming fraction: family_relationship: -0.5180756557070315
Outgoing fraction: partnership: 0.28570803263470756
Incoming fraction: partnership: 0.0
Number of incoming edges: -0.24142438840671115
Number of outgoing edges: 0.44937987540996455
Betweenness centrality: -0.288728108009589
Pagerank: 0.0


### Predict illegality on all training data and export it into a json dictionary.

In [157]:
# Predict the probabilities
probabilities = model.predict_proba(feature_vector_list_scaled)[:, 1]

# Create a dictionary that maps each node id to its probability
id_to_probability = {id: probability for id, probability in zip(feature_vectors.keys(), probabilities)}
# Sort the dictionary by probability, highest first
sorted_id_to_probability = sorted(id_to_probability.items(), key=lambda item: item[1], reverse=True)

# Export the sorted dictionary to a json file
with open('node_id_to_probability.json', 'w') as f:
    json.dump(dict(sorted_id_to_probability), f, indent=1)