In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import random

df = pd.read_csv("Analysis/GoldCoast_network.csv")
df_edges = df[["From","To", "Capacity"]]
df_edges



Unnamed: 0,From,To,Capacity
0,1,1371,900.0
1,2,2012,1600.0
2,3,2402,1100.0
3,4,1875,800.0
4,5,1880,800.0
...,...,...,...
11135,4806,1495,600.0
11136,4806,3606,600.0
11137,4806,415,1800.0
11138,4807,1433,400.0


In [51]:
G = nx.from_pandas_edgelist(
    df,
    source="From",
    target="To",
    edge_attr=["Capacity"],
    create_using=nx.DiGraph()
)
Nodes = G.number_of_nodes()
Edges = G.number_of_edges()
Nodes, Edges

(4783, 11140)

In [63]:
cor = pd.read_csv('nodes.csv')

In [64]:
def get_candidate_pairs_directed(G, max_distance=2):
    """
    Generate candidate directed links (u -> v) such that:
    - u != v
    - (u, v) is NOT already an existing edge
    - v is reachable from u within <= max_distance steps
    """
    candidate_pairs = set()  # use set to avoid duplicates
    
    for u in G.nodes():
        # BFS depth-limited search for successors
        visited = {u}
        frontier = [(u, 0)]  # (node, distance)
        
        while frontier:
            current, dist = frontier.pop(0)
            
            if dist == max_distance:
                continue
            
            for nxt in G.successors(current):
                if nxt not in visited:
                    visited.add(nxt)
                    frontier.append((nxt, dist + 1))
                    
                    # Add candidate if not direct edge and not same node
                    if nxt != u and not G.has_edge(u, nxt):
                        candidate_pairs.add((u, nxt))
    
    return list(candidate_pairs)

def create_labeled_dataset_directed(G, candidate_pairs, negative_ratio=1.0):
    """
    Create labeled dataset for directed graphs.
    """
    # Positive examples: existing directed edges
    positive_examples = []
    for u, v in G.edges():
        positive_examples.append((u, v, 1))  # Label 1 for positive
    
    # Negative examples: sample from candidate pairs
    n_negative = int(len(positive_examples) * negative_ratio)
    negative_samples = random.sample(candidate_pairs, min(n_negative, len(candidate_pairs)))
    
    negative_examples = []
    for u, v in negative_samples:
        negative_examples.append((u, v, 0))  # Label 0 for negative
    
    # Combine and shuffle
    all_examples = positive_examples + negative_examples
    random.shuffle(all_examples)
    
    return all_examples, positive_examples, negative_examples

def common_predecessors_f(G, u, v):
    preds_u = set(G.predecessors(u))
    preds_v = set(G.predecessors(v))
    return preds_u & preds_v

def common_successors_f(G, u, v):
    succ_u = set(G.successors(u))
    succ_v = set(G.successors(v))
    return succ_u & succ_v


def extract_features_directed(G, node_pairs):
    features = []
    
    for u, v, label in node_pairs:
        preds = common_predecessors_f(G, u, v)
        succs = common_successors_f(G, u, v)

        feature_dict = {
            'node_u': u,
            'node_v': v,
            'label': label,
            'common_predecessors': len(preds),
            'common_successors': len(succs),
            'out_in_degree_product': G.out_degree(u) * G.in_degree(v),
        }

        # Directional similarity
        feature_dict['directional_adamic_adar'] = directed_adamic_adar(G, u, v)
        feature_dict['directional_resource_allocation'] = directed_resource_allocation(G, u, v)

        feature_dict['euclidian distance'] = extract_euclid_dist(cor, u, v)
        # Capacity features
        feature_dict.update(extract_capacity_features_directed(G, u, v))
        
        features.append(feature_dict)

    return pd.DataFrame(features)

def extract_euclid_dist(cor, u, v):
    return np.sqrt((cor.iloc[u - 1]['x'] - cor.iloc[v - 1]['x'])**2 + (cor.iloc[u - 1]['y'] - cor.iloc[v - 1]['y'])**2)

def directed_adamic_adar(G, u, v):
    """Adamic-Adar for directed graphs - common successors"""
    common_successors = common_successors_f(G, u, v)
    score = 0
    for w in common_successors:
        # Use in-degree in denominator (how many nodes point to w)
        in_deg = G.in_degree(w)
        if in_deg > 1:  # Avoid division by zero and log(0)
            score += 1 / np.log(in_deg)
    return score

def directed_resource_allocation(G, u, v):
    """Resource Allocation for directed graphs - common successors"""
    common_successors = common_successors_f(G, u, v)
    score = 0
    for w in common_successors:
        in_deg = G.in_degree(w)
        if in_deg > 0:
            score += 1 / in_deg
    return score

def extract_capacity_features_directed(G, u, v):
    """Extract simplified capacity-related features for directed graph (no path capacities)."""
    cap_features = {}
    
    # Capacity around nodes (considering direction)
    u_out_caps = [G[u][n].get('Capacity', 0) for n in G.successors(u)]
    u_in_caps  = [G[n][u].get('Capacity', 0) for n in G.predecessors(u)]

    v_out_caps = [G[v][n].get('Capacity', 0) for n in G.successors(v)]
    v_in_caps  = [G[n][v].get('Capacity', 0) for n in G.predecessors(v)]
    
    cap_features.update({
        'avg_out_capacity_u': np.mean(u_out_caps) if u_out_caps else 0,
        'avg_in_capacity_u':  np.mean(u_in_caps)  if u_in_caps else 0,
        'avg_out_capacity_v': np.mean(v_out_caps) if v_out_caps else 0,
        'avg_in_capacity_v':  np.mean(v_in_caps)  if v_in_caps else 0,
        'min_out_capacity_u': min(u_out_caps) if u_out_caps else 0,
        'min_in_capacity_v': min(v_in_caps) if v_in_caps else 0,
    })

    # Replace NaNs with 0
    for k in cap_features:
        if np.isnan(cap_features[k]):
            cap_features[k] = 0

    return cap_features

In [None]:
# Step 1: Get candidate pairs for directed graph
candidate_pairs = get_candidate_pairs_directed(G, max_distance=2)
print(f"Found {len(candidate_pairs)} candidate directed node pairs")

# Step 2: Create labeled dataset
labeled_data, positives, negatives = create_labeled_dataset_directed(G, candidate_pairs)
print(f"Dataset: {len(positives)} positive, {len(negatives)} negative examples")

# Step 3: Extract features for directed graph
feature_df = extract_features_directed(G, labeled_data)
print("Feature matrix shape:", feature_df.shape)
print("Features:", feature_df.columns.tolist())

# Step 4: Train model (same as before)
feature_columns = [col for col in feature_df.columns if col not in ['node_u', 'node_v', 'label']]
X = feature_df[feature_columns]
y = feature_df['label']

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Predict on new candidates
def predict_new_links_directed(G, trained_model, feature_columns, max_distance=2):
    """Predict the best new directed links"""
    all_candidates = get_candidate_pairs_directed(G, max_distance=max_distance)
    candidate_features = extract_features_directed(G, [(u, v, -1) for u, v in all_candidates])
    
    X_candidates = candidate_features[feature_columns]
    probabilities = trained_model.predict_proba(X_candidates)[:, 1]
    
    results = pd.DataFrame({
        'from_node': [p[0] for p in all_candidates],
        'to_node': [p[1] for p in all_candidates],
        'probability': probabilities
    })
    
    return results.sort_values('probability', ascending=False)

# Get your final predictions for directed links!
best_new_links = predict_new_links_directed(G, rf_model, feature_columns)
print("Top 20 recommended new directed roads:")
print(best_new_links.head(20))

Found 18649 candidate directed node pairs
Dataset: 11140 positive, 11140 negative examples
Feature matrix shape: (22280, 15)
Features: ['node_u', 'node_v', 'label', 'common_predecessors', 'common_successors', 'out_in_degree_product', 'directional_adamic_adar', 'directional_resource_allocation', 'euclidian distance', 'avg_out_capacity_u', 'avg_in_capacity_u', 'avg_out_capacity_v', 'avg_in_capacity_v', 'min_out_capacity_u', 'min_in_capacity_v']
Top 10 recommended new directed roads:
       from_node  to_node  probability
17612       1088     1098         1.00
17802       3398     1729         1.00
9311        4513     1357         1.00
2392        1096     1767         1.00
1862        4064     4060         1.00
7925        2338     1714         1.00
11814       3822     2844         1.00
499         2134     2420         1.00
8722        1770     2106         1.00
3717        2134     1291         1.00
7764        2070     2328         0.99
16724       4414     4411         0.99
3002   

In [60]:
feature_df

Unnamed: 0,node_u,node_v,label,common_predecessors,common_successors,out_in_degree_product,directional_adamic_adar,directional_resource_allocation,euclidian distance,avg_out_capacity_u,avg_in_capacity_u,avg_out_capacity_v,avg_in_capacity_v,min_out_capacity_u,min_in_capacity_v
0,3351,3350,1,0,0,9,0.000000,0.000000,0.004926,600.000000,600.000000,866.666667,866.666667,400.0,400.0
1,3476,3148,1,0,0,9,0.000000,0.000000,0.001559,666.666667,666.666667,666.666667,666.666667,400.0,400.0
2,906,3630,1,0,0,3,0.000000,0.000000,0.001609,1800.000000,1800.000000,866.666667,866.666667,1800.0,400.0
3,178,181,0,1,1,1,0.721348,0.250000,0.004309,1800.000000,1800.000000,1800.000000,1800.000000,1800.0,1800.0
4,3588,969,0,1,1,3,0.721348,0.250000,0.004702,666.666667,666.666667,1800.000000,1800.000000,400.0,1800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22275,2763,1334,1,0,0,6,0.000000,0.000000,0.002531,866.666667,866.666667,400.000000,400.000000,400.0,400.0
22276,1274,1271,0,1,1,9,0.910239,0.333333,0.003746,866.666667,866.666667,866.666667,866.666667,400.0,400.0
22277,2297,1443,1,0,0,9,0.000000,0.000000,0.002184,866.666667,866.666667,400.000000,400.000000,400.0,400.0
22278,4315,4312,0,1,1,4,0.721348,0.250000,0.001302,900.000000,900.000000,1350.000000,900.000000,900.0,900.0
