In [None]:
import pandas as pd
import networkx as nx
import pickle
import glob
import os

In [None]:
# Read ASG Graph
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load graph_IOX.pkl when it comes from a trusted source.
with open("graph_IOX.pkl", "rb") as f:
    G = pickle.load(f)

In [None]:
# Location of the full tweet export. Both values are blank placeholders here
# and have to be filled in before this cell can read a file.
path = ''
data_file_name = ''
# dtype='str' loads every column as text.
df_all_tweet = pd.read_csv(path + data_file_name, dtype = 'str', lineterminator = '\n')

In [None]:
# Folder holding one CSV per country; blank placeholder, set before running.
folder_path = ''
# glob returns matches in arbitrary, OS-dependent order; sort so the row order
# of the concatenated frame is the same on every run.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not csv_files:
    # pd.concat([]) would otherwise fail with an unhelpful
    # "No objects to concatenate" message.
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

df_list = []
for file in csv_files:
    df = pd.read_csv(file)
    # Remember which file each row came from; a later cell derives the
    # Country column from this name.
    df['source_file'] = os.path.basename(file)
    df_list.append(df)

df_foreign_sus = pd.concat(df_list, ignore_index=True)

In [None]:
# Flag retweets (text starting with 'RT @') as 1, everything else as 0.
# Vectorised string test instead of a row-wise apply; na=False makes missing
# or non-string text count as "not a retweet", matching the isinstance guard
# this replaces.
df_all_tweet['is_Retweet'] = (
    df_all_tweet['text'].str.startswith('RT @', na=False).astype(int)
)

In [None]:
len(df_all_tweet.loc[(df_all_tweet.is_Retweet==0) & (df_all_tweet.Text_Lang == 'hi')].User_ID.unique())

In [None]:
df_india = df_all_tweet.loc[(df_all_tweet.is_Retweet==0) & (df_all_tweet.Text_Lang == 'hi')][['User_ID']]

In [None]:
df_india = df_india.drop_duplicates().reset_index(drop = True)

In [None]:
df_india['Country'] = 'India'

In [None]:
# NOTE(review): this filters on 'Country', but the notebook only assigns that
# column in the next cell (from source_file). Unless the input CSVs already
# carry a Country column, this cell fails on Restart & Run All.
# .sample(10) also raises when fewer than 10 rows match.
df_foreign_sus.loc[df_foreign_sus.Country != 'Spanish'].sample(10)

In [None]:
df_foreign_sus['Country'] = df_foreign_sus['source_file'].str.replace('_ID.csv', '', regex=False)

In [None]:
# Number of original (non-retweet) Hindi tweets per user. Selecting Tweet_ID
# before counting avoids counting every other column only to discard it.
df_india_count = (
    df_all_tweet[(df_all_tweet.is_Retweet == 0) & (df_all_tweet.Text_Lang == 'hi')]
    .groupby('User_ID')['Tweet_ID']
    .count()
    .reset_index()
)

In [None]:
# Users with more than one original Hindi tweet. The explicit copy makes this
# an independent frame, so adding the Country column in the next cell does not
# write into a slice of df_india_count (SettingWithCopyWarning).
df_india = df_india_count.loc[df_india_count.Tweet_ID > 1].copy()

In [None]:
df_india['Country'] = 'India'

In [None]:
df_foreign_sus = pd.concat([df_foreign_sus[['User_ID', 'Country']], df_india[['User_ID', 'Country']]])

In [None]:
df_foreign_sus.to_csv('Foreign_Supspected.csv', index = False)

In [None]:
df_foreign_sus = df_foreign_sus.reset_index(drop = True)

In [None]:
# Cast explicitly to 64-bit: plain 'int' follows the platform default, which
# is 32-bit on Windows with NumPy < 2 and too narrow for 64-bit account IDs.
df_foreign_sus['User_ID'] = df_foreign_sus['User_ID'].astype('int64')

In [None]:
G_copy = G.copy()

In [None]:
import networkx as nx
import numpy as np

def run_lbp(G, foreign_accounts, max_iter=10, alpha=0.8):
    """Spread a "foreign" score from seed accounts over the graph.

    This is an iterative neighbour-averaging scheme rather than message
    passing: seeds are clamped to 1.0, every other node starts at 0.0 and on
    each sweep moves towards the unweighted mean of its neighbours' scores.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(node)``.
        foreign_accounts: iterable of seed node ids (list, set, generator...).
            Seeds that are not nodes of ``G`` are ignored.
        max_iter: number of synchronous sweeps.
        alpha: weight of the neighbour mean; ``1 - alpha`` is kept from the
            node's previous score.

    Returns:
        dict mapping every node of ``G`` to its score.
    """
    # Materialise the seeds once: membership tests become O(1) and a one-shot
    # iterable is not exhausted before the update loop needs it.
    seeds = set(foreign_accounts)
    nodes = list(G.nodes())

    # Keys are exactly the graph's nodes, whatever max_iter is.
    belief = {node: 1.0 if node in seeds else 0.0 for node in nodes}

    for _ in range(max_iter):
        new_belief = {}
        for node in nodes:
            if node in seeds:
                new_belief[node] = 1.0
                continue

            neighbor_beliefs = [belief[neigh] for neigh in G.neighbors(node)]
            if neighbor_beliefs:
                mean_belief = np.mean(neighbor_beliefs)
                new_belief[node] = alpha * mean_belief + (1 - alpha) * belief[node]
            else:
                # Isolated node: nothing to average, keep the previous score.
                new_belief[node] = belief[node]

        # Synchronous update: all nodes read the previous sweep's scores.
        belief = new_belief

    return belief


In [None]:
import networkx as nx
import numpy as np

def run_lbp_with_edge_potentials(G, foreign_accounts, max_iter=10, alpha=0.8):
    """Spread a "foreign" score over the graph using edge-weighted averaging.

    Seeds are clamped to 1.0 and every other node starts at 0.5. On each sweep
    a non-seed node moves towards the average of its neighbours' scores,
    weighted by the edge weight divided by the largest weight in the graph.

    Args:
        G: graph exposing ``nodes()``, ``edges(data=True)`` and
            ``neighbors(node)``; edges without a ``'weight'`` count as 1.0.
        foreign_accounts: iterable of seed node ids (list, set, generator...).
        max_iter: number of synchronous sweeps.
        alpha: weight of the neighbour average; ``1 - alpha`` is kept from
            the node's previous score.

    Returns:
        dict mapping every node of ``G`` to its score.
    """
    # Materialise the seeds once: O(1) membership tests, and a one-shot
    # iterable is not exhausted before the update loop needs it.
    seeds = set(foreign_accounts)
    nodes = list(G.nodes())
    max_weight = max((d.get('weight', 1.0) for u, v, d in G.edges(data=True)), default=1.0)

    # Store each potential under both orientations so the lookup below works
    # whichever endpoint is the current node.
    edge_potentials = {}
    for u, v, d in G.edges(data=True):
        w = d.get('weight', 1.0)
        potential = w / max_weight if max_weight > 0 else 0.0
        edge_potentials[(u, v)] = potential
        edge_potentials[(v, u)] = potential

    belief = {node: 1.0 if node in seeds else 0.5 for node in nodes}

    for _ in range(max_iter):
        new_belief = {}
        for node in nodes:
            if node in seeds:
                new_belief[node] = 1.0
                continue

            neighbor_beliefs = []
            total_potential = 0.0
            for neighbor in G.neighbors(node):
                potential = edge_potentials.get((node, neighbor), 1.0)
                neighbor_beliefs.append(potential * belief[neighbor])
                total_potential += potential

            if total_potential > 0:
                weighted_avg = sum(neighbor_beliefs) / total_potential
                new_belief[node] = alpha * weighted_avg + (1 - alpha) * belief[node]
            else:
                # No neighbours, or only zero-potential edges: keep the score.
                new_belief[node] = belief[node]

        # Synchronous update: all nodes read the previous sweep's scores.
        belief = new_belief
    return belief


In [None]:
# One propagation run per seed country using the edge-weighted variant; every
# non-seed node is recorded with its score, highest score first.
results3 = []

for country in df_foreign_sus['Country'].unique():
    print(country)
    seed_accounts = set(df_foreign_sus[df_foreign_sus['Country'] == country]['User_ID'])
    print(len(seed_accounts))

    belief_scores3 = run_lbp_with_edge_potentials(G_copy, seed_accounts)

    # Drop the seeds themselves, then order by descending score.
    likely_foreign = [
        (node, score)
        for node, score in belief_scores3.items()
        if node not in seed_accounts
    ]
    likely_foreign.sort(key=lambda pair: pair[1], reverse=True)

    results3.extend(
        {'Country': country, 'Inferred_User_ID': node, 'Belief_Score': score}
        for node, score in likely_foreign
    )

df_inferred_foreign3 = pd.DataFrame(results3)

In [None]:
df_inferred_foreign3.loc[df_inferred_foreign3.Belief_Score> 0.6].groupby('Country').count()

In [None]:
for i in [0.5, 0.6, 0.7, 0.8, 0.9]:
    print(len(df_inferred_foreign3.loc[df_inferred_foreign3.Belief_Score> i].Inferred_User_ID.unique()))

In [None]:
len(df_inferred_foreign3.loc[df_inferred_foreign3.Belief_Score> 0.7].Inferred_User_ID.unique())

In [None]:
# NOTE(review): df_inferred_foreign is only created by a later cell (the
# run_lbp loop), so this cell fails on Restart & Run All. The same expression
# appears again after that loop.
df_inferred_foreign.loc[df_inferred_foreign.Belief_Score> 0.5].groupby('Country').count()

In [None]:
import networkx as nx
import numpy as np

def run_lbp2(G, foreign_accounts, max_iter=10, alpha=0.8):
    """Spread a "foreign" score over the graph using raw edge weights.

    Seeds are clamped to 1.0 and every other node starts at 0.0. On each sweep
    a non-seed node moves towards the average of its neighbours' scores,
    weighted by the ``'weight'`` attribute of the connecting edge (1.0 when
    the attribute is missing).

    Args:
        G: graph exposing ``nodes()``, ``neighbors(node)`` and
            ``G[node][neighbor]`` edge-attribute dicts.
        foreign_accounts: iterable of seed node ids (list, set, generator...).
            Seeds that are not nodes of ``G`` are ignored.
        max_iter: number of synchronous sweeps.
        alpha: weight of the neighbour average; ``1 - alpha`` is kept from
            the node's previous score.

    Returns:
        dict mapping every node of ``G`` to its score.
    """
    # Materialise the seeds once: O(1) membership tests, and a one-shot
    # iterable is not exhausted before the update loop needs it.
    seeds = set(foreign_accounts)
    nodes = list(G.nodes())

    # Keys are exactly the graph's nodes, whatever max_iter is.
    belief = {node: 1.0 if node in seeds else 0.0 for node in nodes}

    for _ in range(max_iter):
        new_belief = {}
        for node in nodes:
            if node in seeds:
                new_belief[node] = 1.0
                continue

            neighbors = list(G.neighbors(node))
            total_weight = sum(G[node][neigh].get('weight', 1.0) for neigh in neighbors)
            if neighbors and total_weight > 0:
                weighted_belief = sum(
                    G[node][neigh].get('weight', 1.0) * belief[neigh]
                    for neigh in neighbors
                ) / total_weight
                new_belief[node] = alpha * weighted_belief + (1 - alpha) * belief[node]
            else:
                # Isolated node or non-positive total weight: keep the score.
                new_belief[node] = belief[node]

        # Synchronous update: all nodes read the previous sweep's scores.
        belief = new_belief

    return belief


In [None]:
def run_true_lbp(G, foreign_accounts, max_iter=10, damping=0.5, base_strength=1.0):
    """Estimate a per-node "foreign" probability with damped sum-product message passing.

    Every node has two states; index 1 is the "foreign" state whose belief is
    returned. Seed nodes get the prior ``[0.001, 0.999]``, all other nodes
    ``[0.9, 0.1]``. Each edge carries a symmetric 2x2 potential that favours
    equal states, with a strength that grows with the log of the edge weight
    and is capped at 10. Messages are updated synchronously for ``max_iter``
    rounds and blended with the previous round via ``damping``.

    Args:
        G: graph exposing ``nodes()``, ``edges()``, ``neighbors(node)`` and
            ``G[i][j]`` edge-attribute dicts (``'weight'`` defaults to 1.0).
        foreign_accounts: iterable of seed node ids.
        max_iter: number of message-passing rounds.
        damping: share of the freshly computed message in each update; the
            remainder comes from the previous round's message.
        base_strength: multiplier applied to the edge strength before the cap.

    Returns:
        dict mapping every node of ``G`` to the belief of state 1.
    """
    from collections import defaultdict

    seeds = set(foreign_accounts)
    nodes = list(G.nodes())
    num_states = 2

    def uniform():
        return np.ones(num_states) / num_states

    node_potentials = {}
    for node in nodes:
        if node in seeds:
            node_potentials[node] = np.array([0.001, 0.999])
        else:
            node_potentials[node] = np.array([0.9, 0.1])

    def edge_potential_from_weight(weight, base_strength=1.0):
        # strength is capped at 10, so exp(-strength) never drops below
        # exp(-10); no extra floor on the off-diagonal value is needed.
        strength = min(base_strength * (1 + np.log(1 + weight)), 10.0)
        high_val = np.exp(strength)
        low_val = np.exp(-strength)
        pot = np.array([
            [high_val, low_val],
            [low_val, high_val]
        ])
        return pot / np.sum(pot)

    # The potential depends only on the weight, so compute it once per
    # distinct weight instead of once per edge per iteration.
    pot_cache = {}

    def potential_for(weight):
        pot = pot_cache.get(weight)
        if pot is None:
            pot = pot_cache[weight] = edge_potential_from_weight(weight, base_strength)
        return pot

    def combine(node, senders, messages):
        """Node potential times the messages arriving at `node` from `senders`."""
        prod = node_potentials[node].copy()
        for sender in senders:
            prod *= messages[(sender, node)]
            # Rescale after every factor. Only the ratio between the two
            # states matters, and without this the product of many messages
            # underflows to [0, 0] on high-degree nodes and the result would
            # silently fall back to uniform.
            total = prod.sum()
            if total > 0:
                prod /= total
        return prod

    def normalized(vec):
        total = np.sum(vec)
        return vec / total if total > 0 else uniform()

    messages = defaultdict(uniform)
    for iteration in range(max_iter):
        new_messages = defaultdict(uniform)

        for i, j in G.edges():
            edge_pot = potential_for(G[i][j].get('weight', 1.0))

            # i -> j: everything i knows except what j told it, pushed
            # through the edge potential (rows index i's state).
            from_i = combine(i, (k for k in G.neighbors(i) if k != j), messages)
            msg_i_to_j = normalized(from_i @ edge_pot)
            new_messages[(i, j)] = damping * msg_i_to_j + (1 - damping) * messages[(i, j)]

            # j -> i: the mirror image (columns index j's state).
            from_j = combine(j, (k for k in G.neighbors(j) if k != i), messages)
            msg_j_to_i = normalized(edge_pot @ from_j)
            new_messages[(j, i)] = damping * msg_j_to_i + (1 - damping) * messages[(j, i)]

        # Synchronous schedule: this round only read last round's messages.
        messages = new_messages

    beliefs = {}
    for node in nodes:
        belief = normalized(combine(node, G.neighbors(node), messages))
        beliefs[node] = belief[1]

    return beliefs

In [None]:
# Exploratory run of run_true_lbp for each seed country.
# (No `results3 = []` here: nothing in this cell uses it, and resetting it
# throws away the records that df_inferred_foreign3 was built from.)
preview_rows = 20

for country in df_foreign_sus['Country'].unique():
    print(country)
    seed_accounts = set(df_foreign_sus[df_foreign_sus['Country'] == country]['User_ID'])
    print(len(seed_accounts))

    belief_scores = run_true_lbp(G_copy, seed_accounts)
    # Printing one line per node floods the output on a real graph; show only
    # the highest-scoring nodes as a sanity check.
    top_nodes = sorted(belief_scores.items(), key=lambda item: item[1], reverse=True)
    for node, prob in top_nodes[:preview_rows]:
        print(f"Node {node}: {prob:.3f}")


In [None]:
# One run of the unweighted propagation (run_lbp) per seed country; every
# non-seed node is recorded with its score, highest score first.
results = []

for country in df_foreign_sus['Country'].unique():
    print(country)
    seed_accounts = set(df_foreign_sus[df_foreign_sus['Country'] == country]['User_ID'])
    print(len(seed_accounts))

    belief_scores = run_lbp(G_copy, seed_accounts)

    # Seeds are known already; rank only the remaining nodes.
    likely_foreign = sorted(
        [item for item in belief_scores.items() if item[0] not in seed_accounts],
        key=lambda item: item[1],
        reverse=True,
    )

    results += [
        {'Country': country, 'Inferred_User_ID': node, 'Belief_Score': score}
        for node, score in likely_foreign
    ]

df_inferred_foreign = pd.DataFrame(results)

In [None]:
# One run of the weight-aware propagation (run_lbp2) per seed country; every
# non-seed node is recorded with its score, highest score first.
results2 = []

for country in df_foreign_sus['Country'].unique():
    print(country)
    seed_accounts = set(df_foreign_sus[df_foreign_sus['Country'] == country]['User_ID'])
    print(len(seed_accounts))

    belief_scores2 = run_lbp2(G_copy, seed_accounts)

    likely_foreign = []
    for node, score in belief_scores2.items():
        if node in seed_accounts:
            continue  # seeds are inputs, not inferences
        likely_foreign.append((node, score))
    likely_foreign.sort(key=lambda entry: entry[1], reverse=True)

    for node, score in likely_foreign:
        record = {
            'Country': country,
            'Inferred_User_ID': node,
            'Belief_Score': score,
        }
        results2.append(record)

df_inferred_foreign2 = pd.DataFrame(results2)

In [None]:
df_inferred_foreign2.loc[df_inferred_foreign2.Belief_Score> 0.5].groupby('Country').count()

In [None]:
df_inferred_foreign.loc[df_inferred_foreign.Belief_Score> 0.5].groupby('Country').count()

In [None]:
node_list = list(G_copy.nodes)

In [None]:
df_inferred_foreign_valid = df_inferred_foreign.loc[df_inferred_foreign.Belief_Score> 0.5]

In [None]:
df_inferred_foreign_valid.groupby('Inferred_User_ID').count().sort_values('Country')

In [None]:
len(df_inferred_foreign.loc[df_inferred_foreign.Belief_Score> 0.7].drop_duplicates('Inferred_User_ID'))

In [None]:
len(df_inferred_foreign.loc[df_inferred_foreign.Belief_Score> 0.5].drop_duplicates('Inferred_User_ID'))

In [None]:
import networkx as nx
import numpy as np
import random
from math import exp

def normalize_edge_weights(G):
    """Write each edge's weight, scaled by the largest weight, to ``'norm_weight'``.

    Modifies the edge-attribute dicts of ``G`` in place and returns ``None``.
    Edges without a ``'weight'`` attribute count as 1.0. When the largest
    weight is not positive there is no usable scale, so every ``norm_weight``
    is set to 0.0 (the same fallback ``run_lbp_with_edge_potentials`` uses)
    instead of dividing by zero.
    """
    max_w = max((d.get('weight', 1.0) for _, _, d in G.edges(data=True)), default=1.0)
    for _, _, d in G.edges(data=True):
        d['norm_weight'] = d.get('weight', 1.0) / max_w if max_w > 0 else 0.0

def compute_edge_potential(xi, xj, wij, beta):
    """Return the pairwise potential for labels ``xi``/``xj`` on an edge of weight ``wij``.

    Equal labels give ``exp(beta * wij)`` (attraction), different labels give
    ``exp(-beta * wij)`` (repulsion).
    """
    exponent = beta * wij
    return exp(exponent if xi == xj else -exponent)

def run_lbp_phi(G, foreign_accounts, beta=1.0, max_iter=10, alpha=0.8):
    """Spread a "foreign" score over the graph using exponential edge potentials.

    Seeds are clamped to 1.0 and every other node starts at 0.5. For a
    non-seed node, each neighbour with score ``b`` and normalised edge weight
    ``w`` contributes ``phi_same * b`` to the numerator and
    ``phi_same * b + phi_diff * (1 - b)`` to the denominator, where
    ``phi_same = exp(beta * w)`` and ``phi_diff = exp(-beta * w)``. The node
    then moves towards numerator / denominator.

    Side effect: calls ``normalize_edge_weights(G)``, which writes a
    ``'norm_weight'`` attribute onto every edge of the graph passed in.

    Args:
        G: graph exposing ``nodes()``, ``edges(data=True)``,
            ``neighbors(node)`` and ``G[node][neighbor]`` attribute dicts.
        foreign_accounts: iterable of seed node ids (list, set, generator...).
        beta: sharpness of the edge potentials.
        max_iter: number of synchronous sweeps.
        alpha: weight of the new estimate; ``1 - alpha`` is kept from the
            node's previous score.

    Returns:
        dict mapping every node of ``G`` to its score.
    """
    # Callers in this notebook pass plain lists; a set turns the membership
    # test in the inner loop from a linear scan into an O(1) lookup.
    seeds = set(foreign_accounts)
    nodes = list(G.nodes())
    normalize_edge_weights(G)

    # Initialize beliefs
    belief = {node: 1.0 if node in seeds else 0.5 for node in nodes}

    for _ in range(max_iter):
        new_belief = {}
        for node in nodes:
            if node in seeds:
                new_belief[node] = 1.0
                continue

            num, denom = 0.0, 0.0
            for neighbor in G.neighbors(node):
                wij = G[node][neighbor]['norm_weight']
                b_neighbor = belief[neighbor]

                phi_same = compute_edge_potential(1, 1, wij, beta)
                phi_diff = compute_edge_potential(1, 0, wij, beta)

                # Weighted contribution
                num += phi_same * b_neighbor
                denom += phi_same * b_neighbor + phi_diff * (1 - b_neighbor)

            if denom > 0:
                prob = num / denom
                new_belief[node] = alpha * prob + (1 - alpha) * belief[node]
            else:
                # No neighbours: keep the previous score.
                new_belief[node] = belief[node]

        # Synchronous update: all nodes read the previous sweep's scores.
        belief = new_belief

    return belief

def evaluate_beta(G, foreign_accounts, beta_values, max_iter=10, seed=None):
    """Pick the beta that best recovers a held-out 10% of the seed accounts.

    Note: a later cell defines another ``evaluate_beta`` with a richer return
    value; once that cell has run, it replaces this definition.

    Args:
        G: graph passed through to ``run_lbp_phi``.
        foreign_accounts: iterable of seed node ids (list, set, ...).
        beta_values: candidate beta values to try.
        max_iter: sweeps per ``run_lbp_phi`` run.
        seed: optional seed for the train/test split; ``None`` uses the
            global ``random`` state.

    Returns:
        ``(best_beta, results)`` where ``results`` maps each beta to the sum
        of ``1 - belief`` over the held-out nodes found in the graph, and
        ``best_beta`` is the beta with the smallest such loss (``None`` when
        ``beta_values`` is empty).

    Raises:
        ValueError: if ``foreign_accounts`` is empty.
    """
    # random.sample needs a sequence; sets are rejected on Python 3.11+.
    candidates = list(foreign_accounts)
    if not candidates:
        raise ValueError("foreign_accounts must contain at least one account")

    rng = random.Random(seed) if seed is not None else random
    test_size = max(1, int(0.1 * len(candidates)))
    test_nodes = set(rng.sample(candidates, test_size))
    train_nodes = set(candidates) - test_nodes

    best_beta, best_loss = None, float('inf')
    results = {}

    for beta in beta_values:
        belief = run_lbp_phi(G, train_nodes, beta=beta, max_iter=max_iter)
        # Held-out seeds should score close to 1; nodes missing from the
        # graph are skipped.
        loss = sum(1 - belief[n] for n in test_nodes if n in belief)
        results[beta] = loss

        if loss < best_loss:
            best_loss = loss
            best_beta = beta

    return best_beta, results


In [None]:
import numpy as np
import random

def evaluate_beta(G, foreign_accounts, beta_values, max_iter=10, threshold=0.8, seed=None):
    """Compare beta values by how well a held-out 10% of the seeds is recovered.

    The seeds are split once into train and test nodes; ``run_lbp_phi`` is run
    from the train nodes for every beta and scored on the test nodes.

    Args:
        G: graph passed through to ``run_lbp_phi``.
        foreign_accounts: iterable of seed node ids (list, set, ...).
        beta_values: candidate beta values to try.
        max_iter: sweeps per ``run_lbp_phi`` run.
        threshold: a test node scoring below this counts as missed.
        seed: optional seed for the train/test split; ``None`` uses the
            global ``random`` state.

    Returns:
        dict with the best beta and loss under each criterion
        (``best_beta_sum_diff``, ``best_sum_diff_loss``,
        ``best_beta_cross_entropy``, ``best_cross_entropy_loss``) and
        ``results``, mapping each beta to its ``sum_diff_loss``,
        ``cross_entropy_loss``, ``missed`` and ``total_test``.

    Raises:
        ValueError: if ``foreign_accounts`` is empty.
    """
    # random.sample needs a sequence; sets are rejected on Python 3.11+.
    candidates = list(foreign_accounts)
    if not candidates:
        raise ValueError("foreign_accounts must contain at least one account")

    rng = random.Random(seed) if seed is not None else random
    test_size = max(1, int(0.1 * len(candidates)))
    test_nodes = set(rng.sample(candidates, test_size))
    train_nodes = set(candidates) - test_nodes

    best_beta_loss = None
    best_beta_ce = None
    best_loss = float('inf')
    best_ce = float('inf')

    results = {}
    epsilon = 1e-10  # keeps log() finite when a belief is exactly 0

    for beta in beta_values:
        belief = run_lbp_phi(G, train_nodes, beta=beta, max_iter=max_iter)

        # Both losses skip test nodes that are not in the graph...
        loss = sum(1 - belief[n] for n in test_nodes if n in belief)
        cross_entropy = -sum(np.log(belief[n] + epsilon) for n in test_nodes if n in belief)
        # ...whereas the missed count treats them as score 0.0, i.e. missed.
        missed_count = sum(1 for n in test_nodes if belief.get(n, 0.0) < threshold)

        results[beta] = {
            'sum_diff_loss': loss,
            'cross_entropy_loss': cross_entropy,
            'missed': missed_count,
            'total_test': len(test_nodes)
        }

        if loss < best_loss:
            best_loss = loss
            best_beta_loss = beta

        if cross_entropy < best_ce:
            best_ce = cross_entropy
            best_beta_ce = beta

    return {
        'best_beta_sum_diff': best_beta_loss,
        'best_sum_diff_loss': best_loss,
        'best_beta_cross_entropy': best_beta_ce,
        'best_cross_entropy_loss': best_ce,
        'results': results
    }


In [None]:
beliefs = run_lbp_phi(G, df_foreign_sus.loc[df_foreign_sus.Country == 'India'].User_ID.tolist(), beta=10, max_iter=10, alpha=0.8)

In [None]:
# One run_lbp_phi propagation per seed country. Nodes that are already listed
# as suspects for ANY country are left out of the inferred set.
results2 = []
# Build the exclusion set once; rebuilding User_ID.tolist() and scanning it
# for every node makes the filter quadratic.
known_ids = set(df_foreign_sus.User_ID)
for i in df_foreign_sus.Country.unique():
    print(i)
    country_seeds = set(df_foreign_sus.loc[df_foreign_sus.Country == i, 'User_ID'])
    beliefs = run_lbp_phi(G, country_seeds, beta=10, max_iter=10, alpha=0.8)
    likely_foreign = sorted(
        ((node, score) for node, score in beliefs.items() if node not in known_ids),
        key=lambda x: x[1],
        reverse=True,
    )

    for node, score in likely_foreign:
        results2.append({
            'Inferred_User_ID': node,'Belief_Score': score, 'Country': i})

df_inferred_foreign2 = pd.DataFrame(results2)

In [None]:
df_foreign_sus.loc[df_foreign_sus.Country == 'Russian']

In [None]:
df_foreign_sus.loc[df_foreign_sus.Country == 'Russian']

In [None]:
df_inferred_foreign2.loc[(df_inferred_foreign2.Belief_Score>0.8) & (df_inferred_foreign2.Country=='Russian')]

In [None]:
len(df_inferred_foreign2.loc[df_inferred_foreign2.Belief_Score>0.8])

In [None]:
#df_inferred_foreign2.loc[(df_inferred_foreign2.Belief_Score>0.8) & (df_inferred_foreign2.Country=='Russian')].Inferred_User_ID.tolist()

In [None]:
df_inferred_foreign2.loc[(df_inferred_foreign2.Belief_Score>0.8) & (df_inferred_foreign2.Country=='Chinese')].Inferred_User_ID.tolist()

In [None]:
df_inferred_foreign_valid

In [None]:
df_inferred_foreign2

In [None]:
pd.concat([df_inferred_foreign2.loc[df_inferred_foreign2.Belief_Score>0.8],df_inferred_foreign_valid]).drop_duplicates(['Inferred_User_ID', 'Country']).to_csv('Suspected_LBP_Foreign.csv', index = False)

In [None]:
results2

In [None]:
# Suspected accounts that are present in the graph. The node set is taken
# straight from G so this cell does not rely on `common_nodes`, which the
# notebook only defines in its last cell. The selected rows are the same:
# every User_ID here is in df_foreign_sus by construction.
df_for_net = df_foreign_sus.loc[df_foreign_sus.User_ID.isin(set(G.nodes()))]

In [None]:
df_for_net =df_for_net.reset_index(drop = True)

In [None]:
b_beta= evaluate_beta(G, df_for_net.User_ID.tolist(),[0.1, 0.2,0.5, 1, 2, 5,10],threshold=0.7)

In [None]:
b_beta

In [None]:
df_foreign_sus.User_ID.tolist()

In [None]:
# Rank the non-suspect nodes from the most recent `beliefs` run.
# NOTE(review): this overwrites results2 / df_inferred_foreign2 with rows that
# have no 'Country' column, so the Country filters in earlier cells will not
# work on the new frame.
results2 = []
# Build the exclusion set once; rebuilding User_ID.tolist() and scanning it
# for every node makes the filter quadratic.
known_ids = set(df_foreign_sus.User_ID)
likely_foreign = sorted(
    ((node, score) for node, score in beliefs.items() if node not in known_ids),
    key=lambda x: x[1],
    reverse=True,
)

for node, score in likely_foreign:
    results2.append({
        'Inferred_User_ID': node,'Belief_Score': score})

df_inferred_foreign2 = pd.DataFrame(results2)

In [None]:
# Suspected account ids that also occur as nodes in the graph.
df_nodes = set(df_foreign_sus['User_ID'])
graph_nodes = set(G.nodes())
common_nodes = graph_nodes & df_nodes
