In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from itertools import combinations
from collections import defaultdict, Counter

import community as community_louvain
from networkx.algorithms.community import (
    louvain_communities,
    modularity,
    greedy_modularity_communities,
)


In [None]:
# Load the table that carries the Generic_OutCome column used below to filter clusters.
df_generic = pd.read_csv('Generic_OutCome.csv')

In [None]:
# Load the text-duplicate tweet table.
df_cluster = pd.read_csv('All_Text_Duplicate_Tweet.csv')
# No `on=` is given, so this is an inner join on every column name the two frames share.
# NOTE(review): the shared key(s) are not visible here — consider spelling them out with `on=`.
df_cluster = pd.merge(df_cluster, df_generic)
# Keep only the rows whose Generic_OutCome is exactly 'not generic'.
df_non_generic = df_cluster.loc[df_cluster.Generic_OutCome == 'not generic']
# NOTE(review): absolute, machine-specific path. Unpickling can execute arbitrary code,
# so this must only ever point at a trusted file.
df_only_image = pd.read_pickle('/a/bear.cs.xxx.edu./disk/bear-b/users/xxww/PolitiX/Codebase/Complete_Similarity_Both_Text_Image.pkl')

In [None]:
# Keep one row per (Cluster_ID, User_ID) so a user appears at most once in each cluster.
df_non_generic = df_non_generic.drop_duplicates(['Cluster_ID', 'User_ID']).reset_index(drop = True)

In [None]:
# Text clusters removed before user pairs are counted.
# NOTE(review): the notebook does not record why these IDs were chosen — document the criterion.
excluded_text_clusters = [79, 4, 69, 189, 46968, 50, 171, 331, 25, 22]
is_excluded_cluster = df_non_generic['Cluster_ID'].isin(excluded_text_clusters)
df_non_generic = df_non_generic.loc[~is_excluded_cluster]

In [None]:
# Inspect the rows that belong to text cluster 4320.
df_non_generic.loc[df_non_generic.Cluster_ID == 4320]

In [None]:
# Number of distinct users left in the filtered text data (a missing ID counts as one value).
df_non_generic['User_ID'].nunique(dropna=False)

In [None]:
# Show the column names available after the merge and filters.
df_non_generic.columns

In [None]:
# Display the filtered text-duplicate frame.
df_non_generic

In [None]:
# Count, for every unordered pair of users, how many text clusters both users appear in.
pair_weights = defaultdict(int)
for cluster_id, group in df_non_generic.groupby('Cluster_ID'):
    users = group['User_ID'].unique()
    # Sorting first gives every pair one canonical (smaller, larger) orientation,
    # so (a, b) and (b, a) accumulate into the same key.
    for user1, user2 in combinations(sorted(users), 2):
        pair_weights[(user1, user2)] += 1

# Explicit column names keep the frame well-formed when no pair exists; without them an
# empty result has no 'Weight' column and the sort below raises KeyError.
df_user_pairs = pd.DataFrame(
    [(u1, u2, w) for (u1, u2), w in pair_weights.items()],
    columns=['User1', 'User2', 'Weight'],
)

df_user_pairs = df_user_pairs.sort_values(by='Weight', ascending=False)
print(df_user_pairs.head())


In [None]:
# Spot-check up to 20 user pairs that share at least 50 text clusters.
heavy_pairs = df_user_pairs.loc[df_user_pairs.Weight >= 50]
# sample() raises when n exceeds the number of rows, so cap n; the fixed seed makes a
# re-run show the same rows.
heavy_pairs.sample(n=min(20, len(heavy_pairs)), random_state=0)

In [None]:
# Drop self-matches: keep only rows whose source and target tweets come from different users.
has_different_users = df_only_image['Source_User_ID'].ne(df_only_image['Target_User_ID'])
df_only_image = df_only_image.loc[has_different_users]

In [None]:
# Restrict the similarity table to rows whose Similarity_Type is 'Image'.
is_image_match = df_only_image['Similarity_Type'].eq('Image')
df_only_image = df_only_image.loc[is_image_match]

In [None]:
# Undirected graph over tweet IDs: one edge per (source tweet, target tweet) similarity row.
# Edges are inserted in bulk from the two ID columns instead of building a Series per row
# with iterrows().
G_image = nx.Graph()
G_image.add_edges_from(
    zip(df_only_image['Source_Tweet_ID'], df_only_image['Target_Tweet_ID'])
)

# Every connected component becomes one image cluster. Cluster_ID is simply the position
# of the component in the enumeration, so it is not stable across different inputs.
components = nx.connected_components(G_image)
component_mapping = {
    node: idx for idx, comp in enumerate(components) for node in comp
}

community_df = pd.DataFrame(list(component_mapping.items()), columns=['Tweet_ID', 'Cluster_ID'])

# Attach the cluster of the source tweet to every similarity row; the helper key column
# brought in by the merge is dropped again.
df_with_communities = (
    df_only_image
    .merge(community_df, how='left', left_on='Source_Tweet_ID', right_on='Tweet_ID')
    .drop(columns=['Tweet_ID'])
)


In [None]:
# Modularity of the image graph under its connected-component clustering.
# The original call passed `partition`, which is only created later in the notebook and
# is keyed by user IDs rather than the tweet IDs that make up G_image, so the cell could
# not run. `component_mapping` is the tweet -> cluster assignment built for G_image.
# The result is stored under its own name so it no longer rebinds the `modularity`
# function imported from networkx in the first cell.
# NOTE(review): confirm that the component clustering is the partition that was meant here.
modularity_image = community_louvain.modularity(component_mapping, G_image)

In [None]:
#modularity

In [None]:
# Display the tweet -> image-cluster assignment ordered by cluster.
community_df.sort_values('Cluster_ID')

In [None]:
# Stack both endpoints of every image-similarity row into one long (Tweet_ID, User_ID) table.
source_endpoints = df_only_image[['Source_Tweet_ID', 'Source_User_ID']]
source_endpoints = source_endpoints.rename(
    columns={'Source_Tweet_ID': 'Tweet_ID', 'Source_User_ID': 'User_ID'}
)

target_endpoints = df_only_image[['Target_Tweet_ID', 'Target_User_ID']]
target_endpoints = target_endpoints.rename(
    columns={'Target_Tweet_ID': 'Tweet_ID', 'Target_User_ID': 'User_ID'}
)

# Source rows come first, then target rows; the index is rebuilt from 0.
df_concat_image = pd.concat([source_endpoints, target_endpoints], ignore_index=True)


In [None]:
# A tweet can appear as an endpoint of many rows; keep each (Tweet_ID, User_ID) row once.
df_concat_image  = df_concat_image.drop_duplicates().reset_index(drop = True)

In [None]:
# Attach each tweet's image cluster. Tweet_ID is the only column the two frames share,
# so naming it explicitly gives the same inner join as the implicit form.
df_concat_image = df_concat_image.merge(community_df, on='Tweet_ID', how='inner')

In [None]:
# Non-null row counts per image cluster, ordered ascending by the User_ID count.
df_concat_image.groupby('Cluster_ID').count().sort_values('User_ID')

In [None]:
# Remove image clusters 161 and 20 before user pairs are counted.
# NOTE(review): these IDs are enumeration positions of connected components, so they only
# identify the intended clusters for the exact input and row order used when they were
# picked; the reason for excluding them is not recorded in the notebook.
df_concat_image = df_concat_image.loc[~df_concat_image.Cluster_ID.isin([161, 20])]

In [None]:
# Display the remaining (Tweet_ID, User_ID, Cluster_ID) rows ordered by cluster.
df_concat_image.sort_values('Cluster_ID')

In [None]:
# Count, for every unordered pair of users, how many image clusters both users appear in.
pair_weights_image = defaultdict(int)
for cluster_id, group in df_concat_image.groupby('Cluster_ID'):
    users_image = group['User_ID'].unique()
    # Sorting first gives every pair one canonical (smaller, larger) orientation,
    # so (a, b) and (b, a) accumulate into the same key.
    for user1, user2 in combinations(sorted(users_image), 2):
        pair_weights_image[(user1, user2)] += 1

# Explicit column names keep the frame well-formed when no pair exists; without them an
# empty result has no 'Weight' column and the sort below raises KeyError.
df_user_pairs_image = pd.DataFrame(
    [(u1, u2, w) for (u1, u2), w in pair_weights_image.items()],
    columns=['User1', 'User2', 'Weight'],
)

df_user_pairs_image = df_user_pairs_image.sort_values(by='Weight', ascending=False)
print(df_user_pairs_image.head())


In [None]:
# Display the image-based user-pair weights.
df_user_pairs_image

In [None]:
# Display the text-based user-pair weights.
df_user_pairs

In [None]:
def normalize_pair(df, type_label):
    """Return a copy of ``df`` with each (User1, User2) pair in ascending order.

    Ordering the two IDs makes (a, b) and (b, a) the same key, so frames from
    different sources can be grouped on ``['User1', 'User2']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``User1`` and ``User2``; other columns are
        carried over unchanged.
    type_label : object
        Value written to every row of the new ``Type`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    df = df.copy()
    # Sort the two ID columns within each row in one vectorized call instead of
    # calling sorted() on a per-row Series.
    ordered = np.sort(df[['User1', 'User2']].to_numpy(), axis=1)
    df['User1'] = ordered[:, 0]
    df['User2'] = ordered[:, 1]
    df['Type'] = type_label
    return df

# Tag each pair table with its origin and put the user IDs of every pair in canonical order.
df_user_pairs_norm = normalize_pair(df_user_pairs, 'Text')
df_user_pairs_image_norm = normalize_pair(df_user_pairs_image, 'Image')

# Image rows first, then text rows.
df_combined = pd.concat(
    [df_user_pairs_image_norm, df_user_pairs_norm],
    ignore_index=True,
)

# One row per user pair: Weight is the sum over both sources (the Type column is not kept).
df_final = (
    df_combined
    .groupby(['User1', 'User2'], as_index=False)['Weight']
    .sum()
)


In [None]:
# Display the combined (text + image) user-pair weights.
df_final

In [None]:
# Louvain is randomized. Later cells refer to communities by their numeric ID, so the
# seed is fixed to make those IDs identical on every run.
LOUVAIN_SEED = 42

# Weighted user graph: one edge per user pair, weighted by the number of shared clusters.
df_final_copy = df_final.loc[df_final.Weight >= 1]
G = nx.Graph()
# Bulk insert of (User1, User2, Weight) triples; the weight lands in the 'weight' attribute.
G.add_weighted_edges_from(
    df_final_copy[['User1', 'User2', 'Weight']].itertuples(index=False, name=None)
)

partition = community_louvain.best_partition(G, weight='weight', random_state=LOUVAIN_SEED)
modularity_value = community_louvain.modularity(partition, G, weight='weight')
# Keep the community of each node on the graph itself so it is saved with the graph.
nx.set_node_attributes(G, partition, 'community')
print(f"Modularity: {modularity_value:.4f}")
print(f"Number of communities found: {len(set(partition.values()))}")

num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
# Guard against an empty graph, for which the mean degree is reported as 0.
avg_degree = sum(dict(G.degree()).values()) / num_nodes if num_nodes > 0 else 0
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Average degree: {avg_degree:.2f}")


In [None]:
import networkx as nx
import pickle

# Persist the user graph (including its node and edge attributes) for reuse elsewhere.
graph_pickle_path = "graph_IOX.pkl"
with open(graph_pickle_path, "wb") as graph_file:
    pickle.dump(G, graph_file)


In [None]:
from collections import defaultdict
import pandas as pd

# Invert the node -> community mapping into community -> list of member nodes.
community_nodes = defaultdict(list)
for member, member_comm in partition.items():
    community_nodes[member_comm].append(member)

# One summary record per community, measured on the subgraph induced by its members.
community_stats = []
for comm_id, nodes in community_nodes.items():
    subgraph = G.subgraph(nodes)
    num_nodes = subgraph.number_of_nodes()
    num_edges = subgraph.number_of_edges()

    if num_nodes > 0:
        avg_degree = sum(deg for _, deg in subgraph.degree()) / num_nodes
    else:
        avg_degree = 0

    # Only edges with both endpoints inside the community are part of the subgraph.
    weights = [attrs['weight'] for *_, attrs in subgraph.edges(data=True)]
    avg_weight = sum(weights) / len(weights) if weights else 0
    density = nx.density(subgraph)

    record = {
        'Community_ID': comm_id,
        'Num_Nodes': num_nodes,
        'Num_Edges': num_edges,
        'Avg_Degree': round(avg_degree, 2),
        'Avg_Weight': round(avg_weight, 2),
        'Density': round(density, 4)
    }
    community_stats.append(record)

# Largest communities first.
df_community_stats = pd.DataFrame(community_stats)
df_community_stats = df_community_stats.sort_values(by='Num_Nodes', ascending=False)
print(df_community_stats)


In [None]:
# Show only the communities that have at least 5 member nodes.
df_community_stats.loc[df_community_stats.Num_Nodes>=5]

In [None]:
# Flatten community -> members into one (User_ID, Community_ID) record per user.
user_community_pairs = []
for group_id, group_members in community_nodes.items():
    user_community_pairs.extend(
        {'User_ID': group_member, 'Community_ID': group_id}
        for group_member in group_members
    )

df_user_communities = pd.DataFrame(user_community_pairs)
print(df_user_communities.head())

In [None]:
# Display the user -> community assignment table.
df_user_communities

In [None]:
# Invert the node -> community mapping into community -> list of member nodes.
community_nodes = defaultdict(list)
for node, comm_id in partition.items():
    community_nodes[comm_id].append(node)

# Re-run Louvain inside each community and print the modularity of that sub-partition.
for comm_id, nodes in community_nodes.items():
    print(comm_id)
    subgraph = G.subgraph(nodes)
    partition_sub = community_louvain.best_partition(subgraph, weight='weight')
    try:
        modularity_value_sub = community_louvain.modularity(partition_sub, subgraph, weight='weight')
    except ValueError as err:
        # python-louvain raises ValueError when the graph has no links, because modularity
        # is undefined there. Report it and move on; any other error is left to propagate
        # instead of being swallowed by a bare except.
        print(f"modularity undefined for community {comm_id}: {err}")
    else:
        print(modularity_value_sub)
    



In [None]:
import networkx as nx
import leidenalg
import igraph as ig

In [None]:
# Graph for the Leiden run: only user pairs that share at least 2 clusters.
# NOTE: this rebinds df_final_copy with a stricter threshold than the `>= 1` frame that
# graph G was built from, so after this cell df_final_copy no longer describes G.
df_final_copy = df_final.loc[df_final.Weight >= 2]
G_nd = nx.Graph()
# Bulk insert of (User1, User2, Weight) triples; the weight lands in the 'weight' attribute.
G_nd.add_weighted_edges_from(
    df_final_copy[['User1', 'User2', 'Weight']].itertuples(index=False, name=None)
)


In [None]:
# Convert the networkx graph into an igraph graph, the graph type passed to leidenalg below.
G_ig = ig.Graph.from_networkx(G_nd)

In [None]:
# Leiden community detection that optimizes modularity.
# NOTE(review): no `weights=` argument is passed, whereas the Louvain run uses
# weight='weight' — confirm that an unweighted Leiden run is intended.
# NOTE(review): no `seed=` argument is passed, so the result may differ between runs.
partition_leiden = leidenalg.find_partition(G_ig, leidenalg.ModularityVertexPartition)

In [None]:
# Keep the modularity value reported by the Leiden partition object.
# NOTE(review): this rebinds `modularity`, which hides the networkx `modularity` function
# imported in the first cell for the rest of the session.
modularity = partition_leiden.modularity

In [None]:
# Display the Leiden modularity stored by the previous cell.
modularity

In [None]:
# Eigenvector centrality of every node in the Louvain graph G (no `weight=` is passed).
# NOTE(review): the next cell recomputes exactly the same value, so this cell looks redundant.
centrality = nx.eigenvector_centrality(G)

In [None]:
import networkx as nx

centrality = nx.eigenvector_centrality(G)

# For each community keep a list holding its single most central (node, score) pair;
# the list is empty when none of the community's nodes has a centrality score.
top_nodes = {
    comm_id: sorted(
        ((node, centrality[node]) for node in nodes if node in centrality),
        key=lambda scored_node: scored_node[1],
        reverse=True,
    )[:1]
    for comm_id, nodes in community_nodes.items()
}


# Include Political Affiliation

In [None]:
# Load the three political-affiliation result tables in one pass.
df_pol_res1, df_pol_res2, df_pol_res3 = [
    pd.read_csv(result_path)
    for result_path in ('PAN_Result_1.csv', 'PAN_Result_2.csv', 'PAN_Result_3.csv')
]

In [None]:
# Attach the three affiliation tables to the user -> community table.
# No `on=` is given, so every step is an inner join on all column names the two sides share.
# NOTE(review): if more than one input carries an 'Unnamed: 0' column it silently becomes
# part of the join key — confirm the intended key and pass it with `on=`.
df_part = (
    df_user_communities
    .merge(df_pol_res1)
    .merge(df_pol_res2)
    .merge(df_pol_res3)
)

In [None]:
# Drop the 'Unnamed: 0' column (presumably a saved index from one of the CSVs — confirm).
df_part = df_part.drop(columns = 'Unnamed: 0')

In [None]:
# Non-null row counts per PR_Result value.
df_part.groupby('PR_Result').count()

In [None]:
# Non-null row counts per Label value.
df_part.groupby('Label').count()

In [None]:
# Non-null row counts per PR3 value.
df_part.groupby('PR3').count()

In [None]:
# Community IDs selected for the per-community label breakdown further down.
# NOTE(review): these IDs come from a Louvain run that is not seeded, so a fresh run can
# number the communities differently and this list would then point at other communities.
needed_comm = [1,2,11,0,5,3,6,10,9,20,12,61,7,28]

In [None]:
# Spot-check up to 20 users of community 0 whose Label is 1.
comm0_label1 = df_part.loc[(df_part.Community_ID == 0) & (df_part.Label == 1)]
# sample() raises when n exceeds the number of rows, so cap n; the fixed seed makes a
# re-run show the same rows.
comm0_label1.sample(n=min(20, len(comm0_label1)), random_state=0)

In [None]:
# Inspect the rows that belong to text cluster 51576.
df_non_generic.loc[df_non_generic.Cluster_ID == 51576]

In [None]:
# Write the user/community/affiliation table to the working directory without the index.
df_part.to_csv('User_Community_PolAffiliation.csv', index =False)

In [None]:
# For every selected community with at least 5 rows, print its ID, its size and the
# row counts per Label.
for wanted_comm in needed_comm:
    comm_rows = df_part.loc[df_part.Community_ID == wanted_comm]
    if len(comm_rows) < 5:
        continue
    print(wanted_comm)
    print(len(comm_rows))
    print(comm_rows.groupby(['Community_ID', 'Label']).count())

In [None]:
# Non-null row counts per Label value (same expression as an earlier cell).
df_part.groupby('Label').count()

In [None]:
# Spot-check up to 20 users whose Label is 3.
label3_rows = df_part.loc[df_part.Label == 3]
# sample() raises when n exceeds the number of rows, so cap n; the fixed seed makes a
# re-run show the same rows.
label3_rows.sample(n=min(20, len(label3_rows)), random_state=0)

# Include Foreign Accounts

In [None]:
import glob

# NOTE(review): an empty folder_path makes the pattern '*.csv', i.e. every CSV in the
# current working directory — the same directory the notebook reads its other CSV inputs
# from and writes 'User_Community_PolAffiliation.csv' to. Point this at the folder that
# holds only the account lists.
folder_path = ''
# glob returns files in arbitrary order; sorting makes the row order of the result stable.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not csv_files:
    # pd.concat fails with a generic message on an empty list; name the real problem instead.
    raise FileNotFoundError(f"No CSV files found in folder {folder_path!r}")

# Read every file and remember which file each row came from.
df_list = [
    pd.read_csv(file).assign(source_file=os.path.basename(file))
    for file in csv_files
]

combined_df = pd.concat(df_list, ignore_index=True)
print(combined_df.head())

In [None]:
# Show one randomly chosen row of the combined table (unseeded, so it changes per run).
combined_df.sample()

In [None]:
# Users of df_part whose ID also appears in combined_df, counted per community.
part_user_is_listed = df_part['User_ID'].isin(combined_df['User_ID'].tolist())
df_part.loc[part_user_is_listed].groupby('Community_ID').count()

In [None]:
# Users of df_part whose ID also appears in combined_df, counted per Label.
part_user_is_listed = df_part['User_ID'].isin(combined_df['User_ID'].tolist())
df_part.loc[part_user_is_listed].groupby('Label').count()

In [None]:
# Users of df_pol_res2 whose ID also appears in combined_df, counted per Label.
res2_user_is_listed = df_pol_res2['User_ID'].isin(combined_df['User_ID'].tolist())
df_pol_res2.loc[res2_user_is_listed].groupby('Label').count()

In [None]:
# Inner join of the second affiliation table with the combined table.
# No `on=` is given, so the join uses every column name the two frames share.
df_chicka = pd.merge(df_pol_res2, combined_df)

In [None]:
# Rows with Label 0 that were read from the file named 'Russian_ID.csv'.
df_chicka.loc[(df_chicka.Label == 0) & (df_chicka.source_file == 'Russian_ID.csv')]

In [None]:
# Non-null row counts per originating CSV file.
df_chicka.groupby('source_file').count()