In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import json
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
import matplotlib.pyplot as plt

k_core = nx.read_gexf("../graphs/hashtag_network_k10_nothemes.gexf")

In [2]:
communities = pd.read_csv("../data/themes_10core_final_updated.csv")

# For k=10 graph
def add_community_info(graph, communities_df):
    # Create community dictionary
    community_dict = communities_df.set_index('hashtag')['theme'].to_dict()
    
    # Add community as node attribute
    nx.set_node_attributes(graph, community_dict, 'community')
    
    # Print some statistics about communities in the graph
    graph_communities = [community_dict.get(node, 'misc') for node in graph.nodes()]
    community_counts = pd.Series(graph_communities).value_counts()
    
    print("Community distribution:")
    for comm, count in community_counts.items():
        print(f"{comm}: {count} nodes")

    # Print nodes with unknown communities
    unknown_nodes = [node for node in graph.nodes() if community_dict.get(node, 'unknown') == 'unknown']
    print(f"\nNumber of nodes with unknown community: {len(unknown_nodes)}")
    if len(unknown_nodes) > 0:
        print("Nodes without community assignment:")
        print(unknown_nodes)
    return graph

In [3]:
k_core_theme = add_community_info(k_core, communities)

# Export graph for Gephi
# nx.write_gexf(k_core_theme, "../graphs/hashtag_network_k10_withtheme.gexf")

Community distribution:
misc: 570 nodes
awareness_and_advocacy: 280 nodes
health_conditions: 277 nodes
platform: 183 nodes
emotions_and_feelings: 170 nodes
identity_and_community: 155 nodes
commonly_misused_substances: 154 nodes
location: 116 nodes
occupation: 73 nodes
other_substances: 68 nodes
humor: 56 nodes
cannabis: 51 nodes
alcohol: 43 nodes
substance_effects: 41 nodes
consumption_method: 41 nodes
tobacco_nicotine: 31 nodes
cognitive_enhancement: 24 nodes

Number of nodes with unknown community: 4
Nodes without community assignment:
['narcoticsanonynous', 'mentalhealthawarness', 'comedу', 'foryourpage']


In [4]:
# add the 4 missing communities manually
new_assignments = {
    'narcoticsanonynous': 'identity_and_community',
    'mentalhealthawarness': 'awareness_and_advocacy', 
    'comedу': 'humor',
    'foryourpage': 'platform'
}

# Assuming 'communities' is your DataFrame:
for hashtag, theme in new_assignments.items():
    communities.loc[len(communities)] = {'hashtag': hashtag, 'theme': theme}

In [5]:
# Recreate graph with updated communities
k_core_theme = add_community_info(k_core, communities)

# Check for any remaining unknown assignments
unknown_nodes = [node for node in k_core_theme.nodes() if k_core_theme.nodes[node].get('community', 'unknown') == 'unknown']
print(f"\nNumber of nodes with unknown community: {len(unknown_nodes)}")
if len(unknown_nodes) > 0:
    print("Nodes without community assignment:")
    print(unknown_nodes)

Community distribution:
misc: 566 nodes
awareness_and_advocacy: 281 nodes
health_conditions: 277 nodes
platform: 184 nodes
emotions_and_feelings: 170 nodes
identity_and_community: 156 nodes
commonly_misused_substances: 154 nodes
location: 116 nodes
occupation: 73 nodes
other_substances: 68 nodes
humor: 57 nodes
cannabis: 51 nodes
alcohol: 43 nodes
substance_effects: 41 nodes
consumption_method: 41 nodes
tobacco_nicotine: 31 nodes
cognitive_enhancement: 24 nodes

Number of nodes with unknown community: 0

Number of nodes with unknown community: 0


In [6]:
# Create edge list with weights and community information as DataFrame
edges = []
for u, v, d in k_core.edges(data=True):
    source_community = k_core.nodes[u].get('community', 'unknown')
    target_community = k_core.nodes[v].get('community', 'unknown')
    edges.append({
        'source': u,
        'source_community': source_community,
        'target': v, 
        'target_community': target_community,
        'weight': d['weight']
    })

edge_df = pd.DataFrame(edges)

# Export to CSV
edge_df.to_csv("../graphs/10core_edgelist_with_communities.csv", index=False)

In [7]:
def create_node_list(graph):
    node_data = []
    
    # For each node in the graph
    for node in graph.nodes():
        # Get community
        community = graph.nodes[node].get('community', 'Unknown')
        
        # Calculate weighted degree
        weighted_degree = sum(w['weight'] for u, v, w in graph.edges(node, data=True))
        
        # Add to list
        node_data.append({
            'Node': node,
            'Community': community,
            'Weighted_Degree': weighted_degree
        })
    
    # Convert to DataFrame and save
    df = pd.DataFrame(node_data)
    df = df.sort_values('Weighted_Degree', ascending=False)  # Optional: sort by weighted degree
    df.to_csv('../data/k10node_list_with_weights.csv', index=False)
    
    return df

In [8]:
node_list_df = create_node_list(k_core)

---
# get ego networks

In [9]:
key_hashtags = ['addiction', 'recovery', 'harmreduction', 
                 'harmreductionsaveslives', 'wedorecover',
                 'pinger', 'opioids', 'fent', 'pingtok', 'narcansaveslives']

def get_n_hop_ego_network(G, node, n):
    ego_network = nx.ego_graph(G, node, radius=n)
    return ego_network

In [10]:
ego_networks = {}
for hashtag in key_hashtags:
    if hashtag in k_core.nodes():
        ego_networks[hashtag] = get_n_hop_ego_network(k_core, hashtag, 1)
        print(f"{hashtag} ego network has {len(ego_networks[hashtag].nodes())} nodes and {len(ego_networks[hashtag].edges())} edges")
    else:
        print(f"Warning: {hashtag} not found in network")


addiction ego network has 1439 nodes and 26804 edges
recovery ego network has 1314 nodes and 24417 edges
harmreduction ego network has 1031 nodes and 19318 edges
harmreductionsaveslives ego network has 771 nodes and 13760 edges
wedorecover ego network has 1043 nodes and 19342 edges
pinger ego network has 145 nodes and 1451 edges
opioids ego network has 625 nodes and 10745 edges
fent ego network has 295 nodes and 5944 edges
pingtok ego network has 226 nodes and 3352 edges
narcansaveslives ego network has 453 nodes and 8030 edges


In [11]:
for hashtag, graph in ego_networks.items():
    print(f"\nCommunity distribution in {hashtag} ego network:")
    communities = [graph.nodes[node].get('community', 'unknown') for node in graph.nodes()]
    community_counts = pd.Series(communities).value_counts()
    print(community_counts)


Community distribution in addiction ego network:
health_conditions              247
awareness_and_advocacy         237
misc                           206
emotions_and_feelings          120
platform                       119
commonly_misused_substances    106
identity_and_community         105
location                        59
occupation                      48
humor                           41
alcohol                         32
other_substances                25
substance_effects               24
consumption_method              22
cannabis                        22
tobacco_nicotine                13
cognitive_enhancement           13
Name: count, dtype: int64

Community distribution in recovery ego network:
awareness_and_advocacy         226
misc                           203
health_conditions              192
emotions_and_feelings          121
platform                       107
identity_and_community         101
commonly_misused_substances     93
location                        53


In [12]:
# analyze community compoisiton
def analyze_community_distribution(G):
    communities = [G.nodes[node].get('community') for node in G.nodes()]
    return pd.Series(communities).value_counts().to_dict()

# Get inter-community connections
def get_community_connections(G):
    community_edges = []
    for u, v, w in G.edges(data=True):
        comm1 = G.nodes[u].get('community')
        comm2 = G.nodes[v].get('community')
        if comm1 != comm2:
            community_edges.append((comm1, comm2, w['weight']))
    return pd.DataFrame(community_edges, columns=['community1', 'community2', 'weight'])

In [13]:
# focus on high centrality nodes
def get_key_nodes(G, top_n=20):
    # Calculate different centrality measures
    degree_cent = nx.degree_centrality(G)
    betweenness_cent = nx.betweenness_centrality(G)
    eigenvector_cent = nx.eigenvector_centrality(G, max_iter=1000)
    
    # Combine into DataFrame
    centrality_df = pd.DataFrame({
        'node': list(G.nodes()),
        'degree': list(degree_cent.values()),
        'betweenness': list(betweenness_cent.values()),
        'eigenvector': list(eigenvector_cent.values()),
        'community': [G.nodes[n].get('community') for n in G.nodes()]
    })
    
    return centrality_df.sort_values('betweenness', ascending=False).head(top_n)

In [14]:
centrality = get_key_nodes(k_core)

In [15]:
centrality

Unnamed: 0,node,degree,betweenness,eigenvector,community
1377,addiction,0.616638,0.129029,0.192598,health_conditions
2148,recovery,0.563036,0.097912,0.182256,health_conditions
983,sober,0.4747,0.058327,0.163937,awareness_and_advocacy
1334,wedorecover,0.446827,0.049856,0.156086,awareness_and_advocacy
156,harmreduction,0.441681,0.049461,0.154108,awareness_and_advocacy
2160,sobriety,0.400086,0.037772,0.14767,health_conditions
597,smoke,0.278731,0.032617,0.102235,consumption_method
1624,soberlife,0.346913,0.026114,0.132868,awareness_and_advocacy
762,cat,0.234134,0.02605,0.071866,misc
1552,pain,0.243568,0.022678,0.09366,health_conditions


In [29]:
# create community level network 
def create_community_network(G):
    # Create network where nodes are communities
    community_edges = {}
    for u, v, data in G.edges(data=True):
        comm1 = G.nodes[u].get('community')
        comm2 = G.nodes[v].get('community')
        if comm1 != comm2:
            key = tuple(sorted([comm1, comm2]))
            community_edges[key] = community_edges.get(key, 0) + data['weight']
    
    # Create new graph
    C = nx.Graph()
    for (comm1, comm2), weight in community_edges.items():
        C.add_edge(comm1, comm2, weight=weight)
    
    return C

In [30]:
C = create_community_network(k_core)

In [32]:
# compare ego networks
def compare_ego_networks(ego_networks):
    comparison = []
    for name, G in ego_networks.items():
        stats = {
            'hashtag': name,
            'nodes': len(G.nodes()),
            'edges': len(G.edges()),
            'density': nx.density(G),
            'avg_clustering': nx.average_clustering(G),
            'communities': len(set(nx.get_node_attributes(G, 'community').values()))
        }
        comparison.append(stats)
    return pd.DataFrame(comparison)

In [33]:
compare_ego_networks(ego_networks)

Unnamed: 0,hashtag,nodes,edges,density,avg_clustering,communities
0,addiction,1439,26804,0.025907,0.685995,17
1,recovery,1314,24417,0.028305,0.702971,17
2,harmreduction,1031,19318,0.036383,0.766489,17
3,harmreductionsaveslives,771,13760,0.046356,0.840224,17
4,wedorecover,1043,19342,0.035594,0.745338,17
5,pinger,145,1451,0.138985,0.69148,15
6,opioids,625,10745,0.055103,0.775112,16
7,fent,295,5944,0.137069,0.707104,15
8,pingtok,226,3352,0.131839,0.650751,15
9,narcansaveslives,453,8030,0.078435,0.835541,15


Shows how interconnected the network is relative to all possible connections

Smaller networks (#pinger, #fent, #pingtok) have higher density (0.13-0.14)

Larger networks (#addiction, #recovery) have lower density (0.02-0.03)

#harmreductionsaveslives and #narcansaveslives have highest clustering (0.84), indicating very cohesive local communities

Most networks contain all 17 identified themes/communities

Harm reduction hashtags show high clustering, suggesting strong community cohesion
