# Community Detection in FriendFeed Network - Leiden Method



In [1]:

## 1. Import Libraries and Load Data
import gc
import time
import warnings
from collections import Counter, defaultdict

import dask.dataframe as dd
import igraph as ig
import leidenalg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Plotting settings
# Render matplotlib figures inline and suppress FutureWarning messages.
%matplotlib inline
warnings.filterwarnings("ignore", category=FutureWarning)

# Start timing
# Wall-clock reference point; the final cell subtracts it from time.time()
# to report the notebook's total execution time.
start_time = time.time()

# Load preprocessed data with Dask
# Each name below is a Dask DataFrame read from the cleaned parquet files;
# later cells call .compute() explicitly whenever concrete values are needed.
users = dd.read_parquet('processed_data/users_cleaned.parquet')
subs = dd.read_parquet('processed_data/subs_cleaned.parquet')
following = dd.read_parquet('processed_data/following_cleaned.parquet')
comments = dd.read_parquet('processed_data/comments_cleaned.parquet')
likes = dd.read_parquet('processed_data/likes_cleaned.parquet')
entries = dd.read_parquet('processed_data/entries_cleaned.parquet')

# Verify that all user IDs in interactions exist in the users DataFrame
def verify_user_ids(interaction_df, user_column_name):
    """Report IDs in an interaction column that are absent from ``users['ID']``.

    Prints the number of distinct values of ``interaction_df[user_column_name]``
    that do not occur in the module-level ``users`` frame and, when there are
    any, up to ten of them.

    Args:
        interaction_df: Dask DataFrame holding the interaction rows.
        user_column_name: Name of the column in ``interaction_df`` that
            carries user IDs.

    Returns:
        None. The findings are printed.
    """
    referenced = set(interaction_df[user_column_name].unique().compute())
    known = set(users['ID'].unique().compute())
    unknown = referenced.difference(known)

    print(f"Number of user IDs in '{user_column_name}' not in 'users': {len(unknown)}")
    if unknown:
        # Only a small sample is shown; set iteration order is arbitrary.
        print(f"Sample of missing user IDs: {list(unknown)[:10]}")

# Run verification on the user-ID column of each checked interaction table
for checked_frame, id_column in ((likes, 'userID'), (entries, 'PostedBy')):
    verify_user_ids(checked_frame, id_column)


Number of user IDs in 'userID' not in 'users': 0
Number of user IDs in 'PostedBy' not in 'users': 0


In [2]:
# Function to print shape of Dask DataFrames
def print_dask_shape(df, name):
    """Print ``"<name> shape: (<rows>, <cols>)"`` for a Dask DataFrame.

    Args:
        df: Frame whose ``shape[0]`` is a lazy value exposing ``.compute()``
            and whose ``columns`` supports ``len()``.
        name: Label printed in front of the shape.

    Returns:
        None. The shape is printed.
    """
    # The row count is lazy and must be computed; the column count is known up front.
    n_rows, n_cols = df.shape[0].compute(), len(df.columns)
    print(f"{name} shape: ({n_rows}, {n_cols})")

# Print shapes for each DataFrame, in the order they were loaded
for frame_label, loaded_frame in (
    ('users', users),
    ('subs', subs),
    ('following', following),
    ('comments', comments),
    ('likes', likes),
    ('entries', entries),
):
    print_dask_shape(loaded_frame, frame_label)


users shape: (645414, 4)
subs shape: (24761879, 2)
following shape: (18477145, 3)
comments shape: (2835155, 6)
likes shape: (427214, 3)
entries shape: (10954103, 5)


In [3]:
# Weight given to each edge according to the interaction that produced it
interaction_weights = dict(following=1, subs=1, like=2, comment=3)

# Directed graph with one vertex per user; the user IDs are passed to
# add_vertices as a list and later cells read them back through G.vs['name'].
G = ig.Graph(directed=True)
G.add_vertices(users['ID'].compute().tolist())
print(f"Graph initialized with {G.vcount()} nodes.")

# Helper function to add weighted edges
def add_weighted_edges(df, source_col, target_col, relationship_type):
    """Append one edge per row of ``df`` to the module-level graph ``G``.

    Every new edge gets a ``relationship`` attribute equal to
    ``relationship_type`` and a ``weight`` attribute taken from
    ``interaction_weights[relationship_type]``. A summary line is printed.

    Args:
        df: Dask DataFrame containing the endpoint columns.
        source_col: Column holding the source vertex of each edge.
        target_col: Column holding the target vertex of each edge.
        relationship_type: Key into ``interaction_weights``.

    Raises:
        KeyError: If ``relationship_type`` has no entry in
            ``interaction_weights`` (raised before any data is computed).
    """
    weight = interaction_weights[relationship_type]

    # Materialise only the two endpoint columns, as plain (source, target) tuples.
    endpoints = df[[source_col, target_col]].compute()
    edges = list(endpoints.itertuples(index=False, name=None))

    # Remember where the new edges start. Slicing with -len(edges) is wrong for
    # an empty batch: -0 == 0, so G.es[-0:] would select (and overwrite the
    # attributes of) every edge already in the graph.
    first_new_edge = G.ecount()
    if edges:
        G.add_edges(edges)
        new_edges = G.es[first_new_edge:]
        new_edges['relationship'] = relationship_type
        new_edges['weight'] = [weight] * len(edges)
    print(f"Added {len(edges)} '{relationship_type}' edges with weight {weight}. Total edges: {G.ecount()}")

# Add edges for 'following' and 'subs'
add_weighted_edges(following, 'FollowerID', 'FollowedID', 'following')
add_weighted_edges(subs, 'FollowerID', 'FollowedID', 'subs')


def _add_interaction_edges(edge_pairs, relationship_type):
    """Append already-built (source, target) pairs to ``G`` with their attributes.

    Sets ``relationship`` and ``weight`` on exactly the edges added by this
    call and prints a summary line.
    """
    weight = interaction_weights[relationship_type]
    # Slice from the pre-insert edge count: G.es[-len(edge_pairs):] would select
    # ALL edges when edge_pairs is empty, because -0 == 0.
    first_new_edge = G.ecount()
    if edge_pairs:
        G.add_edges(edge_pairs)
        new_edges = G.es[first_new_edge:]
        new_edges['relationship'] = relationship_type
        new_edges['weight'] = [weight] * len(edge_pairs)
    print(f"Added {len(edge_pairs)} '{relationship_type}' edges with weight {weight}. Total edges: {G.ecount()}")


# Materialise each needed column set once. Iterating the Dask frames row by row
# with iterrows() builds a Series per row and is far slower than zipping columns.
entry_cols = entries[['PostID', 'PostedBy']].compute()
comment_cols = comments[['PostID', 'EntryID', 'PostedBy']].compute()
like_cols = likes[['userID', 'PostID']].compute()

# Prepare mappings for post authors
post_author_entries = dict(zip(entry_cols['PostID'], entry_cols['PostedBy']))
post_author_comments = dict(zip(comment_cols['PostID'], comment_cols['PostedBy']))
# Comment authors override entry authors when a PostID occurs in both mappings.
post_author_mapping = {**post_author_entries, **post_author_comments}

# 'like' edges: liker -> author of the liked post; pairs with a None endpoint
# (e.g. a PostID with no known author) are dropped.
likes_edges = [
    (liker, author)
    for liker, author in zip(like_cols['userID'], map(post_author_mapping.get, like_cols['PostID']))
    if liker is not None and author is not None
]
_add_interaction_edges(likes_edges, 'like')

# Clear intermediate variables to free memory
del likes_edges, like_cols
gc.collect()

# 'comment' edges: commenter -> author of the entry being commented on;
# pairs with a None endpoint are dropped.
comments_edges = [
    (commenter, author)
    for commenter, author in zip(comment_cols['PostedBy'], map(post_author_entries.get, comment_cols['EntryID']))
    if commenter is not None and author is not None
]
_add_interaction_edges(comments_edges, 'comment')

# The pandas copies are no longer needed once the edges are in the graph.
del entry_cols, comment_cols

Graph initialized with 645414 nodes.
Added 18477145 'following' edges with weight 1. Total edges: 18477145
Added 24761879 'subs' edges with weight 1. Total edges: 43239024
Added 370687 'like' edges with weight 2. Total edges: 43609711
Added 2623974 'comment' edges with weight 3. Total edges: 46233685


In [4]:
# Remove self-loops (edges whose source and target are the same vertex).
# Graph.is_loop() with no arguments checks every edge in a single call, which
# avoids invoking a Python lambda once per edge on a graph with tens of
# millions of edges.
self_loop_edges = [edge_id for edge_id, is_loop in enumerate(G.is_loop()) if is_loop]
G.delete_edges(self_loop_edges)

print(f"Graph has {G.vcount()} nodes and {G.ecount()} edges after removing self-loops.")


Graph has 645414 nodes and 44671399 edges after removing self-loops.


In [5]:
## 3. Community Detection with Leiden Algorithm
# The Leiden optimiser is randomised, so an unseeded run can return a different
# partition (and different community counts/sizes) on every execution. Fixing
# the seed makes the results below reproducible under Restart & Run All.
LEIDEN_SEED = 42

# Run the Leiden algorithm, optimising modularity with the 'weight' edge attribute
partition = leidenalg.find_partition(
    G,
    leidenalg.ModularityVertexPartition,
    weights='weight',
    seed=LEIDEN_SEED,
)
print(f"Number of communities detected: {len(set(partition.membership))}")


Number of communities detected: 4768


In [6]:
# Modularity score - measure of the strength of community structure
# NOTE(review): in the recorded run this prints 0.4594 while quality() below
# prints 0.4927. Presumably this attribute does not take the 'weight' edge
# attribute into account whereas quality() does — TODO confirm which of the
# two figures should be reported.
modularity_score = partition.modularity
print(f"Modularity Score: {modularity_score:.4f}")

# Quality of the partition as reported by the partition object's own quality()
score = partition.quality()
print(f"Quality Score: {score:.4f}")

Modularity Score: 0.4594
Quality Score: 0.4927


In [7]:
# Look-up table: vertex name -> index of the community the partition assigned it to
community_mapping = {
    vertex_name: community
    for vertex_name, community in zip(G.vs['name'], partition.membership)
}

# Member count per community, then the ten most populous communities
community_sizes = Counter(partition.membership)
top_communities = community_sizes.most_common(10)
for comm_id, size in top_communities:
    print(f"Community {comm_id}: {size} nodes")

Community 0: 202355 nodes
Community 1: 69721 nodes
Community 2: 68536 nodes
Community 3: 53465 nodes
Community 4: 35667 nodes
Community 5: 33781 nodes
Community 6: 32778 nodes
Community 7: 29481 nodes
Community 8: 22117 nodes
Community 9: 19105 nodes


In [8]:
# Number of distinct communities: community_sizes has one key per community index
print(f"Length of the communities: {len(community_sizes)}")

Length of the communities: 4768


In [9]:
# Communities whose member count is below 2
small_community_sizes = [size for size in community_sizes.values() if size < 2]
count_small_communities = len(small_community_sizes)

print(f"Number of communities with fewer than 2 nodes: {count_small_communities}")


Number of communities with fewer than 2 nodes: 2045


In [17]:
# How many of the highest-degree nodes to report. The original labels said
# "top 10" while the slice actually took 100; the count now lives in one place
# and the printed labels are derived from it.
TOP_N = 100

# (vertex index, degree) for every vertex. G.degree() with no arguments returns
# the degrees of all vertices in one call instead of one call per vertex.
node_degrees = list(enumerate(G.degree()))

# Sort nodes by degree in descending order and keep the first TOP_N
top_nodes = sorted(node_degrees, key=lambda pair: pair[1], reverse=True)[:TOP_N]
top_10_nodes = top_nodes  # legacy name, kept in case another cell still refers to it

# Name and community of each selected node
top_nodes_info = []
for node_id, _ in top_nodes:
    node_name = G.vs[node_id]['name']
    top_nodes_info.append((str(node_name), community_mapping[node_name]))
top_node_names, top_nodes_communities = zip(*top_nodes_info)

# Print the results
print(f"Top {len(top_nodes)} nodes by degree: {top_node_names}")
print(f"Communities for top {len(top_nodes)} nodes: {top_nodes_communities}")
print(f"top node value count {Counter(top_nodes_communities)}")

Top 10 nodes by degree: ('br3adman', 'malikimrana1', 'sonergonul', 'phasuk111', 'omarabid', 'xahac', 'nickcodipietro', 'alpb', 'mashable', 'kimkardashian', 'kmorrison', 'uforella', 'foreveradog', 'seoptimise', 'jsin', 'gustavogualbert', 'donkayvan', 'theguruseye', 'qwreck', 'farcecars', 'leolaporte', 'elena170364', 'courtneyengle', 'josemarbessa', 'bejoyru', 'spaceastronautics', 'barackobama', 'johnarcews', 'carrolltrust', 'ludwikc', 'garinkilpatrick', 'billromanos', 'donfuxx', 'ridwan2906', 'tavria', 'robangeles', 'webbizplan', 'streetmachine', 'derekhaines', 'puffaddering', 'kevinrose', 'profitbaron', 'freelance2day', 'mbl', 'micheleficara', 'veronicabelmont', 'toddhuff1234', 'denverflower', 'chrisvoss', 'livecrunch', 'jasoncalacanis', 'typojar', 'stejules', 'denversolarguy', 'dinovedo', 'envisiongroup', 'technotodd', 'webmelhor', 'b2b', 'techwall', 'lightandcomposition', 'miapo', 'aemailmarketing', 'whizbuzz1', 'steffanantonas', 'revtrev', 'dindasheeva', 'danschawbel', 'branislavchr

In [18]:
# Find all community IDs with only one node
single_user_communities = [comm_id for comm_id, size in community_sizes.items() if size == 1]
print(f"Total single-user communities: {len(single_user_communities)}")

# List of user IDs in single-user communities.
# Membership is tested against a set: testing against the list costs a linear
# scan of every singleton community for every vertex in the graph.
single_user_community_ids = set(single_user_communities)
single_user_ids = [
    vertex_name
    for vertex_name, comm_id in zip(G.vs['name'], partition.membership)
    if comm_id in single_user_community_ids
]

# Print or examine the IDs
print(f"User IDs in single-user communities: {single_user_ids}")
print(f"Total user IDs in single-user communities: {len(single_user_ids)}")



Total single-user communities: 2045
User IDs in single-user communities: ['drmnmlk', 'dudude', 'ebuu', 'elaryan', 'erenhatirnaz', 'fadetobright', 'ferideceylan', 'fmgraphics', 'forget', 'gbirkett', 'gemma', 'gmania', 'incesticide', 'informedidiot', 'irishsideofmoon', 'ivancokrajcovic', 'jack100', 'jcporter1', 'jodyengland1', 'jordi4', 'josephdsmithjr', 'juliecarr', 'earfun', 'hermielazaro', 'kamraj', 'hoboot1', 'kassyjohnson', 'katapult', 'katetanita', 'kbipropertiesllc', 'kernal', 'korhansonmezsoy', 'lazarustaylor', 'linkster', 'lizardqueen', 'lmg', 'lydiawee', 'koutheir', 'mahooch', 'marianatalavera', 'marinutri', 'markchew', 'mazdak1', 'mdryan', 'millenareis', 'miptalk', 'davideridolfi', 'azizfoladvand', 'laroussibenyacoub', 'mojoyugen', 'mustyrhymer', 'ncollette', 'no2coldcalling', 'omargatti', 'paisa', 'panzullo', 'paolacaggia', 'paulmenard', 'pcextra', 'pclifton', 'piccoloimprenditore', 'pierremarques', 'pino', 'purna', 'caxonturse', 'baykiz', 'raissazhou', 'rasoul', 'bilallekesi

In [19]:
# The Dask frames (users, subs, following, comments, likes, entries) and the
# `dd` alias are defined by the first cell and are still in scope at this point,
# so they are reused here instead of being imported and read from disk again.

def analyze_user(user_id):
    """Print a per-dataset activity summary for one user.

    Looks ``user_id`` up in the module-level ``users`` frame. When no row
    matches, a not-found message is printed and the function returns.
    Otherwise it prints the user's row, the number of ``subs`` and
    ``following`` rows in each direction, and the number of comments, likes
    and posts attributed to the user, each followed by ``head()`` of the
    matching rows when there is at least one.

    Args:
        user_id: Value compared for equality against the ID columns of the
            module-level Dask frames.

    Returns:
        None. Everything is printed.
    """
    def rows_where(frame, column):
        # Filter lazily on equality with user_id, then materialise the matches.
        return frame[frame[column] == user_id].compute()

    def report_activity(matches, total_line, sample_heading, sample_columns):
        # Total first, then an optional sample, then a blank separator.
        print(total_line)
        if len(matches) > 0:
            print(sample_heading)
            print(matches[sample_columns].head())
        print("\n")

    # 1. User Information
    profile = rows_where(users, 'ID')
    if profile.empty:
        print(f"User ID {user_id} not found in the users dataset.")
        return

    print("User Information:")
    print(profile)
    print("\n")

    # 2. Subscriptions: rows where the user is the follower / the followed
    subs_outgoing = rows_where(subs, 'FollowerID')
    subs_incoming = rows_where(subs, 'FollowedID')
    print(f"Subscriptions where user is following others: {len(subs_outgoing)}")
    print(f"Subscriptions where others are following the user: {len(subs_incoming)}")
    print("\n")

    # 3. Following relationships, same two directions
    following_outgoing = rows_where(following, 'FollowerID')
    following_incoming = rows_where(following, 'FollowedID')
    print(f"Following relationships where user follows others: {len(following_outgoing)}")
    print(f"Following relationships where others follow the user: {len(following_incoming)}")
    print("\n")

    # 4. Comments written by the user
    authored_comments = rows_where(comments, 'PostedBy')
    report_activity(
        authored_comments,
        f"Total comments made by user: {len(authored_comments)}",
        "Sample comments:",
        ['PostID', 'Text', 'Timestamp'],
    )

    # 5. Likes given by the user
    given_likes = rows_where(likes, 'userID')
    report_activity(
        given_likes,
        f"Total likes by user: {len(given_likes)}",
        "Sample likes:",
        ['PostID', 'Timestamp'],
    )

    # 6. Entries (posts) written by the user
    authored_posts = rows_where(entries, 'PostedBy')
    report_activity(
        authored_posts,
        f"Total posts made by user: {len(authored_posts)}",
        "Sample posts:",
        ['PostID', 'Text', 'Timestamp'],
    )

    # Free up memory held by the materialised frames once they go out of use
    gc.collect()

In [21]:
# Example usage:
# 'dudude' is one of the IDs printed above as belonging to a single-user
# community; substitute any other user ID to inspect that user instead.
analyze_user(user_id='dudude')

User Information:
         ID  Type      Name     Description
342  dudude  user  terry007  No Description


Subscriptions where user is following others: 0
Subscriptions where others are following the user: 0


Following relationships where user follows others: 0
Following relationships where others follow the user: 0


Total comments made by user: 0


Total likes by user: 0


Total posts made by user: 0




In [25]:
single_user_ids_list = list(single_user_ids)


def _rows_for_single_users(frame, column):
    """Lazily keep the rows of ``frame`` whose ``column`` is in ``single_user_ids_list``."""
    return frame[frame[column].isin(single_user_ids_list)]


def _row_total(frame):
    """Compute and return the number of rows in a (lazy) frame."""
    return frame.shape[0].compute()


# Subscriptions, in both directions
user_subs_following = _rows_for_single_users(subs, 'FollowerID')
user_subs_followed_by = _rows_for_single_users(subs, 'FollowedID')
total_subs_following = _row_total(user_subs_following)
total_subs_followed_by = _row_total(user_subs_followed_by)

# Following relationships, in both directions
user_following = _rows_for_single_users(following, 'FollowerID')
user_followed_by = _rows_for_single_users(following, 'FollowedID')
total_following = _row_total(user_following)
total_followed_by = _row_total(user_followed_by)

# Comments written by these users
user_comments = _rows_for_single_users(comments, 'PostedBy')
total_comments = _row_total(user_comments)

# Likes given by these users
user_likes = _rows_for_single_users(likes, 'userID')
total_likes = _row_total(user_likes)

# Entries (posts) written by these users
user_posts = _rows_for_single_users(entries, 'PostedBy')
total_posts = _row_total(user_posts)

# Print total counts
print(f"Total subscriptions where user is following others: {total_subs_following}")
print(f"Total subscriptions where others are following the user: {total_subs_followed_by}")
print(f"Total following relationships where user follows others: {total_following}")
print(f"Total following relationships where others follow the user: {total_followed_by}")
print(f"Total comments made by users: {total_comments}")
print(f"Total likes by users: {total_likes}")
print(f"Total posts made by users: {total_posts}")


Total subscriptions where user is following others: 0
Total subscriptions where others are following the user: 0
Total following relationships where user follows others: 0
Total following relationships where others follow the user: 0
Total comments made by users: 678
Total likes by users: 3
Total posts made by users: 8714


In [26]:

## 5. Final Notes and Cleanup
# Free up memory
# Drop the notebook's references to the six Dask frames, then trigger a
# garbage-collection pass. Cells that use these names cannot be re-run after
# this point without reloading the data.
del users, subs, following, comments, likes, entries
gc.collect()

# Display execution time
# Elapsed wall-clock time since start_time was recorded in the first cell,
# split into whole minutes and remaining seconds.
end_time = time.time()
total_seconds = end_time - start_time
minutes, seconds = divmod(total_seconds, 60)
print(f"Execution time: {int(minutes)} minutes and {seconds:.2f} seconds")

Execution time: 26 minutes and 49.72 seconds
