# In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms.community import greedy_modularity_communities

# Load the dataset.
file_path = "soc-redditHyperlinks-body.tsv"
# Work on the first 10,000 rows only while testing. `nrows` stops parsing after
# that many rows instead of reading the whole file and then truncating it.
df = pd.read_csv(file_path, sep='\t', nrows=10000)

# Build a directed graph from the SOURCE_SUBREDDIT -> TARGET_SUBREDDIT columns.
# A DiGraph keeps a single edge per (source, target) pair, so repeated
# hyperlinks between the same two subreddits collapse into one edge.
G = nx.from_pandas_edgelist(df, 'SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', create_using=nx.DiGraph())

# Centrality measures.
# Both functions are called through the `nx` namespace: the visible imports do
# not bind a standalone `centrality` name, so `centrality.<fn>` would raise
# NameError.
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
# Print both centrality scores for every node in the graph.
for node in G.nodes():
    print(f"Node: {node}, Degree Centrality: {degree_centrality[node]}, Closeness Centrality: {closeness_centrality[node]}")

# Influence analysis: score every subreddit with PageRank, then order the
# subreddit names from highest to lowest score.
pagerank_scores = nx.pagerank(G)
influential_subreddits = sorted(
    pagerank_scores,
    key=lambda name: pagerank_scores[name],
    reverse=True,
)
# Report the ten highest-scoring subreddits, best first.
print("Top Influential Subreddits:")
for name in influential_subreddits[:10]:
    score = pagerank_scores[name]
    print(name, score)

# Detect communities with greedy modularity maximisation
# (`greedy_modularity_communities`; this is not the Louvain method).
communities = list(greedy_modularity_communities(G))
# Print the number of communities and how many subreddits each one contains.
print(f"Number of communities: {len(communities)}")
for i, community in enumerate(communities):
    print(f"Community {i + 1}: {len(community)} subreddits")
# Visualize the communities.
pos = nx.spring_layout(G)  # use a different layout algorithm based on preference
# nx.draw pairs node_color entries with nodes in G.nodes() order, so look up
# each node's community index instead of listing colors in community order.
community_of = {node: idx for idx, members in enumerate(communities) for node in members}
colors = [community_of[node] for node in G.nodes()]
plt.figure(figsize=(10, 8))
nx.draw(G, pos, node_color=colors, with_labels=True, cmap=plt.cm.tab20, font_size=8)
plt.title("Subreddit Community Detection")
plt.show()

# Weight each edge by how many hyperlink rows in `df` share its
# (source, target) pair, so edge thickness reflects how much content flows
# along it. Edges with no entry in the count table fall back to a weight of 1.
link_counts = df.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).size().to_dict()
edge_weights = {(source, target): link_counts.get((source, target), 1) for source, target in G.edges()}
# Map weights linearly onto line widths: weight 1 -> width 1, the largest
# weight -> width 5. If every edge has weight 1, every width stays at 1.
max_weight = max(edge_weights.values(), default=1)
weight_span = max(max_weight - 1, 1)  # avoid dividing by zero when all weights are 1
edge_widths = [1 + 4 * (weight - 1) / weight_span for weight in edge_weights.values()]
# Visualize the flow of information through edge thickness.
plt.figure(figsize=(12, 10))
pos = nx.spring_layout(G)
# edge_color is a single named color, so no edge colormap is passed: a colormap
# only applies when edge_color is a sequence of numbers.
nx.draw(G, pos, with_labels=True, font_size=8, node_color='skyblue', node_size=1000,
        edgelist=list(edge_weights), width=edge_widths, edge_color='gray')
plt.title("Content Propagation Analysis")
plt.show()

# ============ CONTENT ANALYSIS=================
# Coerce 'POST_ID' to numbers; values that cannot be parsed become NaN.
# NOTE(review): if POST_ID holds alphanumeric IDs, most rows would become NaN
# here and be dropped below — confirm against the data.
numeric_post_ids = pd.to_numeric(df['POST_ID'], errors='coerce')
df['POST_ID'] = numeric_post_ids
# Keep only the rows whose 'POST_ID' parsed successfully.
df = df.dropna(subset=['POST_ID'])
# Select up to 1000 rows with the largest 'POST_ID' values.
top_posts = df.nlargest(1000, 'POST_ID')
# Print every selected post, one labelled field per line, then a separator.
report_fields = (
    ("Post ID", 'POST_ID'),
    ("Source Subreddit", 'SOURCE_SUBREDDIT'),
    ("Target Subreddit", 'TARGET_SUBREDDIT'),
    ("Timestamp", 'TIMESTAMP'),
    ("Link Sentiment", 'LINK_SENTIMENT'),
    ("Properties", 'PROPERTIES'),
)
separator = "=" * 50
for _, row in top_posts.iterrows():
    for label, column in report_fields:
        print(f"{label}: {row[column]}")
    print(separator)