In [None]:
import praw
import config
import pandas as pd


# Initialize the Reddit API client from credentials kept in the local `config` module.
reddit = praw.Reddit(client_id=config.client_id,
                     client_secret=config.client_secret,
                     user_agent=config.user_agent)

# Subreddits to sample.
subreddit_names = ['OnePiece', 'Naruto', 'dbz', 'OnePunchMan', 'ShingekiNoKyojin', 'JuJutsuKaisen','BokuNoHeroAcademia','HunterXHunter','Berserk','sololeveling','bleach','BlackClover','SpyxFamily','ChainsawMan','deathnote']

# Number of "hot" posts fetched per subreddit; adjust as needed.
POSTS_PER_SUBREDDIT = 25

# Placeholder used when a post or comment no longer has an author.
DELETED_USER = 'deleted user'


def author_name(item):
    """Return the username of a post/comment author, or DELETED_USER when the author is missing."""
    return item.author.name if item.author else DELETED_USER


# One record per comment (at any depth); the DataFrame is built once at the
# end instead of being grown row by row.
records = []
for subreddit_name in subreddit_names:
    subreddit = reddit.subreddit(subreddit_name)
    for post in subreddit.hot(limit=POSTS_PER_SUBREDDIT):
        post_username = author_name(post)

        # limit=0 discards the unexpanded "load more comments" placeholders
        # (it does not fetch them), so everything iterated below is a real
        # comment exposing `author` and `replies`.
        post.comments.replace_more(limit=0)
        for comment in post.comments.list():
            reply_authors = [author_name(reply) for reply in comment.replies]
            records.append({
                'Subreddit': subreddit_name,
                'Title': post.title,
                'PostAuthor': post_username,
                # A single pair: [comment author, [direct reply authors]].
                'CommentAuthors': [author_name(comment), reply_authors],
            })

# Represent the scraped data in a DataFrame.
df = pd.DataFrame(records, columns=['Subreddit', 'Title', 'PostAuthor', 'CommentAuthors'])

print(df.shape)
print(df)
df.to_csv('exported_dataframe.csv', index=False, encoding='utf-8')


In [None]:
from collections import defaultdict

# Per-user tally: number of appearances and the set of subreddits they appeared in.
user_subreddits = defaultdict(lambda: {'count': 0, 'subreddits': set()})


def record_appearance(user, subreddit_name):
    """Count one appearance of `user` in `subreddit_name`."""
    tally = user_subreddits[user]
    tally['count'] += 1
    tally['subreddits'].add(subreddit_name)


# NOTE(review): rows are per comment, so a post author's count grows with the
# number of comments on their post, and a reply that also has its own row is
# counted twice — confirm this is the intended meaning of 'Occurrences'.
for sub_name, poster, thread in zip(df['Subreddit'], df['PostAuthor'], df['CommentAuthors']):
    record_appearance(poster, sub_name)

    # Each CommentAuthors cell holds ONE pair, [comment_author, [reply_author, ...]],
    # not a list of such pairs; skip anything that does not have that shape.
    if not (isinstance(thread, list) and thread):
        continue

    record_appearance(thread[0], sub_name)

    reply_authors = thread[1] if len(thread) > 1 and isinstance(thread[1], list) else []
    for reply_author in reply_authors:
        record_appearance(reply_author, sub_name)

# Keep only users seen in more than one subreddit. The subreddit names are
# sorted so the exported string is identical from run to run.
users_data = [
    [user, tally['count'], ", ".join(sorted(tally['subreddits']))]
    for user, tally in user_subreddits.items()
    if len(tally['subreddits']) > 1
]

df_users_in_multiple_subreddits = pd.DataFrame(users_data, columns=['User', 'Occurrences', 'Subreddits'])

# Drop the moderation bot and the placeholder used for deleted accounts;
# neither represents a real cross-subreddit user.
df_users_in_multiple_subreddits = df_users_in_multiple_subreddits.loc[~df_users_in_multiple_subreddits['User'].isin(['AutoModerator', 'deleted user'])]

print(df_users_in_multiple_subreddits)
df_users_in_multiple_subreddits.to_csv('optimized_new.csv', index=False, encoding='utf-8')

In [None]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

# Reload the cross-subreddit users exported to 'optimized_new.csv'
# (columns: User, Occurrences, Subreddits, where Subreddits is a ", "-joined string).
# Usernames are read strictly as text: with pandas' defaults a handle such as
# "null", "NA" or "nan" would be turned into a missing value.
df_users = pd.read_csv(
    'optimized_new.csv',
    dtype={'User': str, 'Subreddits': str},
    keep_default_na=False,
)

# Undirected graph whose nodes are users.
G = nx.Graph()

# Colour assigned to each subreddit, used to colour the edges.
subreddit_colors = {
    'OnePiece': '#FF6347',  # Tomato
    'Naruto': '#FFD700',  # Gold
    'dbz': '#FF4500',  # OrangeRed
    'OnePunchMan': '#DA70D6',  # Orchid
    'ShingekiNoKyojin': '#2E8B57',  # SeaGreen
    'JuJutsuKaisen': '#6A5ACD',  # SlateBlue
    'BokuNoHeroAcademia': '#20B2AA',  # LightSeaGreen
    'HunterXHunter': '#DB7093',  # PaleVioletRed
    'Berserk': '#8B0000',  # DarkRed
    'sololeveling': '#C71585',  # MediumVioletRed
    'bleach': '#1E90FF',  # DodgerBlue
    'BlackClover': '#3CB371',  # MediumSeaGreen
    'SpyxFamily': '#FFA07A',  # LightSalmon
    'ChainsawMan': '#D2691E',  # Chocolate
    'deathnote': '#696969',  # DimGray
}

# Colour used for an edge whose subreddit has no entry in `subreddit_colors`.
FALLBACK_EDGE_COLOR = '#A9A9A9'  # DarkGray

# Parse every user's subreddit list once: (user, original ", "-joined string, set of names).
memberships = [
    (user, joined, set(joined.split(', ')))
    for user, joined in zip(df_users['User'], df_users['Subreddits'])
]

# Add nodes with attributes; 'subreddit' keeps the joined string as read from the CSV.
for user, joined, _ in memberships:
    G.add_node(user, type='user', subreddit=joined)

# Add one edge per pair of users that share at least one subreddit.
# G is a simple Graph, so a pair can carry only one edge: its colour/label come
# from the alphabetically first shared subreddit (deterministic), and the full
# list of shared subreddits is kept in 'shared_subreddits'.
for position, (user_i, _, subreddits_i) in enumerate(memberships):
    for user_j, _, subreddits_j in memberships[position + 1:]:
        shared_subreddits = sorted(subreddits_i & subreddits_j)
        if not shared_subreddits:
            continue
        primary = shared_subreddits[0]
        G.add_edge(
            user_i,
            user_j,
            color=subreddit_colors.get(primary, FALLBACK_EDGE_COLOR),
            label=primary,
            shared_subreddits=shared_subreddits,
        )


# Position the nodes of G with a spring layout; the fixed seed makes the
# resulting picture the same on every run.
pos = nx.spring_layout(G, seed=109)

# Open a large canvas for the drawing (adjust the size to your preference).
plt.figure(figsize=(50, 50))

# Nodes are drawn as black dots.
nx.draw_networkx_nodes(G, pos, node_color='black', node_size=150)

# Edges are drawn in the colour stored in each edge's 'color' attribute.
edges = G.edges(data=True)
edge_colors = [attributes['color'] for _, _, attributes in edges]
nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=2)

# Hide the axes and render the figure.
plt.axis('off')
plt.show()


In [None]:
# Report the size of the graph: how many nodes and how many edges it holds.
number_of_nodes, number_of_edges = G.number_of_nodes(), G.number_of_edges()

print(f"Total Nodes in the Graph: {number_of_nodes}")
print(f"Total Edges in the Graph: {number_of_edges}")

In [None]:
# Compute the three per-node metrics up front; each is a dict keyed by node.
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
clustering_coefficient = nx.clustering(G)

# Histogram of the degree-centrality values across all nodes.
plt.hist(list(degree_centrality.values()), bins=10, edgecolor='black')
plt.title('Degree Centrality Histogram')
plt.xlabel('Degree Centrality')
plt.ylabel('Frequency')
plt.show()

# Print the raw per-node values of the other two metrics.
print("Betweenness Centrality:", betweenness_centrality)
print("Clustering Coefficient:", clustering_coefficient)

In [None]:
# Table of degree centrality per node, together with the node's 'subreddit'
# attribute, ordered from highest to lowest centrality.
# The frame is built in one shot from a list of records; the stable sort keeps
# tied nodes in graph order so the reported top user does not vary between runs.
degree_centrality_df = pd.DataFrame(
    [
        {'Node': node, 'Degree Centrality': centrality, 'Subreddit': G.nodes[node]['subreddit']}
        for node, centrality in degree_centrality.items()
    ],
    columns=['Node', 'Degree Centrality', 'Subreddit'],
).sort_values(by='Degree Centrality', ascending=False, kind='mergesort')

# Print the most active user based on degree centrality (first row after sorting).
top_degree_row = degree_centrality_df.iloc[0]
print("Most active user based on degree centrality:", top_degree_row['Node'])
print("Degree centrality:", top_degree_row['Degree Centrality'])
print("Subreddit:", top_degree_row['Subreddit'])

In [None]:
# Table of betweenness centrality per node, together with the node's
# 'subreddit' attribute, ordered from highest to lowest centrality.
# The frame is built in one shot from a list of records; the stable sort keeps
# tied nodes in graph order so the reported top user does not vary between runs.
betweenness_centrality_df = pd.DataFrame(
    [
        {'Node': node, 'Subreddit': G.nodes[node]['subreddit'], 'Betweenness Centrality': centrality}
        for node, centrality in betweenness_centrality.items()
    ],
    columns=['Node', 'Subreddit', 'Betweenness Centrality'],
).sort_values(by='Betweenness Centrality', ascending=False, kind='mergesort')

# Print the user with the highest betweenness centrality (first row after sorting).
top_betweenness_row = betweenness_centrality_df.iloc[0]
print("User with the highest betweenness centrality:", top_betweenness_row['Node'])
print("Betweenness centrality:", top_betweenness_row['Betweenness Centrality'])
print("Subreddit:", top_betweenness_row['Subreddit'])


In [None]:
# Table of clustering coefficient per node, together with the node's
# 'subreddit' attribute, ordered from highest to lowest coefficient.
# The frame is built in one shot from a list of records; the stable sort keeps
# tied nodes in graph order so the reported top user does not vary between runs.
clustering_coefficient_df = pd.DataFrame(
    [
        {'Node': node, 'Clustering Coefficient': coefficient, 'Subreddit': G.nodes[node]['subreddit']}
        for node, coefficient in clustering_coefficient.items()
    ],
    columns=['Node', 'Clustering Coefficient', 'Subreddit'],
).sort_values(by='Clustering Coefficient', ascending=False, kind='mergesort')

# Print the user with the highest clustering coefficient (first row after sorting).
top_clustering_row = clustering_coefficient_df.iloc[0]
print("User with the highest clustering coefficient:", top_clustering_row['Node'])
print("Clustering coefficient:",top_clustering_row['Clustering Coefficient'])
print("Subreddit: ",top_clustering_row['Subreddit'])


In [None]:
print("Based on all the generated metrics, we get to identify the subreddits with the most popularity, impact and users who are well connected with other subreddits.")
print("Publicizing anything in these subreddits, has better reach than the ones analyzed by us.")

# Subreddits of the top-ranked user in each metric table; the 'Subreddit'
# column holds a ", "-joined string, so split it back into individual names.
degree_subreddits = degree_centrality_df.iloc[0]['Subreddit'].split(', ')
betweenness_subreddits = betweenness_centrality_df.iloc[0]['Subreddit'].split(', ')
clustering_subreddits = clustering_coefficient_df.iloc[0]['Subreddit'].split(', ')

# Merge all subreddits and remove duplicates; sorting gives a stable,
# reproducible order (a bare set has no guaranteed order).
all_subreddits = sorted(set(degree_subreddits + betweenness_subreddits + clustering_subreddits))

# Print the merged list of subreddits
print("Merged list of subreddits:", all_subreddits)