# Step 1: instantiate

In [1]:
import praw
from prawcore.exceptions import Forbidden
import numpy as np
import pandas as pd
from pyvis.network import Network
import networkx as nx
import matplotlib.pyplot as plt
from decouple import config

sub_of_interest = "TwoXChromosomes"

un = config('USER')
pw = config("PW")
client_id = config("CLIENT_ID")
secret = config("SECRET")

reddit = praw.Reddit(
    username=un,
    password=pw,
    client_secret=secret,
    client_id=client_id,
    user_agent='network_map'
)

print(reddit.user.me())

mrkillercow


# Step 2: Define functions

## Step 2.1: define function to obtain recent redditors of a subreddit

## Step 2.2: define fucntion to obtain recent subreddits to which a redditor has posted to

In [2]:
def get_users(sub=sub_of_interest, post_count=8, comment_count=12):
    """
    Finds the most recently active users in a subreddit. It's easier to comment than to post on a subreddit, so more comments are collected.
    :param subreddit: subreddit to crawl
    :param posts_count: number of posts to search for
    :param comments_count: number of comments to search for
    :return: List of usernames who were recently active in a subreddit
    """
    # === Obtain list of posters and commentors ===
    posters = []
    
    try:
        for s in reddit.subreddit(sub).new(limit=post_count):
            if s.author.name != "AutoModerator":
                posters.append(s.author.name)
            elif type(s.author.name) == None:
                pass
    except Forbidden:
        print("Tried to access Forbidden")
        #pass
    
    """posters = [
        s.author.name                                         #<- Redditor who posted
        for s in reddit.subreddit(sub).new(limit=post_count)  #<- from most recent posts
        if s.author.name != "AutoModerator"                   #<- remove auto moderator
    ]"""
    
    
    commentors = [] 
    
    try: 
        # --- Same with commentors ---
        for c in reddit.subreddit(sub).comments(limit=comment_count):
            if c.author.name != "AutoModerator":
                commentors.append(c.author.name)
            elif type(c.author.name) == None:
                pass
    except Forbidden:
        print("Tried to access Forbidden")
        #pass
    """
    commenters = [
        c.author.name
        for c in reddit.subreddit(sub).comments(limit=comment_count)  
        if c.author.name != "AutoModerator"
    ]"""
    
    # === Combine the posters and commentors ===
    redditors = posters + commentors    
    # --- Remove duplicates
    unique_redditors = list(set(redditors))
    
    return unique_redditors

In [3]:
def get_redditor_activity(subreddit, post_count=8, comment_count=12, filter_soi=True):
    """
    Get names of subreddits to which users have posted to.
    :param redditor_list: list of redditor usernames
    :param lim: number of posts to limit to; default 12
    """
    
    # === Define lists to store subreddits where redditor has posted and commented to ===
    posts_subreddits = []
    comments_subreddits = []
    
    # === 
    # Iterate over list of redditors and store the names of the subreddits where they posted and store 
    # them in the list to a certain length 
    # ===
    
    redditor_list = get_users(subreddit, post_count, comment_count)
    
    # -- Iterate over list of redditors --
    for r in redditor_list:

        posts = reddit.redditor(r).submissions.new(limit=post_count) #<- new submissions from redditors
        comments = reddit.redditor(r).comments.new(limit=comment_count)
        
        try:        
            # - Iterate over posts/submissions -
            for submission in posts:
                # Skip if they posted in the subreddit that we're scraping, otherwise store to list
                if submission.subreddit.display_name == sub_of_interest:
                    if filter_soi is True:
                        pass
                    else:
                        posts_subreddits.append(
                            {
                                "source": subreddit,
                                "target": submission.subreddit.display_name,
                                "type": "submission",
                                'weight': 3
                            }
                        )

                elif submission.subreddit.display_name != sub_of_interest:
                    posts_subreddits.append(
                        {
                            "source": subreddit,
                            "target": submission.subreddit.display_name,
                            "type": "submission",
                            'weight': 3                        
                        }
                    )

            for comment in comments:
                if comment.subreddit.display_name == sub_of_interest:
                    if filter_soi is True:
                        pass
                    else:
                        comments_subreddits.append(
                                {
                                    "source": subreddit,
                                    "target": comment.subreddit.display_name,
                                    "type": "comment",
                                    'weight': 1
                                }
                        )
                elif comment.subreddit.display_name != sub_of_interest:
                        comments_subreddits.append(
                            {
                                "source": subreddit,
                                "target": comment.subreddit.display_name,
                                "type": "comment",
                                'weight': 1
                            }
                        )
        except Forbidden:
            print("Forbidden")
            pass
    
    #activity_list = posts_subreddits + comments_subreddits
    
    return posts_subreddits + comments_subreddits


def remove_self_loops(dataframe, column_1, column_2):
    
    mask = dataframe.apply(lambda x: x[column_1] in x[column_2], axis=1)
    
    df = dataframe[~mask]
    
    return df

# Step 3: get redditors from the sub of interest and their recently active subreddits

In [4]:
primary_activity = get_redditor_activity(sub_of_interest, post_count=7, comment_count=10, filter_soi=True)

print(primary_activity[:5])

[{'source': 'TwoXChromosomes', 'target': 'childfree', 'type': 'submission', 'weight': 3}, {'source': 'TwoXChromosomes', 'target': 'MealPrepSunday', 'type': 'submission', 'weight': 3}, {'source': 'TwoXChromosomes', 'target': 'androidroot', 'type': 'submission', 'weight': 3}, {'source': 'TwoXChromosomes', 'target': 'AnimalsBeingDerps', 'type': 'submission', 'weight': 3}, {'source': 'TwoXChromosomes', 'target': 'Architects', 'type': 'submission', 'weight': 3}]


# Step 4: Organize the edge list into a dataframe

In [5]:
df_primary = pd.DataFrame(
    data=primary_activity
)

df_primary = remove_self_loops(df_primary, "source", "target")

display(df_primary)
print(df_primary.shape)

Unnamed: 0,source,target,type,weight
0,TwoXChromosomes,childfree,submission,3
1,TwoXChromosomes,MealPrepSunday,submission,3
2,TwoXChromosomes,androidroot,submission,3
3,TwoXChromosomes,AnimalsBeingDerps,submission,3
4,TwoXChromosomes,Architects,submission,3
...,...,...,...,...
186,TwoXChromosomes,relationship_advice,comment,1
187,TwoXChromosomes,AskReddit,comment,1
188,TwoXChromosomes,childfree,comment,1
189,TwoXChromosomes,relationship_advice,comment,1


(191, 4)


# Step 5: iterate over the primary networks to get secondary activity edge list

In [6]:
df_secondary = pd.DataFrame()

primary_target = list(df_primary.target.unique())

for subreddit in primary_target:
    secondary_activity = get_redditor_activity(subreddit, post_count=2, comment_count=5, filter_soi=False)

    df_secondary = df_secondary.append(secondary_activity, ignore_index=True)
    
df_secondary = remove_self_loops(df_secondary, 'source', 'target')
    
display(df_secondary.head(25))

Unnamed: 0,source,target,type,weight
1,childfree,infj,submission,3
2,childfree,singlengenuinelyhappy,submission,3
3,childfree,antinatalism,submission,3
4,childfree,thesopranos,submission,3
5,childfree,thesopranos,submission,3
7,childfree,relationships,submission,3
8,childfree,trichotillomania,submission,3
11,childfree,youtube,submission,3
13,childfree,britishproblems,comment,1
14,childfree,lostgeneration,comment,1


# Step 6: concatenate the primary and secondary edge list into a single large dataframe

In [7]:
df = pd.concat([df_primary, df_secondary], ignore_index=True)

display(df.head())
display(df.tail())
print(df.shape)

Unnamed: 0,source,target,type,weight
0,TwoXChromosomes,childfree,submission,3
1,TwoXChromosomes,MealPrepSunday,submission,3
2,TwoXChromosomes,androidroot,submission,3
3,TwoXChromosomes,AnimalsBeingDerps,submission,3
4,TwoXChromosomes,Architects,submission,3


Unnamed: 0,source,target,type,weight
2944,sweden,MyTeam,comment,1
2945,sweden,China,comment,1
2946,sweden,battlefield2042,comment,1
2947,sweden,HellLetLoose,comment,1
2948,sweden,Unexpected,comment,1


(2949, 4)


# Step 7: remove self-loops

In [8]:
df.to_csv(f"reddit_activity_{sub_of_interest}.csv", mode='a', header=False, index=False)

# Step 8: combine the weights for each edge combination

In [9]:
d = {'weight':'sum', 'type': 'first'}
df = df.groupby(
    ['source','target'], 
    sort=False, 
    as_index=False
).agg(d).reindex(columns=df.columns)

display(df.head(25))

Unnamed: 0,source,target,type,weight
0,TwoXChromosomes,childfree,submission,17
1,TwoXChromosomes,MealPrepSunday,submission,3
2,TwoXChromosomes,androidroot,submission,3
3,TwoXChromosomes,AnimalsBeingDerps,submission,3
4,TwoXChromosomes,Architects,submission,3
5,TwoXChromosomes,ShittyVeganFoodPorn,submission,3
6,TwoXChromosomes,curlyhair,submission,5
7,TwoXChromosomes,harrypotter,submission,3
8,TwoXChromosomes,AnimalCrossing,submission,3
9,TwoXChromosomes,crochet,submission,3


# Step 7: define nodes and edges

In [10]:
s = df.source.unique().tolist()
t = df.target.unique().tolist()
n = list(set(s+t))

nodes = {v: k for k, v in enumerate(n, 0)}

df['source_num'] = df['source'].map(nodes)
df['target_num'] = df['target'].map(nodes)

display(df.head(30))
#edge_list = [(s, t, w) for s, t, w in zip()]

Unnamed: 0,source,target,type,weight,source_num,target_num
0,TwoXChromosomes,childfree,submission,17,246,1006
1,TwoXChromosomes,MealPrepSunday,submission,3,246,778
2,TwoXChromosomes,androidroot,submission,3,246,198
3,TwoXChromosomes,AnimalsBeingDerps,submission,3,246,1039
4,TwoXChromosomes,Architects,submission,3,246,519
5,TwoXChromosomes,ShittyVeganFoodPorn,submission,3,246,789
6,TwoXChromosomes,curlyhair,submission,5,246,1256
7,TwoXChromosomes,harrypotter,submission,3,246,371
8,TwoXChromosomes,AnimalCrossing,submission,3,246,197
9,TwoXChromosomes,crochet,submission,3,246,894


In [11]:
node_list = list(nodes.values())
node_labels = list(nodes.keys())
edge_list = [(source, target, weight) for source, target, weight in zip(df.source_num, df.target_num, df.weight)]

In [19]:
nx_pv = Network("500px", "1000px", notebook=True)

nx_pv.add_nodes(node_list, label=node_labels)

nx_pv.add_edges(edge_list)
nx_pv.show_buttons(filter_=['physics'])
nx_pv.save_graph(f"reddit_map_{sub_of_interest}.html")

In [21]:
G = nx.DiGraph()
edge_list = [(source, target, {"weight": weight}) for source, target, weight in zip(df.source_num, df.target_num, df.weight)]

print(edge_list[:5])

G.add_nodes_from(node_list)
G.add_edges_from(edge_list)

[(246, 1006, {'weight': 17}), (246, 778, {'weight': 3}), (246, 198, {'weight': 3}), (246, 1039, {'weight': 3}), (246, 519, {'weight': 3})]
