# Step 1: instantiate

In [1]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import praw
from decouple import config
from pyvis.network import Network

# Subreddit whose recently active users seed the crawl.
sub_of_interest = 'FemaleDatingStrategy'
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
hard_limit = 6

# Credentials are looked up through decouple's config() instead of being hard-coded here.
r_client_id = config('REDDIT-CLIENT-ID')
secret = config('REDDIT-SECRET')
# NOTE(review): python-decouple consults os.environ before the .env/.ini file, and most
# Unix shells export USER as the OS login name — confirm this resolves to the Reddit username.
r_id = config("USER")
r_pw = config("PW")

# ===== Instantiate Reddit instance =====
# Reddit asks API clients for a unique, descriptive user agent in the form
# "<platform>:<app ID>:<version> (by u/<username>)"; a generic value such as '1'
# is liable to be throttled.
reddit = praw.Reddit(
    client_id=r_client_id,
    client_secret=secret,
    username=r_id,
    password=r_pw,
    user_agent=f'python:subreddit-activity-network:v0.1 (by u/{r_id})',
)

# Step 2: Define functions

## Step 2.1: define function to obtain recent redditors of a subreddit

## Step 2.2: define function to obtain the subreddits to which a redditor has recently posted

In [2]:
def get_users(sub=sub_of_interest, post_count=8, comment_count=12):
    """
    Find the most recently active users in a subreddit.

    It's easier to comment than to post on a subreddit, so by default more
    comments than posts are sampled.

    :param sub: name of the subreddit to crawl; defaults to the value that
        ``sub_of_interest`` held when this function was defined
    :param post_count: maximum number of newest posts to inspect
    :param comment_count: maximum number of newest comments to inspect
    :return: list of unique usernames in first-seen order (posters first,
        then commenters), excluding AutoModerator and items with no author
    """
    subreddit = reddit.subreddit(sub)

    def _author_names(items):
        """Collect author names from posts/comments, skipping bots and missing authors."""
        names = []
        for item in items:
            author = item.author
            # `author` is None when the account behind the item was deleted;
            # reading `.name` on it would raise AttributeError.
            if author is None:
                continue
            if author.name != "AutoModerator":
                names.append(author.name)
        return names

    # === Obtain list of posters and commenters ===
    posters = _author_names(subreddit.new(limit=post_count))
    commenters = _author_names(subreddit.comments(limit=comment_count))

    # === Combine and de-duplicate ===
    # dict.fromkeys keeps first-seen order, so repeated runs over the same data
    # yield the same list (a plain set() has arbitrary ordering).
    return list(dict.fromkeys(posters + commenters))

In [3]:
def get_redditor_activity(subreddit, post_count=8, comment_count=12, filter_soi=True):
    """
    Build an edge list linking ``subreddit`` to the subreddits in which its
    recently active users have posted or commented.

    Users are obtained from ``get_users(subreddit, post_count, comment_count)``;
    for each of them the newest submissions and comments are inspected.

    :param subreddit: name of the subreddit whose users are crawled; it becomes
        the ``source`` of every returned edge
    :param post_count: maximum number of newest submissions fetched per user
        (also forwarded to ``get_users``)
    :param comment_count: maximum number of newest comments fetched per user
        (also forwarded to ``get_users``)
    :param filter_soi: pass ``False`` to keep activity that happened in the
        global ``sub_of_interest``; any other value drops that activity
    :return: list of dicts with keys ``source``, ``target``, ``type`` and
        ``weight`` — every submission edge (weight 3) first, followed by every
        comment edge (weight 1)
    """
    submission_weight = 3
    comment_weight = 1

    # Only the literal False switches the filter off (the original contract).
    keep_soi = filter_soi is False

    def _edge(target, kind, weight):
        """Return one edge record pointing from the crawled subreddit to `target`."""
        return {
            "source": subreddit,
            "target": target,
            "type": kind,
            "weight": weight,
        }

    # Kept as two lists so submission edges precede comment edges in the result.
    posts_subreddits = []
    comments_subreddits = []

    for username in get_users(subreddit, post_count, comment_count):
        redditor = reddit.redditor(username)
        posts = redditor.submissions.new(limit=post_count)
        comments = redditor.comments.new(limit=comment_count)

        for submission in posts:
            target = submission.subreddit.display_name
            # Activity inside the sub of interest is recorded only when the filter is off.
            if keep_soi or target != sub_of_interest:
                posts_subreddits.append(_edge(target, "submission", submission_weight))

        for comment in comments:
            target = comment.subreddit.display_name
            if keep_soi or target != sub_of_interest:
                comments_subreddits.append(_edge(target, "comment", comment_weight))

    return posts_subreddits + comments_subreddits


def get_redditors_activity(posts_subreddits_list, comments_subreddits_list):
    
    posts_weight = {sub[1]: posts_subreddits_list.count(sub[1])+3 for sub in posts_subreddits_list}
    comments_weight = {com[1]: comments_subreddits_list.count(com[1]) for com in comments_subreddits_list}
    
    def merg_sum(a, b):
        for k in b:
            if k in a:
                b[k] = b[k] + a[k]
        c = {**a, **b}
        return c
    
    activity_weight = merg_sum(posts_weight, comments_weight)
    
    #activity_subreddits = list(set(posts_subreddits_list + comments_subreddits_list))
    
    return activity_weight

# Step 3: get redditors from the sub of interest and their recently active subreddits

In [4]:
# Crawl the sub of interest; activity inside that same sub is dropped (filter_soi=True).
primary_activity = get_redditor_activity(
    sub_of_interest,
    post_count=3,
    comment_count=6,
    filter_soi=True,
)

# Peek at the first five edge records.
print(primary_activity[:5])

[{'source': 'FemaleDatingStrategy', 'target': 'asiantwoX', 'type': 'submission', 'weight': 3}, {'source': 'FemaleDatingStrategy', 'target': 'aspergers', 'type': 'submission', 'weight': 3}, {'source': 'FemaleDatingStrategy', 'target': 'aspergers', 'type': 'submission', 'weight': 3}, {'source': 'FemaleDatingStrategy', 'target': 'AskFDS', 'type': 'submission', 'weight': 3}, {'source': 'FemaleDatingStrategy', 'target': 'childfree', 'type': 'submission', 'weight': 3}]


# Step 4: Organize the edge list into a dataframe

In [5]:
# One row per edge record; the dict keys become the columns.
df_primary = pd.DataFrame(primary_activity)

display(df_primary.head())
print(df_primary.shape)

Unnamed: 0,source,target,type,weight
0,FemaleDatingStrategy,asiantwoX,submission,3
1,FemaleDatingStrategy,aspergers,submission,3
2,FemaleDatingStrategy,aspergers,submission,3
3,FemaleDatingStrategy,AskFDS,submission,3
4,FemaleDatingStrategy,childfree,submission,3


(21, 4)


# Step 5: iterate over the primary networks to get secondary activity edge list

In [6]:
# Every distinct subreddit reached from the sub of interest is crawled in turn.
primary_target = list(df_primary.target.unique())

# Collect all edge records first and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it on every pass.
secondary_records = []
for subreddit in primary_target:
    # filter_soi=False keeps edges that lead back to the sub of interest.
    secondary_activity = get_redditor_activity(subreddit, post_count=2, comment_count=4, filter_soi=False)
    secondary_records.extend(secondary_activity)

df_secondary = pd.DataFrame(secondary_records)

display(df_secondary)

Unnamed: 0,source,target,type,weight
0,asiantwoX,perth,submission,3
1,asiantwoX,asiantwoX,submission,3
2,asiantwoX,FemaleDatingStrategy,submission,3
3,asiantwoX,asiantwoX,submission,3
4,asiantwoX,perth,comment,1
...,...,...,...,...
313,amiugly,amiugly,comment,1
314,amiugly,amiugly,comment,1
315,amiugly,GilmoreGirls,comment,1
316,amiugly,GilmoreGirls,comment,1


# Step 6: concatenate the primary and secondary edge list into a single large dataframe

In [7]:
# Stack primary and secondary edges into one frame with a fresh 0..n-1 index.
df = pd.concat((df_primary, df_secondary), ignore_index=True)

display(df.head(), df.tail())
print(df.shape)

Unnamed: 0,source,target,type,weight
0,FemaleDatingStrategy,asiantwoX,submission,3
1,FemaleDatingStrategy,aspergers,submission,3
2,FemaleDatingStrategy,aspergers,submission,3
3,FemaleDatingStrategy,AskFDS,submission,3
4,FemaleDatingStrategy,childfree,submission,3


Unnamed: 0,source,target,type,weight
334,amiugly,amiugly,comment,1
335,amiugly,amiugly,comment,1
336,amiugly,GilmoreGirls,comment,1
337,amiugly,GilmoreGirls,comment,1
338,amiugly,amiugly,comment,1


(339, 4)


In [11]:
# Append so that repeated crawls accumulate in one file, but emit the header row
# only when the file is new or empty — otherwise every re-run of this cell would
# write a second header line into the middle of the data.
csv_path = Path(f"reddit_activity_{sub_of_interest}.csv")
write_header = not csv_path.exists() or csv_path.stat().st_size == 0
df.to_csv(csv_path, mode='a', header=write_header, index=False)

# Step 7: define nodes and edges

In [None]:
# Every subreddit that appears at either end of an edge becomes a node.
# Sorted so the node order is the same on every run.
nodes = sorted(set(df["source"]) | set(df["target"]))
node_list = nodes  # name consumed by the pyvis cell

# Collapse parallel edges by summing their weights per (source, target) pair.
# NOTE(review): summing is one reasonable aggregation — confirm it matches the intended edge width.
edge_weights = df.groupby(["source", "target"], as_index=False)["weight"].sum()

# pyvis takes (source, target, weight) tuples; cast to plain int so the weights
# serialise cleanly when the HTML is generated.
edge_tuples = [
    (source, target, int(weight))
    for source, target, weight in edge_weights.itertuples(index=False, name=None)
]

In [5]:
# Build the pyvis network. Expects `node_list` (node ids) and `edge_tuples`
# (edge tuples accepted by Network.add_edges) to already exist in the kernel.
# NOTE(review): `nx` is the conventional alias for networkx; here it names a pyvis Network.
nx = Network(notebook=True)

# Each node id is also passed as that node's label.
nx.add_nodes(node_list, label=node_list)

nx.add_edges(edge_tuples)

# Render the network to "nodes.html".
nx.show("nodes.html")