# This notebook collects a sample of control subreddits based on co-posting. It reads the pre-quarantine Reddit-wide activity of all TRP users, and find the top 100 subreddits based on the percentage of TRP users posting in these subreddits.

In [1]:
# -*- coding: utf-8 -*-

In [2]:
import re
import os
import sys
import pandas as pd
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import json
from scipy import stats
import csv
import pprint
from collections import defaultdict
import time
import datetime
import math
import glob
# import matplotlib.pyplot as plt; plt.rcdefaults()
# from IPython.display import Markdown, display

In [3]:
TIME_CHUNK_IN_DAYS = 10
QUARANTINE_DATE = "28/09/2018"
quarDataUnix = time.mktime(datetime.datetime.strptime(QUARANTINE_DATE, "%d/%m/%Y").timetuple())

def getTimeChunkIndex(timeStamp):
    timeStamp = float(timeStamp)
    timeDiff = timeStamp - quarDataUnix
    timeDiffDays = float(timeDiff)/(24*60*60) 
    chunkIndex = math.floor(timeDiffDays/TIME_CHUNK_IN_DAYS)
    return int(chunkIndex)

In [4]:
pd.options.display.max_rows = 999

pd.set_option('display.max_colwidth', -1)

In [17]:
USER_HISTORY_COMMENTS = "/mnt/storage/quarantine/data/TRP_users_comments"
USER_HISTORY_SUBMISSIONS = "/mnt/storage/quarantine/data/TRP_users_submissions.csv"
CANDIDATE_CONTROL_SUBREDDITS = "/mnt/storage/quarantine/data/candidate_control_subs_TRP_users.csv"

In [6]:
POST_THRESHOLD = 5

In [7]:
all_files = glob.glob(USER_HISTORY_COMMENTS + "/*.csv")

li = []

# Reading in comments
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, usecols = ["created_utc", "subreddit", "author"])
    li.append(df)
    
# Reading in submissions
df = pd.read_csv(USER_HISTORY_SUBMISSIONS, index_col=None, header=0, usecols = ["created_utc", "subreddit", "author"])
li.append(df)    

frame = pd.concat(li, axis=0, ignore_index=True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [8]:
df = frame[frame['created_utc'] < quarDataUnix]
dfSubreddits = df.groupby('subreddit')

In [9]:
subredditTRPUserCount = {}

In [10]:
count = 0
for subreddit, subredditPosts in dfSubreddits:
    subredditUsers = subredditPosts.groupby('author')
    userCount = 0
    for user, group in subredditUsers:
        if (len(group) >= POST_THRESHOLD):
            userCount += 1
    subredditTRPUserCount[subreddit] = userCount
    count += 1
    if (count % 10000 == 0):
        print (count)

10000
20000
30000


In [11]:
controlSubreddits = sorted(subredditTRPUserCount, key=subredditTRPUserCount.get, reverse = True)

In [18]:
candidateSubs = []
quot = "\""
text = ""
sl = []
for subreddit in controlSubreddits[0:1000]:
    su = subredditTRPUserCount[subreddit]
    if (su > 5):
        text = text + quot + subreddit + quot + ","
        candidateSubs.append([subreddit, su])
lb = "("
rb = ")"
print (lb + text[:-1] + rb)

("TheRedPill","asktrp","AskReddit","MGTOW","worldnews","pics","funny","The_Donald","todayilearned","news","politics","Showerthoughts","NoFap","gaming","gifs","videos","AskMen","movies","CringeAnarchy","Braincels","Fitness","seduction","MensRights","Tinder","aww","unpopularopinion","relationship_advice","PurplePillDebate","sex","JordanPeterson","WTF","BlackPeopleTwitter","conspiracy","mildlyinteresting","nba","personalfinance","relationships","technology","soccer","WhereAreAllTheGoodMen","dating_advice","trashy","OldSchoolCool","PoliticalHumor","niceguys","CryptoCurrency","confession","Futurology","hiphopheads","Drugs","nottheonion","interestingasfuck","dankmemes","starterpacks","KotakuInAction","Whatcouldgowrong","television","LifeProTips","pussypassdenied","science","AdviceAnimals","Libertarian","wallstreetbets","changemyview","WhitePeopleTwitter","bodybuilding","NoStupidQuestions","FortNiteBR","IncelTears","MMA","IAmA","watchpeopledie","askMRP","nfl","RoastMe","Jokes","steroids","tre

In [20]:
candidateSubsDf = pd.DataFrame(candidateSubs, columns=["Subreddit", "TRP_users"])

In [21]:
candidateSubsDf.to_csv(CANDIDATE_CONTROL_SUBREDDITS, index=False)