# This notebook presents a time-series analysis of the activity of a quarantined subreddit's members (1) across Reddit as a whole and (2) within the quarantined subreddit itself

In [1]:
# -*- coding: utf-8 -*-

In [2]:
import pandas as pd
import json
from scipy import stats
import numpy as np
import csv
import pprint
from collections import defaultdict

import math
import matplotlib.pyplot as plt; plt.rcdefaults()
from IPython.display import Markdown, display

import re
import os
import sys

import time
import datetime

In [3]:
# Raise the csv module's maximum field size to sys.maxsize; the call returns the previous limit.
csv.field_size_limit(sys.maxsize)

131072

In [4]:
# Absolute input locations used by the cells below.
# Folder walked with os.walk; every file found under it is read as a CSV.
USER_HISTORY_FOLDER = "/mnt/storage/quarantine/data/redpill_users_history"
# CSV read with csv.DictReader; the 'author' and 'created_utc' columns are used.
RED_PILL_COMMENTS_FILE = "/mnt/storage/quarantine/data/theRedPillComments.csv"
# NOTE(review): not referenced anywhere in the visible notebook — presumably a saved
# list of pre-quarantine users; confirm before relying on it.
PRE_QUARANTINE_USERS_FILE = "/mnt/storage/quarantine/data/preQuarantineUsers.csv"

In [5]:
# Width, in days, of one time window used by getTimeChunkIndex.
TIME_CHUNK_IN_DAYS = 10
# Quarantine date in day/month/year order.
QUARANTINE_DATE = "28/09/2018"

# Unix timestamp (seconds) of 00:00 UTC on the quarantine date.
# The values compared against it come from 'created_utc' columns (presumed to be UTC
# epoch seconds), so the reference instant is pinned to UTC explicitly. time.mktime()
# would interpret the date in the machine's local timezone and shift every window
# boundary by the local UTC offset.
quarDataUnix = (
    datetime.datetime.strptime(QUARANTINE_DATE, "%d/%m/%Y")
    .replace(tzinfo=datetime.timezone.utc)
    .timestamp()
)

In [6]:
def getTimeChunkIndex(timeStamp):
    """Map a Unix timestamp to its time-window index relative to the quarantine.

    Args:
        timeStamp: Seconds since the epoch, as a number or a numeric string.

    Returns:
        int: How many whole windows of TIME_CHUNK_IN_DAYS days separate the
        timestamp from quarDataUnix, rounded toward negative infinity. Window 0
        starts at the quarantine instant; negative indices precede it.
    """
    secondsPerDay = 24*60*60
    secondsSinceQuarantine = float(timeStamp) - quarDataUnix
    daysSinceQuarantine = float(secondsSinceQuarantine)/secondsPerDay
    return int(math.floor(daysSinceQuarantine/TIME_CHUNK_IN_DAYS))

In [7]:
# Counters keyed by time-window index (the value returned by getTimeChunkIndex).
# Every row found in the user-history files:
userPostCount = defaultdict(int)
# Only the rows whose 'subreddit' column equals 'TheRedPill':
userPostOnQuarSubCount = defaultdict(int)

In [None]:
# Tally every row of every user-history CSV into time windows around the quarantine.
for subdir, dirs, files in os.walk(USER_HISTORY_FOLDER):
    # Progress counter; it restarts at 0 for each directory visited by os.walk.
    fileCount = 0
    for file in files:
        fileCount += 1
        if (fileCount % 10 == 0):
            print ("fileCount = ", fileCount)
        inputFile = os.path.join(subdir, file)
        # The csv module requires newline='' so that line breaks embedded in quoted
        # fields are handed to the parser untouched.
        with open(inputFile, newline='') as csvfile:
            readCSV = csv.DictReader(csvfile)
            for row in readCSV:
                postTime = row['created_utc']
                timeChunkIndex = getTimeChunkIndex(postTime)
                # Count the row towards overall activity ...
                userPostCount[timeChunkIndex] += 1
                subreddit = row['subreddit']
                # ... and separately when it was posted on the quarantined subreddit.
                if (subreddit == 'TheRedPill'):
                    userPostOnQuarSubCount[timeChunkIndex] += 1

fileCount =  10
fileCount =  20


## Activity of r/theredpill users across Reddit

In [None]:
# Total posts per time window; the vertical line at x=0 marks the quarantine.
# figure.figsize is read when a figure is created, so it is set before plt.plot()
# so that this cell's own figure is drawn at 20x8 rather than only later ones.
plt.rcParams["figure.figsize"] = (20,8)
lists = sorted(userPostCount.items())
x, y = zip(*lists)
plt.plot(x, y)
plt.axvline(x=0)
plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Total posts', fontsize=18)
plt.show()

### The above graph indicates that the overall activity of the quarantined subreddit's members across Reddit increased after the quarantine.

## Activity on r/theredpill of users who posted there before the quarantine

In [None]:
# Posts on the quarantined subreddit per time window; the vertical line at x=0 marks the quarantine.
# figure.figsize is read when a figure is created, so it is set before plt.plot()
# to make sure it applies to the figure drawn by this cell.
plt.rcParams["figure.figsize"] = (20,8)
lists = sorted(userPostOnQuarSubCount.items())
x, y = zip(*lists)
plt.plot(x, y)
plt.axvline(x=0)
plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Total posts', fontsize=18)
plt.show()

### The above graph indicates that the activity of the quarantined subreddit's members on the (quarantined) subreddit itself decreased rapidly after the quarantine.

## Activity of all users on r/theredpill

In [None]:
# Authors with at least one comment timestamped before quarDataUnix; filled by the following cell.
preQuarantineUsers = set()

In [None]:
# Collect every non-deleted author who has a comment timestamped before the quarantine.
# The csv module requires newline='' so that line breaks embedded in quoted fields
# (multi-line comment bodies) are handed to the parser untouched.
with open(RED_PILL_COMMENTS_FILE, newline='') as csvfile:
    readCSV = csv.DictReader(csvfile)
    for row in readCSV:
        author = row['author']
        # "[deleted]" is a placeholder rather than a real account, so it is skipped.
        if (author != "[deleted]"):
            created_utc = float(row['created_utc'])
            if (created_utc < quarDataUnix):
                preQuarantineUsers.add(author)
            

In [None]:
# Comment counts keyed by time-window index, split by author group.
# Authors present in preQuarantineUsers:
preQuarantineUserPostCount = defaultdict(int)
# All other (non-deleted) authors:
newUserPostCount = defaultdict(int)

In [None]:
# Second pass over the comments file: count comments per time window, split by
# whether the author appears in preQuarantineUsers.
# The csv module requires newline='' so that line breaks embedded in quoted fields
# (multi-line comment bodies) are handed to the parser untouched.
with open(RED_PILL_COMMENTS_FILE, newline='') as csvfile:
    readCSV = csv.DictReader(csvfile)
    for row in readCSV:
        author = row['author']
        if (author != "[deleted]"):
            postTime = row['created_utc']
            timeChunkIndex = getTimeChunkIndex(postTime)

            if (author in preQuarantineUsers):
                preQuarantineUserPostCount[timeChunkIndex] += 1
            else:
                newUserPostCount[timeChunkIndex] += 1

In [None]:
# Overlay the two per-window comment series on a single figure.
x1, y1 = zip(*sorted(preQuarantineUserPostCount.items()))
x2, y2 = zip(*sorted(newUserPostCount.items()))

plt.plot(x1, y1, label='Pre-quarantine users')
plt.plot(x2, y2, label='New users')

# x=0 is the window that starts at the quarantine.
plt.axvline(x=0)

plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Total posts', fontsize=18)
plt.legend()

plt.show()

### In the graph above, pre-quarantine users are those who posted on r/theredpill (TRP) at least once before it was quarantined. New users are those who had never posted on TRP before the quarantine. The graph shows that new users become the most prominent contributors and remain so for many months after the quarantine.

## New user influx

In [None]:
# Time-window index -> set of distinct authors who commented in that window.
authorsInEachChunk = defaultdict(set)

In [None]:
# Record which non-deleted authors commented in each time window.
# The csv module requires newline='' so that line breaks embedded in quoted fields
# (multi-line comment bodies) are handed to the parser untouched.
with open(RED_PILL_COMMENTS_FILE, newline='') as csvfile:
    readCSV = csv.DictReader(csvfile)
    for row in readCSV:
        author = row['author']
        if (author != "[deleted]"):
            postTime = row['created_utc']
            timeChunkIndex = getTimeChunkIndex(postTime)
            authorsInEachChunk[timeChunkIndex].add(author)

In [None]:
# (chunkIndex, authorSet) pairs in ascending chunk-index order.
lists = sorted(authorsInEachChunk.items())

In [None]:
# Baseline of "already seen" authors: everyone active in the three earliest time windows.
seenUsersSet = set()
for entry in lists[:3]:
    # entry is a (chunkIndex, authorSet) pair; only the author set is needed here.
    seenUsersSet |= entry[1]

In [None]:
# For each time window after the baseline, count the authors who have not appeared
# in any earlier window.
# seenUsersSet is seeded from the first three windows (indices 0-2 of `lists`), so
# iteration must resume at index 3. Starting at 4 skipped that window entirely: its
# authors were never counted there and were misreported as "new" when they next appeared.
newsUsersCount = defaultdict(int)
for chunkIndex, chunkUsers in lists[3:]:
    newUsersSet = chunkUsers.difference(seenUsersSet)
    newsUsersCount[chunkIndex] = len(newUsersSet)
    # Fold this window's authors into the seen set before moving to the next window.
    seenUsersSet |= chunkUsers

In [None]:
# First-time authors per time window; the vertical line at x=0 marks the quarantine.
# figure.figsize is read when a figure is created, so it is set before plt.plot()
# to make sure it applies to the figure drawn by this cell.
plt.rcParams["figure.figsize"] = (20,8)
lists = sorted(newsUsersCount.items())
x, y = zip(*lists)
plt.plot(x, y)
plt.axvline(x=0)
plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Number of new users who posted in each time window', fontsize=18)
plt.show()

### The above graph shows that the influx of new users dropped after the quarantine. Curiously, the drop happened immediately after the quarantine and was sustained throughout the observed period.

# Analysis of toxicity scores

In [None]:
# Load the toxicity-scored r/TRP comments and group the scores by time window.
trp_toxicity_df = pd.read_csv("/mnt/storage/quarantine/data/theRedPillComments-toxicity.csv")

TrpToxicityDistribution = {}
for i, row in trp_toxicity_df.iterrows():
    author = row['author']
    toxicity = row['toxicity']

    # Rows whose toxicity equals -1 are left out
    # (presumably a sentinel for "not scored" — TODO confirm).
    if (toxicity == -1):
        continue

    postTime = row['created_utc']
    timeChunkIndex = getTimeChunkIndex(postTime)
    # Create the window's score list on first use, then add this score to it.
    TrpToxicityDistribution.setdefault(timeChunkIndex, []).append(toxicity)

# Mean toxicity score of each time window.
TrpMeanToxicity = {}
for k, windowScores in TrpToxicityDistribution.items():
    TrpMeanToxicity[k] = np.mean(windowScores)

In [None]:
# Mean toxicity per time window; the vertical line at x=0 marks the quarantine.
# figure.figsize is read when a figure is created, so it is set before plt.plot()
# to make sure it applies to the figure drawn by this cell.
plt.rcParams["figure.figsize"] = (20,8)
lists = sorted(TrpMeanToxicity.items())
x, y = zip(*lists)
plt.plot(x, y)
plt.axvline(x=0)
plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Mean toxicity scores on r/TRP', fontsize=18)
plt.show()

## The mean toxicity levels within r/TRP appear to be unchanged (if not higher than before) following the quarantining, indicating that there were no changes in posting behavior. 

Next, we look at how this relates to the rate of removal (i.e., moderator actions) within the subreddit.

# Analysis of removed comments

In [None]:
# Per-window comment totals, removed-comment counts, and the resulting removal rate.
data = pd.read_csv("/mnt/storage/quarantine/data/theRedPillComments-toxicity.csv")

# Window index -> number of comments whose body is "[removed]" (only windows with >= 1 removal).
TrpRemovalCount = {}
# Window index -> number of comments of any kind.
TotalNumberComments = {}

for i, row in data.iterrows():
    body = row['body']
    postTime = row['created_utc']
    timeChunkIndex = getTimeChunkIndex(postTime)

    TotalNumberComments[timeChunkIndex] = TotalNumberComments.get(timeChunkIndex, 0) + 1

    # A comment counts as removed when its body is exactly "[removed]".
    if (body == "[removed]"):
        TrpRemovalCount[timeChunkIndex] = TrpRemovalCount.get(timeChunkIndex, 0) + 1

# Removal rate = removed comments / total comments, for every window that has comments.
# Iterating over TotalNumberComments (not TrpRemovalCount) gives windows without any
# removal an explicit rate of 0 instead of leaving them out of the series. Every key
# here has a count of at least 1, so the division is always defined.
TrpRemovalRate = {}
for k in TotalNumberComments:
    TrpRemovalRate[k] = float(TrpRemovalCount.get(k, 0))/(TotalNumberComments[k])

In [None]:
# Total r/TRP comments per time window; the vertical line at x=0 marks the quarantine.
# figure.figsize is read when a figure is created, so it is set before plt.plot()
# to make sure it applies to the figure drawn by this cell.
plt.rcParams["figure.figsize"] = (20,8)
lists = sorted(TotalNumberComments.items())
x, y = zip(*lists)
plt.plot(x, y)
plt.axvline(x=0)
plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Total number of comments in r/TRP', fontsize=18)
plt.show()

In [None]:
# Removed r/TRP comments per time window; the vertical line at x=0 marks the quarantine.
# figure.figsize is read when a figure is created, so it is set before plt.plot()
# to make sure it applies to the figure drawn by this cell.
plt.rcParams["figure.figsize"] = (20,8)
lists = sorted(TrpRemovalCount.items())
x, y = zip(*lists)
plt.plot(x, y)
plt.axvline(x=0)
plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Number of comments removed from r/TRP', fontsize=18)
plt.show()

In [None]:
# Comment-removal rate per time window; the vertical line at x=0 marks the quarantine.
# figure.figsize is read when a figure is created, so it is set before plt.plot()
# to make sure it applies to the figure drawn by this cell.
plt.rcParams["figure.figsize"] = (20,8)
lists = sorted(TrpRemovalRate.items())
x, y = zip(*lists)
plt.plot(x, y)
plt.axvline(x=0)
plt.xlabel('Time-windows of 10 days, before and after the quarantine', fontsize=18)
plt.ylabel('Rate of comment removal in r/TRP', fontsize=18)
plt.show()

## *Observation:* The rate of removal within r/TheRedPill seems to have dropped following the quarantine, even though the mean toxicity levels remained roughly the same!