# This file reads the comments and posts posted in the quarantined subreddit six months before and six months after the quarantining, and writes out a file of users who authored these comments and posts. It also tests hypothesis 1.

In [1]:
# -*- coding: utf-8 -*-

In [2]:
import pandas as pd
import json
from scipy import stats
import numpy as np
import csv
import pprint
from collections import defaultdict

import math
import matplotlib.pyplot as plt; plt.rcdefaults()
from IPython.display import Markdown, display

import re

import time
import datetime

In [3]:
ALL_COMMENTS_FILE = "/mnt/storage/quarantine/data/theRedPillComments.csv"
ALL_SUBMISSIONS_FILE = "/mnt/storage/quarantine/data/theRedPillSubmissions.csv"
PRE_QUARANTINE_USERS_FILE = "/mnt/storage/quarantine/data/preQuarantineUsers.csv"

In [4]:
QUARANTINE_DATE = "28/09/2018"
MAX_POSTS_THRESHOLD = 10000

In [5]:
quarDateUnix = time.mktime(datetime.datetime.strptime(QUARANTINE_DATE, "%d/%m/%Y").timetuple())

In [6]:
firstDateUnix = quarDateUnix - 180*24*3600
lastDateUnix = quarDateUnix + 180*24*3600

print ("First Date:", firstDateUnix)
print ("Last Date:", lastDateUnix)

First Date: 1522555200.0
Last Date: 1553659200.0


In [7]:
df_comments = pd.read_csv(ALL_COMMENTS_FILE,
                          usecols = ["created_utc", "subreddit", "author", "score"])

In [8]:
df_submissions = pd.read_csv(ALL_SUBMISSIONS_FILE,
                      usecols = ["created_utc", "subreddit", "author", "score"])

In [9]:
df = pd.concat([df_comments, df_submissions], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [10]:
df = df[df.author != "[deleted]"]

In [11]:
# We don't need these filters anymore because I updated the datasets themselves such that these values are excluded.

# df = df[df.created_utc > firstDateUnix]
# df = df[df.created_utc < lastDateUnix]

In [12]:
len(df)

194442

In [13]:
dfPast = df[df.created_utc <= quarDateUnix]
dfFuture = df[df.created_utc > quarDateUnix]

In [14]:
dfPastGroups = dfPast.groupby('author')
dfFutureGroups = dfFuture.groupby('author')

## Collecting users who posted at least twice and at most 10000 times before the quarantine

In [15]:
count = 0
TRPAuthors = []
for author, posts in dfPastGroups:
    numPosts = len(posts)
    if (numPosts > 1 and numPosts < MAX_POSTS_THRESHOLD):
        TRPAuthors.append(author) 

In [16]:
TRPAuthorsdf = pd.DataFrame(TRPAuthors, columns=["Author"])

In [17]:
TRPAuthorsdf.to_csv(PRE_QUARANTINE_USERS_FILE, index=False)

In [18]:
len(TRPAuthorsdf)

10617

## Detecting users who did not depart after the quarantine

In [19]:
dfPastAuthors = set(dfPastGroups.groups.keys())
dfFutureAuthors = set(dfFutureGroups.groups.keys())

In [20]:
departedAuthors = dfPastAuthors - dfFutureAuthors

In [21]:
nonDepartedAuthors = dfPastAuthors - departedAuthors

In [22]:
authorActivityLevels = {}
authorTotalPosts = {}
for author, posts in dfPastGroups:
    numPosts = len(posts)
    if (numPosts > 1 and numPosts < MAX_POSTS_THRESHOLD):
        postTimes = posts['created_utc'].tolist()
        firstPostTime = min(postTimes)
        lastPostTime = max(postTimes)
        firstLastPostDiff = float(lastPostTime - firstPostTime)/(24*60*60) 
        authorActivityLevels[author] = numPosts/firstLastPostDiff
        authorTotalPosts[author] = numPosts

## Comparing activity levels of departed users and non-departed users in the quarantined sub before the quarantining.

In [23]:
departedAuthorsActivity = []
nonDepartedAuthorsActivity = []
for author in authorActivityLevels.keys():
    activity = authorActivityLevels[author]
    if (author in nonDepartedAuthors):
        # Non-departed Author
        nonDepartedAuthorsActivity.append(activity)
    else:
        # Departed Author
        departedAuthorsActivity.append(activity)

In [24]:
s = pd.Series(departedAuthorsActivity)
s.describe().apply(lambda x: format(x, '.8f'))

count     7648.00000000
mean        77.81608989
std        603.94723169
min          0.01159789
25%          0.07814412
50%          0.24777084
75%          2.52677519
max      34560.00000000
dtype: object

In [25]:
s = pd.Series(nonDepartedAuthorsActivity)
s.describe().apply(lambda x: format(x, '.8f'))

count    2969.00000000
mean       18.99260871
std       218.16368767
min         0.01116558
25%         0.07107235
50%         0.15290484
75%         0.40607395
max      6171.42857143
dtype: object

In [26]:
# T - test
ttest=stats.ttest_ind(departedAuthorsActivity, nonDepartedAuthorsActivity)
ttest

Ttest_indResult(statistic=5.1774538447187, pvalue=2.2905161843846927e-07)

## The above results indicate that the pre-quarantine activity of departed users is significantly higher than the pre-quarantine activity of non-departed users. Therefore, hypthesis 1 is rejected. This result also counteracts the findings of prior research.

## Comparing total posting activity of departed users and non-departed users in the quarantined sub before the quarantining.

In [27]:
departedAuthorsPosts = []
nonDepartedAuthorsPosts = []
for author in authorTotalPosts.keys():
    posts = authorTotalPosts[author]
    if (author in nonDepartedAuthors):
        # Non-departed Author
        nonDepartedAuthorsPosts.append(posts)
    else:
        # Departed Author
        departedAuthorsPosts.append(posts)

In [28]:
s = pd.Series(departedAuthorsPosts)
s.describe().apply(lambda x: format(x, '.8f'))

count    7648.00000000
mean        7.68423117
std        16.48912532
min         2.00000000
25%         2.00000000
50%         3.00000000
75%         7.00000000
max       573.00000000
dtype: object

In [29]:
s = pd.Series(nonDepartedAuthorsPosts)
s.describe().apply(lambda x: format(x, '.8f'))

count    2969.00000000
mean       21.86998990
std        51.76781767
min         2.00000000
25%         3.00000000
50%         7.00000000
75%        20.00000000
max       952.00000000
dtype: object

In [30]:
# T - test
ttest=stats.ttest_ind(departedAuthorsPosts, nonDepartedAuthorsPosts)
ttest

Ttest_indResult(statistic=-21.338936750618757, pvalue=5.819703512568361e-99)

## The above results indicate that the total pre-quarantine posts of departed users are significantly lower than the total pre-quarantine posts of non-departed users. Therefore, hypthesis 1 is not rejected using this measure of 'activity'. 

## Printing Users list for futher Google Big Data queries

In [32]:
tf = pd.read_csv(PRE_QUARANTINE_USERS_FILE)

quot = "\""
text = ""
for sub in tf.Author:
    text = text + quot + sub + quot + ","

lb = "("
rb = ")"
# print(lb + text[:-1] + rb)