In [None]:
#DEBUG CONVENIENCE
def showme(xs):
    """Print the non-empty entries of each mapping in ``xs`` for debugging.

    For every mapping, each pair whose key and value are both truthy is
    printed as the key on one line followed by the value's ``repr`` indented
    by two spaces. A blank line is printed after every mapping.
    """
    for record in xs:
        populated = ((key, value) for key, value in record.items() if key and value)
        for key, value in populated:
            print(key + '')
            print('  ' + repr(value))
        print()

# Survey Analysis

We will drill through the various questions to try to gain some insight into the differences between the tumblr and reddit samples. First, we need to load the results from a file and clean up our data a bit.

In [None]:
import csv
import re

# Read the export and discard its first line; what remains starts with the
# row of question texts that DictReader will use as column names.
with open('results.csv', newline='') as f:
    lines = f.readlines()[1:]

# Strip every "${...}" placeholder (plus one trailing space, if present)
# from the question-text row so the column names are plain text.
placeholder = re.compile(r'\${[^}]*} ?')
lines[0] = placeholder.sub('', lines[0])

# Parse the remaining lines into one dict per response, keyed by question.
responses = list(csv.DictReader(lines, delimiter=','))

# Keep only responses that answered 'Yes' to the age question and that did
# not pick the "no time on either site" answer.
q_major = 'Are you 18 years of age or older?'
q_site = 'On which site do you spend more time?'
r_nonuser = 'I do not spend any time on either site'
responses = [
    r for r in responses
    if r[q_major] == 'Yes' and r[q_site] != r_nonuser
]

# Fold the two "linked to your real name" questions into a single column:
# a yes/no answer to the singular question becomes '1'/'0'; when that answer
# is empty, the answer to the plural question is used as-is.
rnlinkage_singular = "Is your username linked in some way to your real name?"
rnlinkage_plural = "How many of your u...-usernames"
q_rnlinkage = "How many of your usernames are linked in some way to your real name?"
for r in responses:
    singular, plural = r[rnlinkage_singular], r[rnlinkage_plural]
    if not singular:
        r[q_rnlinkage] = plural
    elif singular == 'Yes':
        r[q_rnlinkage] = '1'
    else:
        r[q_rnlinkage] = '0'

# Who responded?

Let's see how respondents break down by site.

In [None]:
# pretty plots
from plotly.graph_objs import Pie, Figure, Layout, Scatter
from plotly.offline import init_notebook_mode; init_notebook_mode()
from plotly.offline import iplot as iplot_raw
def iplot(title, plots, **kwargs):
    """Render ``plots`` inline as one figure with the given title.

    Any extra keyword arguments are forwarded to ``Layout`` alongside the
    title. The figure is shown through ``iplot_raw`` with ``show_link``
    turned off.
    """
    layout = Layout(title=title, **kwargs)
    figure = Figure(data=plots, layout=layout)
    iplot_raw(figure, show_link=False)

In [None]:
from collections import Counter

q_site = 'On which site do you spend more time?'

# Tally the answers to the site question, then split the tally into parallel
# label/count sequences for a single pie chart.
site_tally = Counter(response[q_site] for response in responses)
sites, counts = zip(*site_tally.items())
iplot("User sources", [Pie(labels=sites, values=counts)])

# Openness by Site

Let's take a first look at how many accounts are connected to real names.

In [None]:
def try_apply(apply, x):
    """Call ``apply(x)`` and return its result, or ``False`` if it fails.

    Args:
        apply: callable taking one argument.
        x: the value passed to ``apply``.

    Returns:
        Whatever ``apply(x)`` returns, or ``False`` when the call raises an
        ``Exception``.
    """
    try:
        return apply(x)
    except Exception:
        # Only ordinary errors become False. KeyboardInterrupt and SystemExit
        # are not Exception subclasses, so they still propagate and the
        # kernel can be interrupted while this runs.
        return False

def tumblr(items):
    """Return a list of the entries in ``items`` whose ``q_site`` answer is 'tumblr'."""
    return list(filter(lambda entry: entry[q_site] == 'tumblr', items))

def reddit(items):
    """Return a list of the entries in ``items`` whose ``q_site`` answer is 'reddit'."""
    return list(filter(lambda entry: entry[q_site] == 'reddit', items))

def analyze_three_ways(rs, title, question, validator):
    """Plot the answers to one question as three side-by-side pie charts.

    One pie is drawn for all of ``rs``, one for ``tumblr(rs)`` and one for
    ``reddit(rs)``. Each slice is a distinct answer to ``question``, sized by
    the number of responses that gave it. A group with no valid answers gets
    an empty pie rather than aborting the whole figure.

    Args:
        rs: response mappings; iterated once per group, so it must be
            re-iterable (e.g. a list).
        title: figure title passed to ``iplot``.
        question: key looked up in every response.
        validator: callable applied to each answer; answers for which it
            returns a falsy value are left out of the tally.

    Raises:
        KeyError: if a response has no ``question`` key.
    """
    # (sample, label, horizontal extent of the pie, x position of its caption)
    panels = (
        (rs,         "Overall", [0, 0.32],    0.09),
        (tumblr(rs), "tumblr",  [0.34, 0.65], 0.49),
        (reddit(rs), "reddit",  [0.67, 1],    0.89),
    )

    plots = []
    annotations = []
    for sample, group, domain, offset in panels:
        tally = Counter(r[question] for r in sample if validator(r[question]))

        # zip(*...) over an empty tally yields nothing to unpack, so the
        # empty case is spelled out explicitly.
        if tally:
            answers, counts = zip(*tally.items())
        else:
            answers, counts = (), ()

        plots.append(Pie(labels=answers,
                         values=counts,
                         domain={"x": domain},
                         name=group,
                         hoverinfo="value+percent",
                         text=group,
                         textinfo="label"))

        # Caption naming the group, placed near the top of the figure.
        annotations.append({"font": {"size": 20},
                            "showarrow": False,
                            "text": group,
                            "x": offset,
                            "y": 0.95})

    iplot(title, plots, annotations=annotations, showlegend=False)

In [None]:
q_usernames = "How many usernames do you use? Select 0 if you don't have...-usernames"
q_time = 'About how many hours per day do you use ?-Hours'

# One three-way chart per question, in this order. Every chart drops empty
# answers and the '-1' answer before tallying.
for chart_title, chart_question in (
        ("Usernames per account", q_usernames),
        ("Usernames linked to real name", q_rnlinkage),
        ("Hours used per day", q_time)):
    analyze_three_ways(responses, chart_title, chart_question, lambda x: x and x != '-1')