In [11]:
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [12]:
from __future__ import division
import warnings
warnings.filterwarnings('ignore')
import itertools
from collections import defaultdict
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from numpy import ma
import pandas as pd
import pickle
from scipy import stats
from sklearn.decomposition import PCA
import statsmodels.api as sm
import statsmodels.stats.api as sms
import tabletext
from pyspan.config import *
INPUT_DIR = paths["input_dir"]
METRICS_DIR = paths["metrics_dir"]
from pyspan import valence

In [13]:
# Load the survey terms (one per line). The last 20 entries are assigned to
# `antonyms`; everything before them is assigned to `partisan`.
words_fn = "survey_terms.txt"
with open(words_fn, "r") as words_file:
    # `with` closes the handle deterministically instead of leaking it
    words = words_file.read().split("\n")
# Drop blank / whitespace-only lines (the kept entries are not stripped)
words = [ line for line in words if line.strip() ]
partisan = words[:-20]
antonyms = words[-20:]

# Pre-processing

In [14]:
# Load the raw survey export
df_ = pd.read_csv("LoP_Ratings_2.csv")
# Skip the first two rows of the parsed file.
# NOTE(review): presumably these are extra header/metadata rows from the
# survey export -- confirm against the CSV.
df = df_.iloc[2:]

In [16]:
# Check if anyone replied "no" to any of the questions on the consent form.
# Each consent item contributes the rows whose answer is not "Yes"; an empty
# result means everybody consented to everything.
consent_items = [ "Q5_1", "Q5_2", "Q5_3", "Q5_4" ]
pd.concat([ df[df[item] != "Yes"] for item in consent_items ])

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,DistributionChannel,UserLanguage,Q5_1,...,194.1,195.1,196.1,Q621,Condition,FL_30_DO,FL_40_DO,Block2_DO,Words_Dem_First_DO,Words_Repub_First_DO


In [17]:
# See if anyone didn't finish the survey
finished_flag = df["Finished"]
unfinished = df.loc[finished_flag == "FALSE"]
# Multiplying the message by a bool prints it only when somebody is being
# excluded (an empty line is printed otherwise)
print("Excluding some participants" * (not unfinished.empty))
# Keep only the participants who completed the survey
df = df.loc[finished_flag == "TRUE"]
print(len(df))

Excluding some participants
201


In [18]:
# Combine "Democrat first" and "Republican first" conditions.
# The same 196 items appear twice in the export: once as columns "1".."196"
# and once as columns "1.1".."196.1".
item_cols = [ str(n) for n in range(1, 197) ]
alt_cols = [ str(n + .1) for n in range(1, 197) ]
df1 = df[item_cols]
# Relabel the "<n>.1" columns as "<n>" so both halves line up for fillna
df2 = df[alt_cols].rename(columns = dict(zip(alt_cols, item_cols)))
# Take df1's answer where present, otherwise fall back to df2's
df__ = df1.fillna(df2)
# Check if anyone left every question blank
bool(df__.isnull().all(axis = 1).any())

False

In [19]:
# Sanity check: number of rows (participants) in the merged ratings frame
len(df__)

201

In [20]:
# Numeric coding of the six response statements, from 0 (almost certain
# Democrat) up to 5 (almost certain Republican). The "-99" marker is coded
# as missing.
code = dict(
    (statement, score) for score, statement in enumerate([
        "I am almost certain the speaker is a Democrat.",
        "I am reasonably sure the speaker is a Democrat.",
        "I am unsure but think that the speaker is a Democrat.",
        "I am unsure but think that the speaker is a Republican.",
        "I am reasonably sure the speaker is a Republican.",
        "I am almost certain the speaker is a Republican.",
    ])
)
code["-99"] = np.nan

In [21]:
# Swap every response statement (and the "-99" marker) for its numeric code
df__ = df__.replace(code)

In [22]:
# Keep only the rating columns for items "1".."98"; columns "99".."196" are
# read separately by the attention-check cell.
minidf = df__[map(str, range(1, 99))]

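Check whether ratings differ between the two orderings of the options (the `FL_40_DO` condition). N.B.: MANOVA assumes the DVs are continuous and multivariate normal, whereas the ratings here are ordinal codes from 0 to 5, so that assumption is only approximately met.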

In [23]:
# `dat` selects item columns "1".."98" from minidf (the same columns minidf
# was built from)
dat = minidf[map(str, range(1, 99))]
# Grouping variable for the MANOVA
condition = df["FL_40_DO"]
# Copy both objects into the R session (rpy2 magic)
%Rpush dat condition

In [24]:
# In R: flatten `dat` and rebuild it as a 98-column numeric matrix
%R dat <- matrix(unlist(dat), ncol = 98)

array([[5., 0., 5., ..., 1., 1., 1.],
       [4., 4., 1., ..., 4., 3., 2.],
       [5., 1., 4., ..., 1., 4., 4.],
       ...,
       [1., 0., 0., ..., 4., 4., 0.],
       [4., 1., 0., ..., 4., 3., 4.],
       [4., 0., 0., ..., 1., 0., 1.]])

In [25]:
# In R: fit a MANOVA of the 98 item ratings on `condition` and keep its summary
%R summary <- summary(manova(dat ~ condition))
# Bring the R summary object back into Python and print it
%Rpull summary
print summary

           Df  Pillai approx F num Df den Df Pr(>F)
condition   1 0.52556   0.9834     98     87 0.5336
Residuals 184                                      



In [26]:
# Helper functions for cleaning demographic responses
def int_(s):
    """Convert ``s`` to an int, mapping the "-99" marker to NaN."""
    return np.nan if s == "-99" else int(s)

# TODO: Make sure get_gender works on this data!
def get_gender(row):
    """Code the free-text gender response in ``row["Q611"]`` as "F" or "M".

    Matching is a case-insensitive substring test. Female markers are tested
    first because "female" also contains an "m". The "wom" marker is needed
    because "woman"/"women" contain an "m" but no "f", and would otherwise be
    coded as "M".

    Returns "F", "M", or NaN when the response is not text or matches neither
    pattern (e.g. "-99").
    """
    gender = row["Q611"]
    try:
        response = gender.lower()
    except AttributeError:
        # Non-text value (e.g. a NaN from an empty cell): treat as missing
        return np.nan
    if "f" in response or "wom" in response:
        return "F"
    if "m" in response:
        return "M"
    return np.nan

def get_party_identity(row):
    """Return the party-identity score stored in ``row["Q613"]``.

    The first character of the response is parsed as a digit and shifted by
    -4, so a leading "4" maps to 0. The "-99" marker yields NaN.
    """
    response = row["Q613"]
    return np.nan if response == "-99" else int(response[0]) - 4

def get_political_leanings(row):
    """Map ``row["Q614"]`` onto a -2 (Very Liberal) .. 2 (Very Conservative) scale.

    "-99" and "Other (please specify)" give NaN; any other response that is
    not one of the five listed labels raises KeyError.
    """
    scale = {
        "Very Liberal": -2,
        "Moderately Liberal": -1,
        "Moderate": 0,
        "Moderately Conservative": 1,
        "Very Conservative": 2
    }
    answer = row["Q614"]
    if answer == "-99" or answer == "Other (please specify)":
        return np.nan
    return scale[answer]
    
def get_political_engagement(row):
    """Return the political-engagement score stored in ``row["Q615"]``.

    The response's first character is read as a digit and 4 is subtracted
    from it; the "-99" marker is returned as NaN.
    """
    response = row["Q615"]
    is_missing = response == "-99"
    if not is_missing:
        return int(response[0]) - 4
    return np.nan
    
def get_education(row):
    """Code the education level in ``row["Q617"]`` as an integer rank 0..5.

    The rank follows the order of the labels below. "-99" gives NaN; a
    response that is not one of the listed labels raises KeyError.
    """
    levels = [
        "High school",
        "Some college",
        "Associate's/professional/vocational degree",
        "Bachelor's degree",
        "Master's degree",
        "Higher-level graduate degree",
    ]
    rank_of = dict((label, rank) for rank, label in enumerate(levels))
    response = row["Q617"]
    if response == "-99":
        return np.nan
    return rank_of[response]

def get_voted(row):
    """Code ``row["Q619"]``: "Yes" -> 1, "No" -> 0, anything else -> NaN."""
    answer = row["Q619"]
    for label, coded in (("Yes", 1), ("No", 0)):
        if answer == label:
            return coded
    return np.nan

def get_political_bubble(row):
    """Extract a number from the free-text response in ``row["Q620"]``.

    Every digit character in the response is kept, and the digits are
    concatenated and parsed as one integer (so "about 40%" gives 40, and
    "10-20" gives 1020).

    Returns NaN for the "-99" marker and for responses with no digits.
    """
    bubble = row["Q620"]
    if bubble == "-99":
        return np.nan
    # Join the digits into a real string once. The built-in filter() only
    # returns a string under Python 2, so len() on its result is not portable.
    digits = "".join(ch for ch in bubble if ch.isdigit())
    # If the participant entered text as a response, just ignore it
    # because I can't think of a systematic way to convert text responses
    # to numbers
    if not digits:
        return np.nan
    # A digits-only string can never equal "-99", so plain int() suffices
    return int(digits)

In [27]:
# Add demographics
# "age" is built as a plain list, so it is assigned to minidf by position
minidf["age"] = [ int_(raw_age) for raw_age in df["Q610"] ]
minidf["gender"] = df.apply(get_gender, axis = 1)
minidf["party"] = df["Q612"].replace("-99", np.nan)
# Row-wise recodes that share one call shape; they are applied in this order
# so the resulting column order stays the same
for column, recode in [ ("party_identity", get_party_identity),
                        ("political_leanings", get_political_leanings),
                        ("political_engagement", get_political_engagement) ]:
    minidf[column] = df.apply(recode, axis = 1)
# True when the text of Q616 contains "C-Span"
minidf["C-Span"] = df.apply(lambda row: "C-Span" in row["Q616"],
                            axis = 1)
for column, recode in [ ("education", get_education),
                        ("voted", get_voted),
                        ("political_bubble", get_political_bubble) ]:
    minidf[column] = df.apply(recode, axis = 1)

In [28]:
# Attention check: columns "99".."196" are compared position-by-position
# with columns "1".."98", and every participant must have answered exactly
# two of the "99".."196" columns.
repeat_cols = [ str(n) for n in range(99, 197) ]
original_cols = [ str(n) for n in range(1, 99) ]
ATC_FAILED = []
for participant in df__.index:
    responses = df__.loc[participant]
    repeats = responses[repeat_cols].values
    originals = responses[original_cols].values
    # (position, rating) for each repeated item that was actually answered
    answered = [ (pos, rating) for pos, rating in enumerate(repeats)
                 if not np.isnan(rating) ]
    assert len(answered) == 2
    # Passing requires every repeated rating to be within one scale point of
    # the rating given at the same position among the original items
    consistent = all([ abs(originals[pos] - rating) <= 1
                       for pos, rating in answered ])
    ATC_FAILED.append(not consistent)
minidf["ATC_FAILED"] = ATC_FAILED
n_failed = ATC_FAILED.count(True)
print("%f of participants (%d) failed the attention check"%(n_failed/float(len(ATC_FAILED)), n_failed))
minidf = minidf[minidf.ATC_FAILED == False]
print("Excluding participants who failed the attention check\n")
print("n = {}".format(len(minidf)))
# The flag column was only needed for filtering
minidf = minidf.drop("ATC_FAILED", axis = 1)

0.268657 of participants (54) failed the attention check
Excluding participants who failed the attention check

n = 147


In [29]:
# How much missing data is there?
# Each number in the displayed list is the count of missing values in one
# participant's row, with participants who have none left out. So if 3
# participants each skip 1 question, the list will be [ 1, 1, 1 ].
nskipped = minidf.isnull().sum(axis = 1)
[ count for count in nskipped.tolist() if count != 0 ]

[1, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [30]:
# Renumber the remaining rows 0..n-1, discarding the old row labels
minidf = minidf.reset_index(drop = True)

In [31]:
# Re-create the row index from a column named "index", so the index carries
# that name (it becomes the header of the first column in the saved CSV)
minidf = minidf.assign(index = minidf.index).set_index("index")

In [32]:
# Write the cleaned ratings plus demographic columns to disk
minidf.to_csv("responses.csv")

# Reformat and save needed data

In [33]:
# Load the pickled metrics table (looked up by word via `.loc`, with
# "dmetric" and "rmetric" entries, in the cells that follow).
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a metrics file from a trusted source.
with open(METRICS_DIR + "partial_kls-unigrams", "rb") as pkls_file:
    # `with` guarantees the handle is closed once the object is loaded
    pkls = pickle.load(pkls_file)

In [34]:
# Save PKL data to file
# Float scratch table pre-filled with NaN; its columns become index, word
# (filled in afterwards), PKL_D and PKL_R
n_partisan = len(partisan)
to_save = np.full((n_partisan, 4), np.nan)

for row, term in enumerate(partisan):
    to_save[row, 0] = row + 1          # 1-based item number
    try:
        d_metric = pkls.loc[term]["dmetric"]
        r_metric = pkls.loc[term]["rmetric"]
    except KeyError:
        # Term has no entry in the metrics table: leave both values missing
        d_metric = r_metric = np.nan
    to_save[row, 2] = d_metric
    to_save[row, 3] = r_metric

some_words = pd.DataFrame(to_save, columns = [ "index", "word",
                          "PKL_D", "PKL_R" ])
# The scratch table is float, so turn the item numbers back into ints
some_words["index"] = [ int(number) for number in some_words["index"] ]
some_words = some_words.set_index("index")
some_words["word"] = partisan
# One sentiment column per valence source; only element [0] of each
# get_valence() result is stored
for column, source in [ ("google_sentiment", "google"),
                        ("pattern_sentiment", "pattern"),
                        ("crr", "crr") ]:
    some_words[column] = [ valence.get_valence(term, use = source)[0]
                           for term in partisan ]
some_words.to_csv("partisan.csv")

In [35]:
# Save antonyms data to file
# Float scratch table pre-filled with NaN; its columns become index, word,
# PKL_D, PKL_R and valence (word and valence are filled in afterwards)
n_antonyms = len(antonyms)
to_save = np.full((n_antonyms, 5), np.nan)

positive = [ "joy", "plentiful", "qualified", "famous", "clever",
             "accurate", "superior", "laugh", "praise", "sweet" ]
negative = [ "inferior", "cry", "blame", "bitter", "sorrow",
             "scarce", "unqualified", "unknown", "stupid",
             "inaccurate" ]
for offset, term in enumerate(antonyms):
    # Antonym items are numbered from 79 upwards.
    # NOTE(review): presumably 79 == len(partisan) + 1 -- confirm.
    to_save[offset, 0] = offset + 79
    # No fallback here: a term missing from `pkls` raises KeyError
    term_metrics = pkls.loc[term]
    to_save[offset, 2] = term_metrics["dmetric"]
    to_save[offset, 3] = term_metrics["rmetric"]

some_words = pd.DataFrame(to_save, columns = [ "index", "word",
                          "PKL_D", "PKL_R", "valence" ])
# The scratch table is float, so turn the item numbers back into ints
some_words["index"] = [ int(number) for number in some_words["index"] ]
some_words = some_words.set_index("index")
some_words["word"] = antonyms

def get_valence(w):
    # Anything not listed in `positive` is labelled "NEG"; the `negative`
    # list is not consulted
    return "POS" if w in positive else "NEG"

some_words["valence"] = [ get_valence(term) for term in antonyms ]
# One sentiment column per valence source; only element [0] of each
# valence.get_valence() result is stored
for column, source in [ ("google_sentiment", "google"),
                        ("pattern_sentiment", "pattern"),
                        ("crr", "crr") ]:
    some_words[column] = [ valence.get_valence(term, use = source)[0]
                           for term in antonyms ]
some_words.to_csv("antonyms.csv")