In [1]:
from collections import defaultdict
from copy import deepcopy
import numpy as np
import pandas as pd
import pickle
from scipy import stats
import sys
from pyspan.config import *
mode = settings["mode"]
assert mode == "crec"
from pyspan import valence

# Pre-processing

In [2]:
# Load data
# LoP_2AFC_2_original contains the original data downloaded from
# Qualtrics. LoP_2AFC_2 is identical, except I recoded the
# free responses to the education question to be standardized.
df = pd.read_csv("LoP_2AFC_2.csv", keep_default_na=False)

In [3]:
# See if there are any responses I should reject
df_ = df[2:]
# Check if anyone replied "no" to any of the questions on the consent form
pd.concat([ df_[df_["Q5_1"]!="Yes"], df_[df_["Q5_2"]!="Yes"], 
            df_[df_["Q5_3"]!="Yes"], df_[df_["Q5_4"]!="Yes"] ])

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,DistributionChannel,UserLanguage,Q5_1,...,Q614_6_TEXT,Q615,Q616,Q616_8_TEXT,Q617,Q618,Q619,Q620,Q621,Condition


In [4]:
# See if anyone didn't finish the survey
unfinished = df_.loc[df_["Finished"] == "FALSE"]
print "Excluding some participants"*(not unfinished.empty)
df_ = df_.loc[df_["Finished"] == "TRUE"]
print len(df_)

Excluding some participants
175


In [5]:
# Combine conditions (differ in question order)
df1 = df_[map(str, range(1, 197))]
df1.replace(["-99", ""], np.nan, inplace = True)
df2 = df_[map(lambda n: str(n+.1), range(1, 197))]
df2.rename(columns = dict([ (str(i+.1), str(i)) for i in range(1, 197) ]),
           inplace = True)
df2.replace(["-99", ""], np.nan, inplace = True)
df__ = df1.fillna(df2)
# Check if anyone left every question blank
any(map(lambda i: df__.loc[i].isnull().values.all(), df__.index))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


False

In [6]:
minidf = df__[map(str, range(1, 99))]

In [7]:
minidf["Condition"] = df_["Condition"]
minidf["party"] = df_["Q612"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [8]:
# Helper functions for cleaning demographic responses
def int_(s):
    if s == "-99":
        return np.nan
    return int(s)

def get_gender(row):
    gender = row["Q611"]
    if "f" in gender.lower():
        return 1
    if "m" in gender.lower():
        return 0
    return np.nan

def get_party_identity(row):
    pi = row["Q613"]
    if pi == "-99":
        return np.nan
    return int(pi[0])-4

def get_political_leanings(row):
    pl = row["Q614"]
    if pl in ("-99", "Other (please specify)"):
        return np.nan
    return {
        "Very Liberal": -2,
        "Moderately Liberal": -1,
        "Moderate": 0,
        "Moderately Conservative": 1,
        "Very Conservative": 2
    }[pl]
    
def get_political_engagement(row):
    pe = row["Q615"]
    if pe == "-99":
        return np.nan
    return int(pe[0])-4
    
def get_education(row):
    edu = row["Q617"]
    if edu in ("-99", ""):
        return np.nan
    return {
        "High school": 0,
        "Some college": 1,
        "Associate's/professional/vocational degree": 2,
        "Bachelor's degree": 3,
        "Master's degree": 4,
        "Higher-level graduate degree": 5
    }[edu]

def get_voted(row):
    voted = row["Q619"]
    if voted == "-99":
        return np.nan
    if voted.lower()[:2] == "no" or voted.lower()[:2] == "didn't":
        return 0
    return 1

def get_political_bubble(row):
    bubble = row["Q620"]
    if bubble == "-99":
        return np.nan
    bubble = filter(lambda s: s.isdigit(), bubble)
    # If the participant entered text as a response, just ignore it
    # because I can't think of a systematic way to convert text responses
    # to numbers
    if len(bubble) == 0:
        return np.nan
    return int_(filter(lambda s: s.isdigit(), bubble))

In [9]:
# Add demographics
minidf["age"] = map(int_, df_["Q610"])
minidf["gender"] = df_.apply(get_gender, axis = 1)
minidf["party"] = df_["Q612"].replace("-99", np.nan)
minidf["party_identity"] = df_.apply(get_party_identity, axis = 1)
minidf["political_leanings"] = df_.apply(get_political_leanings,
                                        axis = 1)
minidf["political_engagement"] = df_.apply(get_political_engagement,
                                          axis = 1)
minidf["C-Span"] = df_.apply(lambda row: "C-Span" in row["Q616"],
                            axis = 1)
minidf["education"] = df_.apply(get_education, axis = 1)
minidf["voted"] = df_.apply(get_voted, axis = 1)
minidf["political_bubble"] = df_.apply(get_political_bubble,
                                      axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

How many people failed the attention check?

In [10]:
ATC_FAILED = []
for i in df__.index:
    atcs = df__.loc[i][map(str, range(99, 197))].values
    atcs = filter(lambda a: isinstance(a, str), atcs)
    original = df__.loc[i][map(str, range(1, 99))].values
    for i_ in (0, 1):
        if i_ == 1 and len(atcs) == 1:
            continue
    atc_f = ( not all([ a in original for a in atcs ]) ) or ( len(atcs) < 2 )
    ATC_FAILED.append(atc_f)
minidf["ATC_FAILED"] = ATC_FAILED
print "%f of participants (%d) failed the attention check"%(ATC_FAILED.count(True)/float(len(ATC_FAILED)), ATC_FAILED.count(True))
minidf = minidf[minidf.ATC_FAILED == False]
minidf = minidf.drop("ATC_FAILED", 1)

0.451429 of participants (79) failed the attention check


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [11]:
# How much missing data is there?
# Each number in this list is the number of questions skipped by a
# participant, excluding 0s. So if 3 participants each skip 1 question,
# the list will be [ 1, 1, 1 ].
nskipped = minidf.isnull().sum(axis = 1)
filter(lambda i: i != 0, nskipped.tolist())

[1, 1, 1, 1, 1, 1, 1, 1, 2, 1]

In [12]:
minidf = minidf.reset_index()

In [13]:
print "N = %d"%(len(minidf))

N = 96


# Reformat and save needed data

In [14]:
pkls = pickle.load(open(paths["metrics_dir"] + "partial_kls-unigrams"))
probs = pickle.load(open(paths["metrics_dir"] + "probs-unigrams"))
signals = pickle.load(open(paths["metrics_dir"] + "signals-unigrams"))
pkls_std = deepcopy(pkls)
pkls_std["dmetric"] = stats.mstats.zscore(pkls["dmetric"])
pkls_std["rmetric"] = stats.mstats.zscore(pkls["rmetric"])
words = pickle.load(open("survey_terms.pkl", "rb"))
partisan = words[:-10]
assert len(partisan) == 88
antonyms_ = words[-10:]
assert len(antonyms_) == 10

In [15]:
def get_(df, col, x):
    try:
        return df.loc[x][col]
    except KeyError:
        return np.nan

In [16]:
def get_diff_sq(pair):
    w1, w2 = pair
    return (get_(pkls, "dmetric", w1) - get_(pkls, "rmetric", w1))**2 + (get_(pkls, "dmetric", w2) - get_(pkls, "rmetric", w2))**2

In [17]:
diff_sqs = dict(zip(map(tuple, words.values), map(get_diff_sq, words.values)))

In [18]:
def save(dat, fn, plus = 1):
    to_save = [np.nan]*(len(dat)*16)
    to_save = np.array(to_save).reshape((len(dat), 16))
    for i, (w1, w2) in enumerate(dat.values):
        to_save[i][0] = i+plus
        to_save[i][3] = get_(pkls_std, "dmetric", w1)
        to_save[i][4] = get_(pkls_std, "rmetric", w1)
        to_save[i][5] = get_(pkls_std, "dmetric", w2)
        to_save[i][6] = get_(pkls_std, "rmetric", w2)
        to_save[i][7] = diff_sqs[(w1, w2)]
        to_save[i][8] = get_(signals, "dmetric", w1)
        to_save[i][9] = get_(signals, "rmetric", w1)
        to_save[i][10] = get_(signals, "dmetric", w2)
        to_save[i][11] = get_(signals, "rmetric", w2)
        to_save[i][12] = get_(probs, "dmetric", w1)
        to_save[i][13] = get_(probs, "rmetric", w1)
        to_save[i][14] = get_(probs, "dmetric", w2)
        to_save[i][15] = get_(probs, "rmetric", w2)
    some_words = pd.DataFrame(to_save, columns = [ "index", "word1",
                     "word2", "PKL_D(word1)", "PKL_R(word1)", "PKL_D(word2)",
                     "PKL_R(word2)", "DIFF_SQ", "LOGP_D(word1)", "LOGP_R(word1)", 
                     "LOGP_D(word2)", "LOGP_R(word2)", "P_D(word1)", "P_R(word1)", "P_D(word2)",
                     "P_R(word2)" ])
    some_words["index"] = map(int, some_words["index"])
    some_words = some_words.set_index("index")
    some_words["word1"] = [ pair[0] for pair in dat.values ]
    some_words["word2"] = [ pair[1] for pair in dat.values ]
    some_words["google_valence_word1"] = [ valence.get_valence(pair[0], use = "google")[0] 
                                           for pair in dat.values ]
    some_words["google_valence_word2"] = [ valence.get_valence(pair[1], use = "google")[0] 
                                           for pair in dat.values ]
    some_words["pattern_valence_word1"] = [ valence.get_valence(pair[0], use = "pattern")[0]
                                            for pair in dat.values ]
    some_words["pattern_valence_word2"] = [ valence.get_valence(pair[1], use = "pattern")[0] 
                                            for pair in dat.values ]
    some_words["crr_valence_word1"] = [ valence.get_valence(pair[0], use = "crr")[0]
                                        for pair in dat.values ]
    some_words["crr_valence_word2"] = [ valence.get_valence(pair[1], use = "crr")[0]
                                        for pair in dat.values ]
    some_words.to_csv(fn)

In [19]:
save(partisan, "partisan_words.csv")
save(antonyms_, "antonyms.csv", plus = 89)

In [20]:
minidf.to_csv("responses.csv")