In [1]:
import ast
import itertools as it

import hdbscan
import matplotlib.pyplot as plt
import netstats as ns
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import umap
from gensim.models import FastText

# spaCy pipeline used below to parse the responses and to look up word vectors
nlp = spacy.load('en_core_web_sm')

In [4]:
# Survey extract; the relative path is resolved against the working directory
filename = 'ANES_2016.csv'

# one row per respondent (see the column listing in the next cell)
df = pd.read_csv(filename)

In [5]:
# list the columns available in the loaded file
df.columns

Index(['V160001', 'V160001_orig', 'age', 'female', 'education', 'leftright',
       'politicalinterest', 'politicalparticipation', 'TIPI_extraversion',
       'TIPI_agreeableness', 'TIPI_conscientiousness',
       'TIPI_emotionalstability', 'TIPI_openness', 'V161069', 'V161072',
       'V161075', 'V161078', 'V161098', 'V161101', 'V161104', 'V161106'],
      dtype='object')

In [2]:
# Open-ended (free text) questions; each entry is a column of df.
# "R" in the descriptions presumably stands for "respondent".
text_qs = ['V161069', # PRE: Text- What is it that R likes about Democratic Pres cand
           'V161072', # PRE: Text- What is it that R dislikes about Democratic Pres cand
           'V161075', # PRE: Text- What is it that R likes about Republican Pres cand
           'V161078', # PRE: Text- What is it that R dislikes about Republican Pres cand
           'V161098', # PRE: Text- What does R like about Democratic party
           'V161101', # PRE: Text- What does R dislike about the Democratic party
           'V161104', # PRE: Text- What does R like about Republican party
           'V161106'  # PRE: Text- What does R dislike about the Republican party
          ]

# Political covariates
politics = ['leftright', # 0=strong Dem, 6 = strong Rep
            'politicalinterest', # 0 = not at all interested, 3= v interested
            'politicalparticipation', # 0 = no protest or petition, 1 = one activity, 2 = both activities
           ]

# Personality trait scores (TIPI_* columns)
personality = [
               'TIPI_extraversion',
               'TIPI_agreeableness', 
               'TIPI_conscientiousness',
               'TIPI_emotionalstability', 
               'TIPI_openness'
              ]

# Demographic covariates
demographics = ['age', 
                'female', 
                'education']

# to be calculated for each text q
# These strings are used as keys into the per-response stats dicts and as
# column names later in the notebook.
# NOTE(review): 'disssortativity' is spelled with three s's; presumably it
# matches the key produced by ns.network_stats -- confirm before "fixing" it.
stats = ['clustering',
         'giant component',
         'disssortativity',
         'k avg',
         'k std',
         'entropy',
         'density']

In [4]:
# How many respondents gave a (non-missing) answer to each text question?
for q in text_qs:
    sub = df[df[q].notna()]
    print(q, f'{len(sub)} respondents')

V161069 1936 respondents
V161072 2599 respondents
V161075 1849 respondents
V161078 2909 respondents
V161098 2046 respondents
V161101 2297 respondents
V161104 1947 respondents
V161106 2531 respondents


# Network extraction

## Step 1: identify all terms (nodes) and sentences (for embeddings) across all text responses

In [39]:
# Only words tagged with one of these parts of speech are kept as network nodes
keep = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB']

In [40]:
def clean_text(x):
    """Normalise a raw response string before parsing.

    Every '//' and every backslash becomes '. ', then every '.' is followed
    by a space, and the result is lower-cased. The substitutions run in that
    order, so separators end up followed by two spaces.
    """
    substitutions = (('//', '. '), ('\\', '. '), ('.', '. '))

    for old, new in substitutions:
        x = x.replace(old, new)

    return x.lower()

In [41]:
# Parse every response with spaCy; missing answers are parsed as empty strings
for q in text_qs:
    cleaned = df[q].fillna('').map(clean_text)
    df[f'{q}_doc'] = cleaned.apply(lambda text: nlp(text))

In [42]:
raw_nodes = set()
sentences = list()

for q in text_qs:
    for doc in df[f'{q}_doc']:
        # nodes to keep (unclustered): lemmas of tokens with a kept POS tag
        raw_nodes.update(token.lemma_ for token in doc if token.pos_ in keep)

        # sentences (for fasttext embedding): one list of token texts per sentence
        sentences.extend([token.text for token in sent] for sent in doc.sents)

## SKIPPING FOR NOW
### Step 2a: word embeddings using FastText

In [120]:
# Train FastText model on the token lists collected in `sentences`
# NOTE(review): no seed is passed and training uses 4 workers, so the vectors
# may differ from run to run -- confirm whether reproducibility matters here.
model = FastText(sentences, vector_size=300, window=5, min_count=1, workers=4)

In [121]:
# one FastText vector per node word, in raw_nodes iteration order
vecs = [model.wv[word] for word in raw_nodes]

# stack into a (n_words, vector_size) matrix
word_embeddings = np.vstack(vecs)

print(word_embeddings.shape)

(9051, 300)


In [147]:
# spacy embeddings: each node word is run through the pipeline on its own and
# its document vector is kept; rows follow raw_nodes iteration order
sp_embeddings = np.array([nlp(word).vector for word in raw_nodes])

In [148]:
# (number of node words, spaCy vector width)
sp_embeddings.shape

(9051, 96)

### Step 2b: clustering using UMAP + HDBSCAN

Note: This step needs improvement. For now, skip clustering

In [133]:
# Project the FastText vectors down to 10 dimensions.
# NOTE(review): `embedding` is not used by the HDBSCAN cell below, which
# clusters sp_embeddings directly -- confirm which input is intended.
reducer = umap.UMAP(n_components=10) 
embedding = reducer.fit_transform(word_embeddings)

In [149]:
# Cluster the spaCy vectors; one integer label per row of sp_embeddings
# (the label array shown further down contains -1 entries)
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=5, prediction_data=True)
cluster_labels = clusterer.fit_predict(sp_embeddings)

In [78]:
# NOTE(review): the recorded output (9099) does not match the 9051 rows of the
# embedding matrices above -- looks like a stale result from an earlier run.
len(cluster_labels)

9099

In [79]:
# inspect the raw label array
cluster_labels

array([114,  -1, 104, ...,  -1, 131,  -1])

In [150]:
# highest cluster id
# NOTE(review): the recorded output (10) is lower than labels shown in the
# array printed above (e.g. 114, 131); the two outputs come from different runs.
max(cluster_labels)

10

In [152]:
# cluster label -> set of words assigned to it; relies on raw_nodes iterating
# in the same order as when the embeddings were built
clust_words = dict()

for word, label in zip(raw_nodes, cluster_labels):
    clust_words.setdefault(label, set()).add(word)

## Step 3: Construct Network for each response

In [43]:
def get_network(doc):
    """Turn a parsed response into a word network.

    Nodes are the texts of all tokens whose part-of-speech tag is in `keep`
    (duplicates are retained, in document order). Each node word is linked to
    the nearest ancestor in the dependency parse that is itself a node; words
    that are not nodes are skipped over while walking up the tree.

    The parse tree is stored as a word -> head-word mapping, so when a word
    occurs more than once only the head of its last occurrence is used.

    Returns a dict with keys 'nodes' (list of words) and 'edges' (list of
    [child, parent] pairs).
    """
    # Nodes: words to keep
    kept_words = [tok.text for tok in doc if tok.pos_ in keep]

    # Full parse tree as child text -> head text
    head_of = {tok.text: tok.head.text for sentence in doc.sents for tok in sentence}

    # Edges between kept words only
    links = []

    for word, ancestor in head_of.items():
        if word not in kept_words:
            continue

        visited = set()

        # climb until we hit a kept word, revisit a word, or reach a root
        while ancestor not in kept_words and ancestor not in visited:
            visited.add(ancestor)
            above = head_of[ancestor]

            if above == ancestor:  # a root points at itself: nothing further up
                break

            ancestor = above

        if ancestor in kept_words and word != ancestor:
            links.append([word, ancestor])

    # Merging of similar nodes is skipped for now

    return {'nodes': kept_words, 'edges': links}

In [44]:
def get_network_stats(network):
    """Compute summary statistics for one response network.

    `network` is a dict with 'nodes' and 'edges' as produced by get_network.
    Networks with fewer than two nodes or without any edge yield an empty
    dict; otherwise an undirected graph is built (edges first, then all
    nodes so isolated words are included) and passed to ns.network_stats.
    """
    node_list = network['nodes']
    edge_list = network['edges']

    # nothing to measure
    if len(node_list) <= 1 or len(edge_list) == 0:
        return dict()

    G = nx.Graph()
    G.add_edges_from(edge_list)
    G.add_nodes_from(node_list)

    return ns.network_stats(G)

In [45]:
for q in text_qs:
    print(q)

    doc_col = f'{q}_doc'
    network_col = f'{q}_network'

    # network extracted from each parsed response
    df[network_col] = df[doc_col].apply(get_network)

    # statistics of each of those networks (empty dict when too small)
    df[f'{q}_stats'] = df[network_col].apply(get_network_stats)

V161069
V161072
V161075
V161078
V161098
V161101
V161104
V161106


In [46]:
# preview the frame with the added *_network and *_stats columns
df.head()

Unnamed: 0,V160001,V160001_orig,age,female,education,leftright,politicalinterest,politicalparticipation,TIPI_extraversion,TIPI_agreeableness,...,V161104_network,V161106_network,V161069_stats,V161072_stats,V161075_stats,V161078_stats,V161098_stats,V161101_stats,V161104_stats,V161106_stats
0,1,300001,29.0,0.0,1,6.0,2.0,0.0,4.5,4.0,...,"{'nodes': ['more', 'straightforward', 'people'...","{'nodes': [], 'edges': []}",{},{},"{'node_count': 3, 'edge_count': 1, 'clustering...",{},{},"{'node_count': 10, 'edge_count': 9, 'clusterin...","{'node_count': 3, 'edge_count': 2, 'clustering...",{}
1,2,300002,26.0,0.0,3,5.0,3.0,0.0,4.0,4.0,...,"{'nodes': ['pro', '2nd', 'ament', 'small', 'go...","{'nodes': [], 'edges': []}",{},"{'node_count': 5, 'edge_count': 4, 'clustering...","{'node_count': 3, 'edge_count': 2, 'clustering...",{},{},{},"{'node_count': 7, 'edge_count': 6, 'clustering...",{}
2,3,300003,23.0,0.0,1,2.0,1.0,0.0,6.0,3.0,...,"{'nodes': [], 'edges': []}","{'nodes': [], 'edges': []}",{},{},{},{},{},{},{},{}
3,4,300004,58.0,0.0,1,4.0,2.0,0.0,5.0,6.0,...,"{'nodes': ['stand', 'health', 'care', 'issue',...","{'nodes': ['lack', 'unity'], 'edges': [['unity...",{},"{'node_count': 2, 'edge_count': 1, 'clustering...","{'node_count': 6, 'edge_count': 4, 'clustering...",{},{},"{'node_count': 10, 'edge_count': 5, 'clusterin...","{'node_count': 9, 'edge_count': 7, 'clustering...","{'node_count': 2, 'edge_count': 1, 'clustering..."
4,5,300006,38.0,1.0,1,2.0,1.0,0.0,3.0,5.5,...,"{'nodes': ['usually', 'go', 'morals', 'try', '...","{'nodes': [], 'edges': []}",{},"{'node_count': 8, 'edge_count': 4, 'clustering...",{},"{'node_count': 4, 'edge_count': 3, 'clustering...","{'node_count': 5, 'edge_count': 4, 'clustering...",{},"{'node_count': 9, 'edge_count': 8, 'clustering...",{}


In [36]:
# NOTE(review): the *_doc, *_network and *_stats columns hold Python objects;
# a CSV stores only their text form, so they will not come back as objects.
df.to_csv('network_data.csv', index=False)

In [6]:
# Reload saved data
# NOTE(review): after this reload the *_network and *_stats columns are
# strings rather than dicts -- code that indexes them by key needs to parse them.
df = pd.read_csv('network_data.csv')

# Analysis 1: Network structure + personality
1. Transform all free response text into networks
2. Calculate network statistics for each network
3. Multilevel model: $s = \beta p + \alpha_q + \epsilon$. Captures relationship between personal trait $p$ and network statistic $s$ while controlling for question-level random effects $\alpha_q$.
4. Save t-statistics if significant -- these are the correlations of interest

Notes: 
* Controlling for questions allows us to have multiple networks (responses) per individual. May need to cluster standard errors, though
* May want to consider word count and flesch-kincaid score?

In [173]:
from pymer4.models import lmer
import polars as pl

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

### Step 1: reshape data

Each row should be the response to a single question, with [politics, personality, demographics] + [network stats for that answer]

In [47]:
# per-question stats columns created above
stat_col = [f'{q}_stats' for q in text_qs]

# respondent id plus all covariates; these become the id columns of the melt
cols = ['V160001'] + politics + personality + demographics
cols

['V160001',
 'leftright',
 'politicalinterest',
 'politicalparticipation',
 'TIPI_extraversion',
 'TIPI_agreeableness',
 'TIPI_conscientiousness',
 'TIPI_emotionalstability',
 'TIPI_openness',
 'age',
 'female',
 'education']

In [69]:
# Long format: one row per (respondent, question) holding that answer's stats
data = df[cols + stat_col].melt(id_vars=cols, value_vars=stat_col, 
                                var_name='question',
                                value_name='net_stats')

# When df has been reloaded from network_data.csv the stats dicts arrive as
# their string form; parse them so the len() filter and key lookups below
# operate on dicts. In-memory dicts pass through untouched.
# NOTE(review): literal_eval only accepts plain Python literals -- confirm the
# values produced by ns.network_stats serialise that way (e.g. no nan).
data['net_stats'] = data['net_stats'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# drop rows with no network stats; copy so the column assignments below
# write to an independent frame rather than a slice of the melted one
data = data[data['net_stats'].apply(lambda x: len(x) > 0)].copy()

# unwrap stats: one column per statistic
for stat in stats:    
    data[stat] = data['net_stats'].apply(lambda x: x[stat])

In [90]:
# cache the long-format table (the net_stats dict column is written as text)
data.to_csv('stats.csv', index=False)

In [174]:
# reload the cached long-format table
data = pd.read_csv('stats.csv')

In [175]:
def get_tvals(measures, stats, data, return_pvals=False):
    """Fit one mixed model per (measure, network statistic) pair.

    For each pair the model ``measure_stat ~ net_stat + (1 | question)`` is
    fitted with ``lmer`` and the t-statistic and p-value in position 1 of the
    fit results are stored (presumably the net_stat slope, position 0 being
    the intercept -- confirm against the pymer4 output).

    NOTE(review): the analysis notes in this notebook describe the network
    statistic as the outcome and the trait as the predictor; the formula here
    is the other way round -- confirm the intended direction.

    Parameters
    ----------
    measures : list of str
        Columns used as the model outcome.
    stats : list of str
        Network-statistic columns used as the predictor.
    data : DataFrame
        Must contain 'V160001', 'question' and every column named above.
        ``rename`` is called with a plain old->new mapping, which is how the
        polars frame passed by this notebook expects it.
    return_pvals : bool, default False
        When True, also return the matching table of p-values.

    Returns
    -------
    pandas.DataFrame
        t-values with `stats` as index and `measures` as columns; when
        `return_pvals` is True, a ``(t_values, p_values)`` tuple instead.
    """
    t_matrix = np.zeros((len(measures), len(stats)))
    p_matrix = np.zeros((len(measures), len(stats)))

    pairs = it.product(enumerate(measures), enumerate(stats))

    for (measure_index, measure_stat), (net_index, net_stat) in pairs:
        # Work on just the columns the model needs, under fixed names so the
        # formula does not depend on the original column names (several
        # statistic names contain spaces). The rename is applied to the
        # subset, not to the full input frame.
        df = data[['V160001', 'question', measure_stat, net_stat]]
        df = df.rename({measure_stat: 'measure_stat', net_stat: 'net_stat'})

        # run model
        model = lmer('measure_stat ~ net_stat  + (1 | question)', data=df)
        model.fit(no_warnings=True, summarize=False, verbose = False)

        # get t-val
        t_val = model.result_fit['t_stat'][1]

        if np.isnan(t_val):
            # no estimate available: record 0 and say which pair was affected
            t_val = 0
            print(f'Warning: no t_val found for measure {measure_stat}, '
                  f'network statistic {net_stat}. Correlation estimated at 0.')

        t_matrix[measure_index][net_index] = t_val

        # get p-val
        p_matrix[measure_index][net_index] = model.result_fit['p_value'][1]

    corr = pd.DataFrame(t_matrix.T, index=stats, columns=measures)

    if return_pvals:
        pvals = pd.DataFrame(p_matrix.T, index=stats, columns=measures)
        return corr, pvals

    return corr

In [None]:
# Human-readable labels for the analysis columns (for tables and figures).
# Kept as a lookup rather than renaming `data` in place: the following cells
# select columns by their original names (data[['V160001', 'question']] and
# politics + personality + demographics + stats), so an in-place rename would
# make them fail with KeyError. The list of columns to rescale is defined in
# the next cell from those original names.
# NOTE(review): 'female' is labelled 'Gender (M)'; if the column is coded
# 1 = female the label should presumably read 'Gender (F)' -- confirm.
display_names = {'V160001': 'Respondent',
                 'leftright': 'Party ID (R)',
                 'politicalinterest': 'Pol. Interest',
                 'politicalparticipation': 'Pol. Participation',
                 'TIPI_extraversion' : 'Extraversion',
                 'TIPI_agreeableness': 'Agreeableness',
                 'TIPI_conscientiousness': 'Conscientiousness',
                 'TIPI_emotionalstability': 'Emotional Stability',
                 'TIPI_openness': 'Openness',
                 'age': 'Age',
                 'female': 'Gender (M)',
                 'education': 'Education'
                }


In [176]:
# columns we'll need later
keep = data[['V160001', 'question']]

# columns to rescale
rescale = politics + personality + demographics + stats

#data = data.set_index('Respondent')
data = data[rescale]

In [177]:
scaled = pd.DataFrame(scaler.fit_transform(data), columns=rescale, index=data.index)
scaled = scaled.reset_index()

data = pd.concat([keep, scaled], axis=1, keys='Respondent')

In [179]:
data.columns = data.columns.droplevel()
data = data[['V160001', 'question'] + rescale]

In [180]:
# preview the standardised table
data.head()

Unnamed: 0,V160001,question,leftright,politicalinterest,politicalparticipation,TIPI_extraversion,TIPI_agreeableness,TIPI_conscientiousness,TIPI_emotionalstability,TIPI_openness,age,female,education,clustering,giant component,disssortativity,k avg,k std,entropy,density
0,12,V161069_stats,-0.405503,,-0.560405,,,,,,2.061301,0.968417,-0.377468,-0.051554,-2.61482,0.150838,-0.54228,0.165594,0.04314,-1.318046
1,13,V161069_stats,0.512137,1.263961,1.45791,-0.190568,0.212518,-0.249853,-0.362949,0.318407,-1.169521,-1.032613,0.930498,-0.051554,-2.73464,-0.077533,0.932325,0.637962,0.150207,-1.217444
2,15,V161069_stats,-1.323143,1.263961,1.45791,1.909013,1.088969,-1.166969,0.427092,1.206914,0.676663,0.968417,0.930498,-0.051554,-1.736142,-0.305904,-2.907791,0.018187,0.552374,-1.207384
3,17,V161069_stats,-1.323143,1.263961,1.45791,-1.940219,-1.102159,0.667262,0.427092,-0.5701,-0.246429,-1.032613,0.930498,-0.051554,-3.415967,0.073095,0.173337,1.61888,0.303561,-1.467764
4,19,V161069_stats,-1.323143,0.000192,1.45791,-0.540498,0.650743,-0.249853,-0.757969,0.318407,-1.169521,0.968417,0.930498,-0.051554,-2.61482,0.150838,-0.54228,0.165594,0.04314,-1.318046


In [188]:
# respondent-level variables to relate to each network statistic
measures = politics + personality + demographics

# get_tvals is handed a polars copy of the pandas frame
corr2 = get_tvals(measures, stats, pl.DataFrame(data))

In [190]:
# written in CSV format despite the .txt extension
corr2.to_csv('corr2.txt')

In [189]:
# t-values: rows are network statistics, columns are respondent measures
corr2

Unnamed: 0,leftright,politicalinterest,politicalparticipation,TIPI_extraversion,TIPI_agreeableness,TIPI_conscientiousness,TIPI_emotionalstability,TIPI_openness,age,female,education
clustering,0.763257,-0.042712,-1.73906,0.803076,1.984451,0.394748,-1.188427,0.591384,0.454677,0.570603,-0.90414
giant component,-0.771598,-8.546908,-2.758296,-4.337047,-0.348806,-1.517565,2.336227,-7.33535,-2.44041,-1.257808,0.720805
disssortativity,-3.285174,5.591299,2.870927,-0.780461,2.583103,2.08946,-2.171296,5.061006,-1.965638,0.758563,2.532952
k avg,-5.432994,8.398177,5.539519,1.5139,3.537577,2.786866,-1.697912,8.615117,-3.462351,1.476163,6.56495
k std,-5.092588,9.839123,5.611802,1.41371,3.505438,2.571459,-2.187487,9.003738,-2.417656,2.461708,4.56399
entropy,-3.696957,5.274392,2.866195,-1.404655,2.86459,2.155963,-2.423827,4.474926,-2.051201,1.247105,2.898816
density,4.499365,-12.019007,-7.20615,-2.676251,-3.590643,-3.117432,3.05749,-10.660232,1.853349,-3.072373,-3.947613


In [170]:
# NOTE(review): `corr` is not assigned in any cell of this notebook (the
# get_tvals result is stored as corr2), so on a fresh kernel this line would
# raise NameError; presumably a leftover from an earlier run.
corr

Unnamed: 0,leftright,politicalinterest,politicalparticipation,TIPI_extraversion,TIPI_agreeableness,TIPI_conscientiousness,TIPI_emotionalstability,TIPI_openness,age,female,education
clustering,0.763257,-0.042712,-1.73906,0.803076,1.984451,0.394748,-1.188427,0.591384,0.454677,0.570603,-0.90414
giant component,-0.771599,-8.546909,-2.758296,-4.337047,-0.348806,-1.517565,2.336227,-7.33535,-2.44041,-1.257808,0.720805
disssortativity,-3.285174,5.591299,2.870927,-0.780461,2.583104,2.08946,-2.171296,5.061006,-1.965638,0.758563,2.532952
k avg,-5.432994,8.398177,5.539519,1.5139,3.537577,2.786866,-1.697912,8.615117,-3.462351,1.476163,6.56495
k std,-5.092588,9.839123,5.611802,1.41371,3.505438,2.571459,-2.187487,9.003738,-2.417656,2.461708,4.56399
entropy,-3.696957,5.274392,2.866195,-1.404655,2.86459,2.155963,-2.423827,4.474926,-2.051201,1.247105,2.898816
density,4.499365,-12.019007,-7.20615,-2.676251,-3.590643,-3.117432,3.05749,-10.660232,1.853349,-3.072373,-3.947613


# Analysis 2: Self-similarity vs ideological similarity

For each respondent:

    * Identify qs which are ideologically aligned v. not (ie: if Dem, likes about Dem cand)
    * Compare similarity of network structure between aligned v. not aligned qs
    * Identify comparison set of users with same ideology
    * Compare similarity of network structure between same qs