In [None]:
import praw
import re
import pandas as pd
import psycopg2
from dotenv import load_dotenv
from datetime import datetime
from matplotlib import pyplot as plt
from scripts.reddit import scrape_political
import seaborn as sns
import numpy as np
from IPython.display import display 
load_dotenv()
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
import dotenv
dotenv.load_dotenv()
import os
import requests
import json
# Database connection shared by every query in this notebook.
# The DSN is read from PUBPOL_DATABASE_URL (load_dotenv() above makes .env values
# visible) so credentials need not live in the notebook; the original local
# development DSN remains the fallback.
conn = psycopg2.connect(
    os.environ.get(
        "PUBPOL_DATABASE_URL",
        "postgres://postgres:postgres@127.0.0.1:5432/pubpol",
    )
)

# Domains excluded from the news-source analysis. The markdown further down
# describes them as video, twitter and image-meme links that would otherwise
# dominate the similarity metrics.
blacklist = [
    'www.reddit.com',
    'redd.it',
    'i.redd.it',
    'v.redd.it',
    'youtube.com',
    'youtu.be',
    'i.imgur.com',
    'imgur.com',
    'discord.gg',
    'parler.com',
    'google.com',
    't.co',
    'jssocial.pw',
    'magaimg.net',
    'streamable.com',
    'pic8.co',
    'kek.gg',
    'www.youtube.com',
    'gfycat.com',
    'memefly.me',
    'vgy.me',
    'imgoat.com',
    'twitter.com',
    'vimeo.com',
    'soundcloud.com',
    'mega.nz',
]
             
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Why do we have different political beliefs?

According to rational choice theory, people act *rationally* to maximize their happiness according to some *utility function*. People tend to take this as a fact rather than a model, which leads to a lot of tongue-wagging about voters acting against their own self-interest, and the subsequent inference that "voters who disagree with me are stupid."

We can expand this idea to make a more specific statement: 
* IF people are rational
* AND they have the same self-interest
* AND they operate from the same information
* AND they interpret this information through the same set of experiences and values
* THEN they will make the same decisions.  

I'd contend that none of these conditions are met in practice. In this project, I'm specifically looking at the condition, **People operate from the same information** using the population **users of political subreddits**.

## What constitutes 'information'?

In this context, I am using 'information' to mean a description of facts as presented by a publication. A collection of stories creates a narrative. There are three ways that a publication can create a narrative that differs from the *true* state of things:

1. Create false information and present it as true.
2. Selectively report true information.
3. Contextualize information with other facts or opinion.

This project is not going to exhaustively look at any of these, but it will touch on points 2 and 3. 

# Do political subreddits use the same news?

I'm going to look at the domains linked by political subreddits to get some evidence for the idea that people are really just looking at completely different news sources.

To get data, I accessed the reddit API on multiple days, and got information about the "hot 1000" posts for each subreddit of interest. 

In [None]:
# Run the scraper imported from scripts.reddit.
# NOTE(review): presumably this pulls posts from the Reddit API into the `reddit`
# table queried below — confirm in scripts/reddit.py.
scrape_political()

# What subreddits post from similar domains?

We can look at this question in two different ways - using raw post counts, and adjusting them by the upvote scores. Adjusting by the upvote scores tells us, "if we were to go browse that subreddit, what domains would we see most?"

It turns out that (self-described) conservatives and republicans use similar sites, liberals/progressives/democrats use similar sites, and libertarians have their own sites.

The anarchocapitalist subreddit seems to almost exclusively post memes and videos, so it's not worth including here.

In [None]:
# Subreddits (lower-cased names) whose linked domains are compared below.
subs = [
    'politics',
    'democrats',
    'liberal',
    'neoliberal',
    'progressive',
    'libertarian',
    'conservative',
    'conservatives',
    'republican',
]

In [None]:
# Raw post counts per (domain, subreddit), pivoted to a domain × subreddit table.
query = '''
(select 
    domain, lower(subreddit) as subreddit, count(1) as count
from reddit
group by domain, subreddit)
'''
count_table = pd.read_sql(query, conn).pivot(index='domain', columns='subreddit', values='count')
# Remove blacklisted domains. Indexing .loc with a set gives an arbitrary row
# order and is rejected by pandas >= 2.0; drop() keeps the pivot's order and
# ignores blacklist entries that never occur in the data.
count_table = count_table.drop(index=blacklist, errors='ignore')

In [None]:
# Share of each subreddit's total (non-blacklisted) score held by each domain.
# The inner query divides every post's score by its subreddit total; the outer
# query sums those shares per domain/subreddit pair.
quoted_blacklist = ", ".join("'" + domain + "'" for domain in blacklist)
query = '''
select lower(domain) as domain, lower(subreddit) as subreddit, sum(score) as score from
(select domain, subreddit, 1.0 * score / sum(score) over (partition by subreddit) as score
from reddit
where domain not in ({})) a
group by domain, subreddit
'''.format(quoted_blacklist)

scores = pd.read_sql(query, conn)

# Same data in both orientations: domains as rows, then subreddits as rows.
score_table = scores.pivot(values='score', index='domain', columns='subreddit')
score_table_inverse = scores.pivot(values='score', index='subreddit', columns='domain')

In [None]:
# Display the domain × subreddit table of score shares built above.
score_table

In [None]:
# Scale each subreddit's row so its most-linked domain equals 1.
sti_subs = score_table_inverse.loc[subs].fillna(0)
sti_normalized = sti_subs.div(sti_subs.max(axis=1), axis=0)
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
sti_normalized = sti_normalized.drop(columns=blacklist, errors='ignore')

# The 30 domains with the largest summed normalized score across `subs`.
domains_to_plot = list(
    sti_normalized.loc[subs].sum().sort_values(ascending=False).index[:30]
)

In [None]:
# Bias / reliability label shown next to each domain in the plots.
# NOTE(review): the source of these ratings is not recorded in the notebook.
domain_mapping = {
    "thehill.com": "center",
    "babylonbee.com": "satire",
    "thefederalist.com": "right, questionable",
    "thinkprogress.org": "left, questionable",
    "nypost.com": "right-center, questionable",
    "talkingpointsmemo.com": "left",
    "lawandcrime.com": "left-center",
    "apnews.com": "center",
    "dailycaller.com": "right, questionable",
    "slate.com": "left",
    "reason.com": "right-center",
    "nymag.com": "left",
    "theweek.com": "left",
    "pjmedia.com": "right-extreme, questionable",
    "townhall.com": "right-extreme, questionable",
    "deadstate.org": "left",
    "justthenews.com": "right, questionable",
    "newrepublic.com": "left",
    "hotair.com": "right",
    "www.washingtonpost.com": "left-center",
    "politicaldig.com": "left, questionable",
    "redstate.com": "right, questionable",
    "crooksandliars.com": "left",
    "twitchy.com": "right, questionable",
    "newsmaven.io": "unknown",
    "amgreatness.com": "right, questionable",
    "www.breitbart.com": "right-extreme, questionable",
    "legalinsurrection.com": "right",
    "www.nytimes.com": "left-center",
    "prospect.org": "left-center",
    "spectator.org": "right, questionable",
    "issuesinsights.com": "right, questionable",
}

In [None]:
# Domain-by-domain correlation of normalized scores across the selected subreddits.
data_to_plot = sti_normalized.loc[subs, domains_to_plot].dropna(how='all').fillna(0).corr()
# Row labels become "domain: bias". domains_to_plot is data-driven, so a domain
# with no entry in domain_mapping is labelled 'unknown' instead of raising KeyError.
data_to_plot.index = [
    ": ".join([domain, domain_mapping.get(domain, 'unknown')])
    for domain in data_to_plot.columns
]

In [None]:
# Rescale every subreddit column of score_table so it sums to 1, then keep only
# the domains that have a non-zero share somewhere.
filled_scores = score_table.fillna(0)
scores_norm = filled_scores.apply(lambda column: column / sum(column))
has_any_share = scores_norm.sum(axis=1) > 0
scores_norm = scores_norm[has_any_share]

In [None]:
# Total number of posts counted in chi_sq_table.
# NOTE(review): chi_sq_table is only defined in a later cell, so this fails on a
# fresh Restart & Run All.
chi_sq_table.sum().sum()

In [None]:
# (rows, columns) of chi_sq_table.
# NOTE(review): chi_sq_table is only defined in a later cell, so this fails on a
# fresh Restart & Run All.
chi_sq_table.shape

In [None]:
# Rows of chi_sq_table whose counts sum to more than zero.
# NOTE(review): chi_sq_table is only defined in a later cell, so this fails on a
# fresh Restart & Run All.
chi_sq_table[chi_sq_table.sum(1)>0]

In [None]:
from scipy.stats import chi2_contingency, chisquare

# Post counts per (domain, subreddit). The original called .format() on this
# string although it contains no placeholder; that no-op is removed.
query = '''
select lower(domain) as domain, lower(subreddit) as subreddit, count(score) as score
from reddit
group by domain, subreddit
'''

# Contingency table: top domains (rows) × the two conservative subreddits (columns).
chi_sq_table = (
    pd.read_sql(query, conn)
    .pivot(index='domain', columns='subreddit', values='score')
    .fillna(0)
    .loc[domains_to_plot]
)
chi_sq_table = chi_sq_table[['conservative', 'conservatives']]
chi_sq_table = chi_sq_table[chi_sq_table.sum(axis=1) > 0]

# Expected counts under independence: row total × column total / grand total.
grand_total = chi_sq_table.sum().sum()
chi_sq_expected = pd.DataFrame(
    np.outer(chi_sq_table.sum(axis=1), chi_sq_table.sum(axis=0)) / grand_total,
    index=chi_sq_table.index,
    columns=chi_sq_table.columns)

# Per-subreddit goodness of fit against the expected counts (kept from the
# original). Each call sees only one column's part of the independence
# statistic, so these p-values are larger than the joint test's.
for sub in chi_sq_table.columns:
    print(sub, chisquare(chi_sq_table[sub], chi_sq_expected[sub]).pvalue)

# Joint test of independence over the whole table.
_, joint_pvalue, joint_dof, _ = chi2_contingency(chi_sq_table.to_numpy())
print('joint test of independence (dof={})'.format(joint_dof), joint_pvalue)



In [None]:
# Largest share any single domain holds in the `politics` column of scores_norm.
max(scores_norm.politics)

In [None]:
# Hierarchically clustered heatmap of the domain-domain correlation matrix.
# vmin/vmax were swapped (vmin=1, vmax=-1); the colour limits must satisfy
# vmin <= vmax, so correlations now map over [-1, 1].
# NOTE(review): if the swap was meant to flip the colours, use plt.cm.RdBu_r instead.
cm = sns.clustermap(
    data=data_to_plot,
    cmap=plt.cm.RdBu,
    metric='correlation',
    vmin=-1,
    vmax=1,
    yticklabels=1,
)


In [None]:
def _five_highest_domains(row):
    """Return the column labels of the five largest values in `row`, ascending."""
    return list(sti_normalized.columns[np.argsort(row)[-5:]])


# One row per subreddit, one column per rank.
sti_normalized.fillna(0).apply(_five_highest_domains, axis=1, result_type='expand')

#### Blacklisted sites and memes

If I don't blacklist certain domains, the clustering looks very different! Certain subreddits are full of links to videos, twitter posts, and image memes. These posts are so common that they dominate the content similarity metrics.



In [None]:
# Clustered heatmap of subreddit-subreddit correlations over all domains in
# score_table. vmin/vmax were swapped (vmin=1, vmax=-1); the colour limits must
# satisfy vmin <= vmax, so correlations now map over [-1, 1].
# NOTE(review): if the swap was meant to flip the colours, use plt.cm.RdBu_r instead.
sns.clustermap(
    data=score_table.dropna(how='all').fillna(0).corr(),
    cmap=plt.cm.RdBu,
    metric='correlation',
    vmin=-1,
    vmax=1,
)

In [None]:
# Coarse political grouping of subreddits: "blue" or "red".
_blue_subs = [
    "worldpolitics", "dsa", "uspolitics", "environment", "democrats",
    "progressive", "liberal", "politics", "americanpolitics", "neoliberal",
    "worldnews",
]
_red_subs = ["conservative", "conservatives", "republican"]

sub_mapping = {
    **dict.fromkeys(_blue_subs, "blue"),
    **dict.fromkeys(_red_subs, "red"),
}

In [None]:
# Per-subreddit score share of every non-blacklisted domain (same query as score_table).
query = '''
select lower(domain) as domain, lower(subreddit) as subreddit, sum(score) as score from
(select domain, subreddit, 1.0 * score / sum(score) over (partition by subreddit) as score
from reddit
where domain not in ({})) a
group by domain, subreddit
'''.format(", ".join(["'" + b + "'" for b in blacklist]))

top_domains = pd.read_sql(query, conn)
# Subreddits missing from sub_mapping keep their own name as the group label.
top_domains['group'] = top_domains['subreddit'].replace(sub_mapping)
# Sum only the numeric `score` column. A bare .sum() also aggregates the string
# `subreddit` column, which newer pandas turns into concatenated names.
top_domains = top_domains.groupby(['group', 'domain'], as_index=False)['score'].sum()

In [None]:
# Ten highest-scoring domains in the "blue" group.
blue_domains = top_domains.loc[top_domains['group'] == 'blue']
blue_domains.sort_values('score', ascending=False).head(10)['domain'].tolist()

In [None]:
# Ten highest-scoring domains in the "red" group.
red_domains = top_domains.loc[top_domains['group'] == 'red']
red_domains.sort_values('score', ascending=False).head(10)['domain'].tolist()

In [None]:
def n_grams(corpus, n):
    """Compute the mean TF-IDF weight of every k-gram for k = 1..n.

    Parameters
    ----------
    corpus : iterable of str
        Documents passed to TfidfVectorizer.fit_transform.
    n : int
        Largest n-gram order to compute.

    Returns
    -------
    list of pandas.Series
        Element k-1 is indexed by the k-grams kept by the vectorizer
        (max_df=0.5, min_df=5) and holds each term's TF-IDF weight averaged
        over all documents.
    """
    tfidf_ngram = []
    for order in range(1, n + 1):
        vectorizer = TfidfVectorizer(ngram_range=(order, order), max_df=0.5, min_df=5, binary=True)
        tfidf = vectorizer.fit_transform(corpus)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            terms = vectorizer.get_feature_names_out()
        else:
            terms = vectorizer.get_feature_names()
        # Column means taken directly on the sparse matrix, avoiding a dense
        # documents × vocabulary array.
        mean_weights = np.asarray(tfidf.mean(axis=0)).ravel()
        tfidf_ngram.append(pd.Series(mean_weights, index=terms))

    return tfidf_ngram

def contiguous_subset(values):
    """List every way to cut `values` into two or more contiguous pieces.

    Partitions are ordered by number of cut points, then by cut position.
    Sequences shorter than two elements yield an empty list.
    """
    size = len(values)
    cut_positions = range(1, size)
    partitions = []
    for n_cuts in cut_positions:
        for cuts in combinations(cut_positions, n_cuts):
            edges = (0, *cuts, size)
            partitions.append([values[lo:hi] for lo, hi in zip(edges, edges[1:])])
    return partitions

def ngram_importance(ngram, ngrams):
    """Score how much more weight `ngram` carries than its parts.

    The numerator is the n-gram's own value in `ngrams`. The denominator adds up,
    over every contiguous split of the n-gram into shorter pieces, the product of
    the pieces' values (computed as exp of summed logs).

    `ngrams[k-1]` must map each k-gram string to its value, as returned by n_grams.
    """
    tokens = ngram.split()
    split_log_scores = []
    for pieces in contiguous_subset(tokens):
        piece_logs = [
            np.log(ngrams[len(piece) - 1][' '.join(piece)])
            for piece in pieces
        ]
        split_log_scores.append(np.sum(piece_logs))
    whole_score = ngrams[len(tokens) - 1][ngram]
    return whole_score / np.sum(np.exp(split_log_scores))
    

In [None]:
# Every distinct (url, title) pair, including blacklisted domains.
query = '''
select distinct url, title
from reddit
'''
titles = pd.read_sql(sql=query, con=conn)

# Mean TF-IDF weight of all 1- to 6-grams found in the raw titles.
ngrams = n_grams(corpus=titles['title'], n=6)

In [None]:
# Collect 2- to 6-grams whose importance score exceeds 25; these are later
# collapsed into single tokens.
ngrams_to_replace = []
for order in range(1, 6):
    candidates = ngrams[order].index
    ngrams_to_replace.extend(
        phrase for phrase in candidates if ngram_importance(phrase, ngrams) > 25
    )
# Reverse so the highest-order n-grams come first.
ngrams_to_replace.reverse()

In [None]:
# (phrase, token) pairs, where the token is the phrase with whitespace replaced
# by underscores. The pattern is a raw string: '\s' in a normal literal is an
# invalid escape sequence that newer Python versions warn about.
replacements = [(g, re.sub(r'\s', '_', g)) for g in ngrams_to_replace]
replacements

In [None]:
# Distinct (url, title) pairs from non-blacklisted domains.
query = '''
select distinct url, title
from reddit
where domain not in ({})
'''.format(", ".join(["'" + b + "'" for b in blacklist]))

titles = pd.read_sql(query, conn)
titles['title_orig'] = titles.title

# Ordered (regex, placeholder) pairs, applied top to bottom. The more specific
# patterns must run before the generic year and number patterns. Raw strings
# avoid invalid-escape warnings; the patterns are unchanged.
placeholder_patterns = [
    (r'([0-9]+,?)+\.?[,0-9]* \willion', '_number_'),
    (r'(\d+[./]){1,2}(\d+)', '_date_'),
    (r'[A-Za-z]{3} \d+[–\-./]\d+', '_date_'),
    (r'\d{1,2}:\d{2}', '_time_'),
    (r'\d{4}', '_year_'),
    (r'([0-9]+,?)+\.?[,0-9]*', '_number_'),
]
for pattern, placeholder in placeholder_patterns:
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal by
    # default, which silently turns every substitution into a no-op.
    titles['title'] = titles['title'].str.replace(pattern, placeholder, regex=True)
titles['title'] = titles['title'].str.lower()

# Collapse the selected multi-word phrases into single underscore-joined tokens.
# The phrases are plain text, so they are matched literally.
for phrase, token in replacements:
    titles['title'] = titles['title'].str.replace(phrase, token, regex=False)

vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_df=0.5, min_df=5, binary=True, stop_words='english')
tfidf = vectorizer.fit_transform(titles.title)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back for older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
tfidf_df = pd.DataFrame(tfidf.toarray(), columns=feature_names)
# Prepend url / title / title_orig. If a vocabulary word collides with one of
# those names, the metadata column receives the "_1" suffix.
tfidf_df = titles.join(tfidf_df, lsuffix="_1")


In [None]:
# Each post's share of its subreddit's total score (non-blacklisted domains only).
query = '''
select url, lower(subreddit) as subreddit, 1.0 * score / sum(score) over (partition by subreddit) as score
from reddit
where domain not in ({})
'''.format(", ".join(["'" + b + "'" for b in blacklist]))


posts = pd.read_sql(query, conn)
posts_tfidf = posts.merge(tfidf_df, on='url')

# Vocabulary columns start after the columns of `posts` plus the two non-key
# metadata columns contributed by tfidf_df (title, title_orig). The original
# hard-coded 6, one too many for this 3-column `posts`, so the first vocabulary
# column was never weighted by score.
n_meta = posts.shape[1] + 2
posts_tfidf.iloc[:, n_meta:] = posts_tfidf.iloc[:, n_meta:].multiply(posts_tfidf['score'], axis=0)

# Score-weighted TF-IDF summed per subreddit (rows: subreddit, columns: word).
# Grouping only the vocabulary columns also avoids the positional axis argument
# to drop(), which pandas >= 2.0 rejects.
top_words_subreddit = posts_tfidf.iloc[:, n_meta:].groupby(posts_tfidf['subreddit']).sum()

# Ten highest-weighted words per subreddit, in ascending order of weight.
top_10_subreddit = pd.DataFrame(
    top_words_subreddit.apply(
        lambda row: top_words_subreddit.columns[np.argsort(row.to_numpy())[-10:]].to_list(),
        axis=1,
    ).to_list(),
    index=top_words_subreddit.index,
)
display(top_10_subreddit.sort_index())

In [None]:
# Posts linking to thehill.com and apnews.com, each with its share of the
# subreddit's score from those two domains. The subreddit is lower-cased, as in
# the notebook's other queries, so it can match the lower-case keys of sub_mapping.
query = '''
select lower(subreddit) as subreddit, url, lower(domain) as domain, 1.0 * score / sum(score) over (partition by subreddit) as score
from reddit
where domain in ('thehill.com', 'apnews.com')
'''

posts = pd.read_sql(query, conn)
posts['sub_group'] = posts.subreddit.replace(sub_mapping)
posts = posts[posts.sub_group.isin(['red', 'blue'])]
posts_tfidf = posts.merge(tfidf_df, on='url')

# Vocabulary columns start after the columns of `posts` plus the two non-key
# metadata columns contributed by tfidf_df (title, title_orig).
n_meta = posts.shape[1] + 2
posts_tfidf.iloc[:, n_meta:] = posts_tfidf.iloc[:, n_meta:].multiply(posts_tfidf['score'], axis=0)

# Score-weighted TF-IDF summed per group. Grouping only the vocabulary columns
# keeps the string columns (subreddit, domain) out of the sum.
top_words_group = posts_tfidf.iloc[:, n_meta:].groupby(posts_tfidf['sub_group']).sum()

# Ten highest-weighted words per group. This is stored under its own name so the
# numeric top_words_group table stays available to later cells.
top_10_group = pd.DataFrame(
    top_words_group.apply(
        lambda row: top_words_group.columns[np.argsort(row.to_numpy())[-10:]].to_list(),
        axis=1,
    ).to_list(),
    index=top_words_group.index,
)

top_10_group.T


In [None]:
# Average the per-subreddit word weights within each political group.
# The group labels are passed as a separate Series instead of a column on the
# frame, so no string column reaches .mean() (which raises on pandas >= 2.0).
# Subreddits missing from sub_mapping keep their own name as the label.
subreddit_group = pd.Series(
    [sub_mapping.get(name, name) for name in top_words_subreddit.index],
    index=top_words_subreddit.index,
    name='subreddit_group',
)
top_words_grouped = top_words_subreddit.groupby(subreddit_group).mean()
top_words_grouped = top_words_grouped.loc[['blue', 'red']]

In [None]:
# Group-level weight of the token 'susan_collins', highest first.
susan_collins_weights = top_words_grouped['susan_collins']
print(susan_collins_weights.sort_values(ascending=False))


In [None]:
# Words weighted most differently between the red and blue groups.
# This uses top_words_grouped, the numeric group × word table. The earlier
# thehill/apnews cell rebinds top_words_group to a table of top-10 word strings,
# whose rows cannot be subtracted.
red_minus_blue = (top_words_grouped.loc['red'] - top_words_grouped.loc['blue']).sort_values()
print(red_minus_blue[-30:])  # most red-leaning words
print(red_minus_blue[:30])   # most blue-leaning words



In [None]:
# Words with a non-zero weight in the red group.
red_weights = top_words_grouped.loc['red']
red_weights[red_weights != 0]

In [None]:
# Words with a non-zero weight in the blue group.
blue_weights = top_words_grouped.loc['blue']
blue_weights[blue_weights != 0]

In [None]:
# Five subreddits giving the most weight to 'riot' and to 'protest'.
# `top_words` is never defined in this notebook; the subreddit × word table is
# top_words_subreddit.
print(top_words_subreddit['riot'].sort_values(ascending=False)[0:5])
print(top_words_subreddit['protest'].sort_values(ascending=False)[0:5])

In [None]:
# Five subreddits giving the most weight to 'proud_boys' and to 'black_lives_matter'.
# `top_words` is never defined in this notebook; the subreddit × word table is
# top_words_subreddit.
print(top_words_subreddit['proud_boys'].sort_values(ascending=False)[0:5])
print(top_words_subreddit['black_lives_matter'].sort_values(ascending=False)[0:5])

In [None]:
# Five subreddits giving the most weight to 'hunter_biden'.
# `top_words` is never defined in this notebook; the subreddit × word table is
# top_words_subreddit.
# print(top_words_subreddit['benghazi'].sort_values(ascending = False)[0:5])
print(top_words_subreddit['hunter_biden'].sort_values(ascending=False)[0:5])

In [None]:
# Ten words weighted most heavily toward r/conservative relative to r/liberal.
# `top_words` is never defined in this notebook; the subreddit × word table is
# top_words_subreddit.
(top_words_subreddit.loc['conservative'] - top_words_subreddit.loc['liberal']).sort_values()[-10:]

In [None]:
# Ten words weighted most heavily toward r/liberal relative to r/conservative.
# `top_words` is never defined in this notebook; the subreddit × word table is
# top_words_subreddit.
(top_words_subreddit.loc['conservative'] - top_words_subreddit.loc['liberal']).sort_values()[:10]

In [None]:
# Ten words weighted most heavily toward r/democrats relative to r/republican.
# `top_words` is never defined in this notebook; the subreddit × word table is
# top_words_subreddit.
(top_words_subreddit.loc['republican'] - top_words_subreddit.loc['democrats']).sort_values()[:10]

In [None]:
# Ten words weighted most heavily toward r/republican relative to r/democrats.
# `top_words` is never defined in this notebook; the subreddit × word table is
# top_words_subreddit.
(top_words_subreddit.loc['republican'] - top_words_subreddit.loc['democrats']).sort_values()[-10:]

In [None]:
# API key read from the environment; raises KeyError if `google_api_key` is unset.
api_key = os.environ['google_api_key']

def knowledge(query, key=None):
    """Search the Google Knowledge Graph for `query` and return the decoded JSON.

    Parameters
    ----------
    query : str
        Free-text search term. It is passed through `params`, so spaces and
        other special characters are URL-encoded.
    key : str, optional
        API key. Defaults to the module-level `api_key`. The original body
        referenced an undefined name `key`, which raised NameError.

    Returns
    -------
    dict
        The parsed JSON body of the response, whatever its HTTP status.
    """
    if key is None:
        key = api_key
    endpoint = 'https://kgsearch.googleapis.com/v1/entities:search'
    # The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(endpoint, params={'query': query, 'key': key}, timeout=30)
    return response.json()

def parse_knowledge(knowledge):
    """Return the '@type' value of every result in a knowledge-search response.

    Parameters
    ----------
    knowledge : dict
        Decoded response containing an 'itemListElement' list whose items each
        hold a 'result' dict.

    Returns
    -------
    list
        One entry per item, in response order. A response without
        'itemListElement' (for example an error payload) gives an empty list,
        and an item without a '@type' gives an empty list for that entry.
    """
    return [
        item.get('result', {}).get('@type', [])
        for item in knowledge.get('itemListElement', [])
    ]



In [None]:
# Example lookup: knowledge-search response for the term 'riots'.
result = knowledge('riots')

In [None]:
# Entity types of every hit. `result` is the object returned by knowledge(),
# which is already decoded JSON, so passing it to json.loads() raised TypeError.
[item['result']['@type'] for item in result['itemListElement']]

In [None]:
# First hit of the response. `result` is the object returned by knowledge(),
# which is already decoded JSON, so passing it to json.loads() raised TypeError.
result['itemListElement'][0]

In [None]:
# Display the full list of phrases selected for collapsing into single tokens.
ngrams_to_replace

In [None]:
# Look up the first selected phrase with the knowledge-search helper.
result = knowledge(ngrams_to_replace[0])

In [None]:
# Entity types returned for that phrase.
parse_knowledge(result)

In [None]:
# Query the knowledge-search helper once per selected phrase (one HTTP request
# each), printing the position every tenth phrase as a progress marker.
know_ngram = []
for idx, ngram in enumerate(ngrams_to_replace):
    at_progress_mark = (idx % 10 == 0)
    if at_progress_mark:
        print(idx)
    response = knowledge(ngram)
    know_ngram.append(response)

In [None]:
# Display every raw response collected above (one per phrase).
know_ngram

In [None]:
# Print each phrase next to the entity types found for it.
for idx, k in enumerate(know_ngram):
    phrase = ngrams_to_replace[idx]
    entity_types = parse_knowledge(k)
    print(phrase, entity_types)

In [None]:
# Phrase at position `idx`. `idx` is whatever value the most recently run loop
# left behind, so this depends on execution order.
ngrams_to_replace[idx]