# Apply the slant measure from Gentzkow and Shapiro (2010) 

## Set-up

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from nltk import ngrams
import re
import string
import glob
import os
# Switch to the data folder; the CSV and stop-word files below are opened by
# relative path. Every backslash is doubled so that no sequence such as "\h"
# is left for Python to interpret as an (invalid) escape sequence.
os.chdir('C:\\Users\\donggwan.kim\\Desktop\\hein-daily')

## Speaker - Speech - Chamber - Party data

In [None]:
### data are downloaded from https://data.stanford.edu/congress_text
# Load the speaker/speech table from the working directory. Later cells read
# its 'speaker_id', 'party', 'text', 'lastname', 'firstname', 'chamber',
# 'state' and 'district' columns.
speaker_df = pd.read_csv('speaker_df.csv')
# Notebook preview of the first rows.
speaker_df.head()

## Pre-processing

In [None]:
# Basic stopwords: NLTK's English list.
STOPWORDS = set(stopwords.words('english'))

# Add the additional stopwords (Fox's stopwords) that Gentzkow and Shapiro
# used in their paper.
# source: https://gist.github.com/maxwelld90/6bafbf2570877c4d1de0
with open('classical_stopwords.txt') as f:
    lines = f.readlines()
# The first five lines of the file are skipped (presumably a header in the
# downloaded list -- verify against the file); rstrip drops the newline.
FOX_stopwords = {word.rstrip() for word in lines[5:]}

# Merge the two lists.
STOPWORDS.update(FOX_stopwords)

# Stemmer shared by the tokenising functions below.
ps = PorterStemmer()

# Extend the punctuation set with the bullet character. The tokenising
# functions read string.punctuation, so the module attribute itself is
# extended; the guard keeps re-running this cell from appending it again.
if '•' not in string.punctuation:
    string.punctuation = string.punctuation + '•'

# The tokenising functions strip punctuation BEFORE the stop-word test, so
# "aren't" reaches that test as "arent" and "i'm" as "im". Add the
# punctuation-free spelling of every stopword so both forms are filtered.
_strip_punctuation = str.maketrans('', '', string.punctuation)
STOPWORDS_wo_punc = [word.translate(_strip_punctuation) for word in STOPWORDS]
STOPWORDS.update(STOPWORDS_wo_punc)

## 1) For each word, get the total number of times it was used by each party

In [None]:
def one_gram_function(arg):
    """Count the stemmed unigrams of a text.

    The text is lower-cased, stripped of punctuation, and runs of spaces are
    collapsed. "health care" and "child care" are rewritten as single words
    because both spellings occur and the topic matters for the ads. Tokens
    shorter than two characters, stopwords and all-digit tokens are dropped;
    the remaining tokens are Porter-stemmed.

    Parameters
    ----------
    arg : str
        Raw text.

    Returns
    -------
    collections.Counter
        Mapping from stem to its number of occurrences in ``arg``.
    """
    # Local import: keeps the function working even if the notebook-level name
    # `string` has been rebound to something else (e.g. used as a loop variable).
    import string

    text = arg.lower()
    # re.escape makes every punctuation character literal inside the class.
    # Unescaped, the "\]" pair in string.punctuation is read as an escaped
    # bracket and backslashes are left in the text.
    text = re.sub('[' + re.escape(string.punctuation) + ']', '', text)
    # collapse runs of spaces so the two-word replacements below can match
    text = re.sub(r' +', ' ', text)
    text = text.replace('health care', 'healthcare')
    text = text.replace('child care', 'childcare')
    # filter on the surface form, then stem what is left
    stems = [
        ps.stem(word)
        for word in word_tokenize(text)
        if len(word) >= 2 and word not in STOPWORDS and not word.isdigit()
    ]
    return Counter(stems)

### Republicans

In [None]:
# Pool the text of every row whose party is 'R' into one space-separated
# string. str.join builds the result in a single pass; repeated `+` on a
# growing string copies it on every iteration.
Rep = ' '.join(speaker_df[speaker_df['party'] == 'R']['text'])

### Democrats 

In [None]:
# Pool the text of every row whose party is 'D' into one space-separated
# string. str.join builds the result in a single pass; repeated `+` on a
# growing string copies it on every iteration.
Dem = ' '.join(speaker_df[speaker_df['party'] == 'D']['text'])

### apply the counter function

In [None]:
# Stem-frequency Counter for each party's pooled text (Republican first).
Rep_count, Dem_count = map(one_gram_function, (Rep, Dem))

### make them as dictionary objects 

In [None]:
# Plain-dict copies of the two Counters.
Rep_count_dict = dict(Rep_count.items())
Dem_count_dict = dict(Dem_count.items())

# Vocabulary for the chi-squared statistics: every stem used by either party,
# de-duplicated (about 100,000 words).
all_words = list(set([*Rep_count_dict, *Dem_count_dict]))

### validity check

In [None]:
# Spot-check two stems; for each one the Republican count is printed first,
# then the Democratic count.
# Values recorded next to these prints: childcar -> 59 (R), 360 (D);
# terror -> 2664 (R), 1834 (D).
for stem in ('childcar', 'terror'):
    print(Rep_count[stem])
    print(Dem_count[stem])

## 2) chi-squared statistics

In [None]:
# Chi-squared statistic of every word in the vocabulary, from the 2x2 table
# (word / any other word) x (Republican / Democrat):
#   (f_r*f_~d - f_d*f_~r)^2 / [(f_r+f_d)(f_r+f_~r)(f_d+f_~d)(f_~r+f_~d)]
chi_squared_list = {}

# The party totals do not depend on the word, so they are summed once here
# instead of once per vocabulary entry.
total_r = sum(Rep_count.values())
total_d = sum(Dem_count.values())

for word in all_words:
    # a word missing from one party's counts was used zero times by that party
    f_r = Rep_count_dict.get(word, 0)
    f_d = Dem_count_dict.get(word, 0)

    # occurrences of all OTHER words, per party
    f_not_r = total_r - f_r
    f_not_d = total_d - f_d

    num = (f_r * f_not_d - f_d * f_not_r) ** 2
    denom = (f_r + f_d) * (f_r + f_not_r) * (f_d + f_not_d) * (f_not_r + f_not_d)

    chi_squared_list[word] = float(num / denom)

## 3) Feature selection

In [None]:
### import (1) 60 ad creatives (transcribed texts) and (2) two aggregated speech data
### ad creatives from Kantar Media cannot be shared due to NDA
### speech data can be downloaded from American Presidency Project (https://www.presidency.ucsb.edu/)
### we collect all public speech data from both candidates during the primary election
folder = 'ALL_ADS_AND_SPEECHES_FINAL'
# the pattern is built from `folder` so the directory name is written only once
file_loc = 'C:\\Users\\donggwan.kim\\Desktop\\Video_Transcribing_Final_corrected\\' + folder + '\\*.txt'
# glob returns matches in arbitrary order; sorting makes every run process the
# files in the same sequence
file_paths = sorted(glob.glob(file_loc))
print(len(file_paths))

In [None]:
name_list = []
string_list = []

# Built once for all files. re.escape makes every punctuation character
# literal inside the class, so backslashes are stripped as well.
punctuation_pattern = re.compile('[' + re.escape(string.punctuation) + ']')

for file in file_paths:
    ### file code: the first eight characters of the file name, taken from the
    ### final path component so that directory names and the path separator
    ### cannot interfere
    name_list.append(os.path.basename(file)[0:8])
    ### read the text file
    with open(file, encoding="utf8") as f:
        lines = f.readlines()
    ### one string per file, lines separated by a single space
    string_concat = ' '.join(lines)
    ### em and en dashes become spaces so the words they join stay separate
    string_concat = string_concat.replace("—", " ").replace("–", " ")
    ### initial pre-processing
    outcome = string_concat.lower().strip()  # lower case, trim the ends
    outcome = outcome.replace("\n", "")  # drop the line breaks kept by readlines
    outcome = re.sub(r'\d+', '', outcome)  # remove numbers
    outcome = punctuation_pattern.sub('', outcome)  # remove punctuation
    string_list.append(outcome)

In [None]:
# Pool every pre-processed ad and speech into one space-separated string.
# Joining without a loop also avoids a loop variable named `string`, which
# would rebind the imported `string` module at notebook level and break any
# cell that reads string.punctuation when it is re-run.
ad_and_speech = ' '.join(string_list)

# stem frequencies over the pooled ads and speeches
Freq_1 = one_gram_function(ad_and_speech)

In [None]:
# Candidate features: stems that occur at least twice and at most 100 times
# in the pooled ads and speeches.
feature_selection = [
    stem for stem, occurrences in Freq_1.items() if 2 <= occurrences <= 100
]

In [None]:
# Chi-squared statistic of each candidate feature. A candidate that is absent
# from chi_squared_list raises KeyError.
chi_squared_list_short = {
    str(candidate): float(chi_squared_list[str(candidate)])
    for candidate in feature_selection
}

In [None]:
### keep the candidate words with the highest chi-squared statistics
NUM_OF_WORDS = 1000

# Rank the scores once; the cutoff does not depend on the word being tested.
ranked_scores = sorted(chi_squared_list_short.values(), reverse=True)
if len(ranked_scores) > NUM_OF_WORDS:
    # score at position NUM_OF_WORDS (0-based): words strictly above it are
    # kept, so ties at the cutoff can leave fewer than NUM_OF_WORDS words
    cutoff = ranked_scores[int(NUM_OF_WORDS)]
else:
    # fewer candidates than requested: keep all of them
    cutoff = float('-inf')

political_words = [
    word for word in feature_selection if chi_squared_list_short[word] > cutoff
]
print(len(political_words))

### test
print('nuclear' in political_words)
print('childcar' in political_words)

## 3) mapping phrases to ideology

### Relative frequency by speaker among the selected words

In [None]:
def one_gram_function_2(arg):
    """Count, in a text, only the stems selected as political words.

    The text is tokenised and stemmed by ``one_gram_function``, so it gets
    exactly the pre-processing used when the political words were selected,
    including the "health care" / "child care" normalisation. Stems such as
    'childcar' are therefore counted the same way at both stages. Only stems
    present in the notebook-level ``political_words`` are kept.

    Parameters
    ----------
    arg : str
        Raw text.

    Returns
    -------
    collections.Counter
        Mapping from selected stem to its number of occurrences in ``arg``.
        Stems that do not occur are absent, so a lookup returns 0.
    """
    # set membership is constant-time; political_words itself is a list
    selected = set(political_words)
    return Counter({
        stem: count
        for stem, count in one_gram_function(arg).items()
        if stem in selected
    })

In [None]:
# Distinct speaker ids; new_list collects one relative-frequency vector per
# id, in the same order.
speaker_list = list(set(speaker_df['speaker_id']))
a = 0  # progress counter, printed at the start of every iteration
new_list = []

for speaker in speaker_list:
    print(a)
    # rows belonging to this speaker; only the text of the first matching row
    # is tokenised
    speaker_rows = speaker_df[speaker_df['speaker_id'] == int(speaker)]
    word_freq = one_gram_function_2(speaker_rows.iloc[0]['text'])
    # count of every political word, in political_words order; the Counter
    # returns 0 for a word the speaker never used
    counts = np.array([word_freq[str(word)] for word in political_words])
    # share of each word among the speaker's political-word occurrences
    # (the division yields nan entries when the speaker used none of them)
    new_list.append(counts / counts.sum())
    a += 1
print(len(new_list[0]))

### Create the final data

In [None]:
# Empty table: one row per speaker, one column per political word.
regression_df = pd.DataFrame(index=np.arange(len(speaker_list)), columns=political_words)
# Copy each relative-frequency vector into its row.
for row_number, frequencies in enumerate(new_list):
    regression_df.iloc[row_number] = frequencies
# speaker_list is in the same order as new_list, so ids line up with rows.
regression_df['speaker_id'] = speaker_list

### add additional information

In [None]:
# Speaker attributes to attach to the frequency table.
tmp = speaker_df[['speaker_id', 'lastname', 'firstname', 'chamber', 'state', 'party', 'district']].copy()
# Only 'state' changes name: the later merge with the vote-share table joins
# on 'state_2'.
tmp = tmp.rename(columns={'state': 'state_2'})
### merge
combregression_df_2 = pd.merge(regression_df, tmp, on='speaker_id', how='left').copy()
### fill NA b/c district is missing for S
# Assign the filled column back. Calling fillna(inplace=True) on the selected
# column is a chained in-place operation: pandas 2.x warns about it and, with
# Copy-on-Write enabled, it leaves the DataFrame unchanged.
combregression_df_2['district'] = combregression_df_2['district'].fillna(100)
### add a constant (intercept column for the regressions)
combregression_df_2['constant'] = 1
combregression_df_2.head()

### Vote going to republican candidate  (collected from Daily Kos)
### for senators use state level, for house representatives use district level

In [None]:
# Senate vote shares are read as they are.
df_vote = pd.read_csv('senate_vote_share_going_to_rep.csv')
# House vote shares: drop the columns that are not needed, then rename
# 'state' to 'state_2' (the other entries of the mapping keep their names).
house_column_names = {
    'state': 'state_2',
    'district': 'district',
    'chamber': 'chamber',
    'rep_vote_share': 'rep_vote_share',
}
df_vote_2 = (
    pd.read_csv('house_vote_share_going_to_rep.csv')
    .drop(columns=['Unnamed: 0', 'fullname', 'party'])
    .rename(columns=house_column_names)
)
# Stack the two tables row-wise.
df_vote_final = pd.concat([df_vote, df_vote_2], axis=0)
df_vote_final.head()

### merge it to the main data

In [None]:
# Attach the vote-share columns to every speaker row; speakers without a
# matching (chamber, state_2, district) entry keep NaN in those columns.
combregression_df_3 = combregression_df_2.merge(
    df_vote_final,
    how='left',
    on=['chamber', 'state_2', 'district'],
).copy()

### run regressions

In [None]:
from statsmodels.compat.pandas import Appender
import statsmodels.api as sm

In [None]:
# Keep only the rows without any missing value.
combregression_df_3.dropna(inplace = True)

length = len(political_words)
print('total # of words:', length)

# Regressors shared by every regression: the intercept column and the
# Republican vote share.
design = combregression_df_3[['constant', 'rep_vote_share']].astype(float)

alpha_list = []
beta_list = []
for variable in political_words:
    # y: f_{pc}, the relative frequency of this word for each speaker
    # x: a_{p} (intercept) and b_{p} (slope on rep_vote_share)
    response = combregression_df_3[str(variable)].astype(float)
    sm_fit = sm.OLS(response, design).fit()
    # params follow the column order of `design`: intercept first, then slope
    alpha, beta = sm_fit.params
    alpha_list.append(alpha)
    beta_list.append(beta)

# One row per word with its intercept and slope.
coefficient_rows = list(zip(political_words, alpha_list, beta_list))
data_coef = pd.DataFrame(coefficient_rows, columns = ['phrase', 'alpha', 'beta'])

## 4) apply the mapping derived from 3) to ads

In [None]:
folder = 'ALL_ADS_AND_SPEECHES_FINAL'
file_loc = 'C:\\Users\\donggwan.kim\\Desktop\\Video_Transcribing_Final_corrected\\' + folder + '\\*.txt'
# glob returns matches in arbitrary order; sort so that the slice below picks
# the same files on every run
file_paths = sorted(glob.glob(file_loc))
# assumes the two speech files sort after the 60 ads -- verify the file naming
file_paths = file_paths[0:60] ### the last two are speech data

# Per-word regression coefficients as plain floats, looked up once instead of
# filtering data_coef for every word of every file.
alpha_by_word = dict(zip(data_coef['phrase'], data_coef['alpha'].astype(float).tolist()))
beta_by_word = dict(zip(data_coef['phrase'], data_coef['beta'].astype(float).tolist()))

RA_FILE = []
slant_index = []

for file in file_paths:
    ### open the transcribed text; utf8 matches how these same files are read
    ### when the features are selected
    with open(file, encoding="utf8") as f:
        lines = f.readlines()
    ### one string per file; backslashes are removed (nomination speech has
    ### some issues) and each line loses its trailing whitespace
    ad_text = ' '.join(line.replace("\\", "").rstrip() for line in lines)
    ### em and en dashes become spaces, as in the feature-selection step
    ad_text = ad_text.replace("—", " ").replace("–", " ")
    ### lower case
    ad_text = ad_text.lower()
    ### frequencies of the political words in this ad
    Freq = one_gram_function_2(ad_text)
    total_count = sum(Freq.values())
    ### slant score: sum(beta * (share - alpha)) / sum(beta^2) over all words
    num = 0
    denom = 0
    for word in political_words:
        alpha = alpha_by_word[word]
        beta = beta_by_word[word]
        # a word that does not appear in the ad has share 0; total_count is
        # only used as a divisor when at least one word was counted
        share = Freq[word] / total_count if word in Freq else 0
        num = num + beta * (share - alpha)
        denom = denom + (beta * beta)
    y_hat = num / denom
    ### save it: the file code is the first eight characters of the file name
    RA_FILE.append(os.path.basename(file)[0:8])
    slant_index.append(y_hat)

In [None]:
# Pair every file code (as an integer) with its slant estimate and save.
file_numbers = [int(code) for code in RA_FILE]
output_rows = list(zip(file_numbers, np.array(slant_index)))
df_output = pd.DataFrame(output_rows, columns = ['FILE_NUM_RA_CODING', 'slant'])
df_output.to_csv('slant_estimates.csv')
df_output.head()