In [None]:
# Mount Google Drive at /content/drive; the pickled count tables read by the
# cells below live under that mount point. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare
from collections import defaultdict
import math

# Chi-squared goodness of fit tests


In [None]:
def generate_observed_and_expected_freqs(word, df):
  """Return observed and expected counts of `word` for Israelis vs. Palestinians.

  The observed counts are the sums of 'Number of Occurrences' over the rows of
  `df` whose 'Word' equals `word`, taken separately for the 'Israelis' and
  'Palestinians' entities. The expected counts assign half of the combined
  total to each entity (an even 50/50 split).

  Returns:
    A pair `([israelis, palestinians], [half, half])`.
  """
  matches_word = df['Word'] == word

  def total_for(entity):
    # Sum the occurrences of `word` attributed to a single entity.
    rows = matches_word & (df['Entity'] == entity)
    return df.loc[rows, 'Number of Occurrences'].sum()

  observed = [total_for('Israelis'), total_for('Palestinians')]
  half = (observed[0] + observed[1]) / 2

  return observed, [half, half]

In [None]:
def calc_pval(observed, expected):
  """Return the p-value of a chi-squared goodness-of-fit test.

  `observed` and `expected` are handed to `scipy.stats.chisquare` as the
  observed and expected frequencies; the test statistic itself is discarded.
  """
  return chisquare(f_obs=observed, f_exp=expected).pvalue

In [None]:
def run_chisquared(word, df, alpha=0.05):
  """Test whether `word` is used equally often for Israelis and Palestinians.

  Runs a chi-squared goodness-of-fit test of the observed counts against an
  even 50/50 split of their total.

  Args:
    word: Value of the 'Word' column to test.
    df: Frame with 'Entity', 'Word' and 'Number of Occurrences' columns.
    alpha: Significance level; the result is significant when pval < alpha.

  Returns:
    `(pval, significant, observed)` where `observed` is
    `[israelis_count, palestinians_count]`. When the word never occurs for
    either entity, `pval` is NaN and `significant` is False.
  """
  observed, expected = generate_observed_and_expected_freqs(word, df)

  # With no occurrences at all the expected counts are zero and the test is
  # undefined. Report a missing p-value (NaN) rather than infinity: a p-value
  # is a probability, so it can never exceed 1, and NaN is what pandas treats
  # as "no value" in the results table.
  if sum(observed) == 0:
    return math.nan, False, observed

  pval = calc_pval(observed, expected)

  # bool() keeps the flag a plain Python bool rather than a numpy bool.
  return pval, bool(pval < alpha), observed

## Overall

In [None]:
# Load the overall per-entity word counts from Drive.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df_overall = pd.read_pickle('/content/drive/My Drive/articles/per_entity_counts.pkl')

In [None]:
# Display the overall counts table (Entity, Word, Number of Occurrences).
df_overall

Unnamed: 0,Entity,Word,Number of Occurrences
0,Israelis,died,66
1,Israelis,killed (active),8
2,Israelis,killed (passive),461
3,Israelis,murdered,24
4,Palestinians,died,206
5,Palestinians,killed (active),5
6,Palestinians,killed (passive),4942
7,Palestinians,murdered,2


## Murdered

In [None]:
# Is 'murdered' split 50/50 between Israelis and Palestinians in the overall counts?
run_chisquared('murdered', df_overall)

(np.float64(1.5992473593247364e-05), True, [np.int64(24), np.int64(2)])

## Killed (active)


In [None]:
# Is 'killed (active)' split 50/50 between Israelis and Palestinians in the overall counts?
run_chisquared('killed (active)', df_overall)

(np.float64(0.40538055645894244), False, [np.int64(8), np.int64(5)])

## Killed (passive)

In [None]:
# Is 'killed (passive)' split 50/50 between Israelis and Palestinians in the overall counts?
run_chisquared('killed (passive)', df_overall)

(np.float64(0.0), True, [np.int64(461), np.int64(4942)])

## Died

In [None]:
# Is 'died' split 50/50 between Israelis and Palestinians in the overall counts?
run_chisquared('died', df_overall)

(np.float64(2.08877116859715e-17), True, [np.int64(66), np.int64(206)])

## Per News Site


In [None]:
# Load the per-news-site word counts from Drive.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df_pernews = pd.read_pickle('/content/drive/My Drive/articles/per_site_counts.pkl')

In [None]:
# Keep only the columns used by the per-site chi-squared tests.
per_site_columns = ['News Site', 'Entity', 'Word', 'Number of Occurrences']
df_pernews = df_pernews[per_site_columns]

In [None]:
# Run the 50/50 chi-squared test for every (news site, word) pair.
newssites = df_pernews['News Site'].unique()
words = df_pernews['Word'].unique()

combinations = [(newssite, word) for word in words for newssite in newssites]

# Slice the frame once per news site instead of re-filtering the whole frame
# for every (site, word) pair; the slice does not depend on the word.
site_frames = {site: df_pernews[df_pernews['News Site'] == site] for site in newssites}

# pvals[news site][word] -> (pval, significant, observed) from run_chisquared.
pvals = defaultdict(lambda : defaultdict(int))

for newssite, word in combinations:
  pvals[newssite][word] = run_chisquared(word, site_frames[newssite])

In [None]:
# Flatten pvals (news site -> word -> result tuple) into one record per test.
# A comprehension keeps its loop names local, so the notebook-level `words`
# array of distinct words is not overwritten by a per-site results mapping.
data_ = [
    {
        'News Site' : site,
        'Word' : term,
        'pval' : pval,
        'significant' : significant,
        'observed' : observed
    }
    for site, site_results in pvals.items()
    for term, (pval, significant, observed) in site_results.items()
]

df_pernews_sigtests = pd.DataFrame(data_)

# Show the 'died' results for every news site, ordered by site name.
df_pernews_sigtests[(df_pernews_sigtests['Word'] == 'died')].sort_values(by='News Site')

Unnamed: 0,News Site,Word,pval,significant,observed
3,AP News,died,8e-06,True,"[0, 20]"
7,BBC,died,0.025347,True,"[0, 5]"
11,CNN,died,0.008151,True,"[0, 7]"
19,Dailymail,died,0.041087,True,"[9, 20]"
15,Fox News,died,0.738883,False,"[4, 5]"
23,Guardian,died,0.005473,True,"[28, 53]"
27,Hindustan Times,died,0.004678,True,"[3, 15]"
35,Independent,died,0.004678,True,"[8, 24]"
31,India,died,0.317311,False,"[1, 0]"
39,Indian Express,died,0.095581,False,"[2, 7]"


## Active vs Passive Voice per News Site



In [None]:
# Load the per-news-site active/passive voice counts from Drive.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df_apvoice = pd.read_pickle('/content/drive/My Drive/articles/news_ap_voice.pkl')

In [None]:
# Keep only the columns used by the voice tests.
voice_columns_kept = ['News Site', 'Entity', 'Active Voice Count', 'Passive Voice Count']
df_apvoice = df_apvoice[voice_columns_kept]

In [None]:
# Display the voice-count table (one row per news site and entity).
df_apvoice

Unnamed: 0,News Site,Entity,Active Voice Count,Passive Voice Count
0,AP News,Palestine,1634,616
1,AP News,Israel,29267,2946
2,AP News,IDF,123,19
3,AP News,Hamas,1285,136
4,BBC,Palestine,260,231
5,BBC,Israel,10516,1735
6,BBC,IDF,1570,320
7,BBC,Hamas,663,42
8,CNN,Palestine,241,173
9,CNN,Israel,13005,1674


In [None]:
def generate_ap_observed_expected_freqs(newssite, entity, voice_type, df):
  """Return observed and expected voice counts for one entity pair at one site.

  Args:
    newssite: Value of the 'News Site' column to restrict `df` to.
    entity: 'states' to compare Israel with Palestine, or 'actors' to compare
      IDF with Hamas.
    voice_type: 'active' or 'passive', selecting which count column is summed.
    df: Frame with 'News Site', 'Entity', 'Active Voice Count' and
      'Passive Voice Count' columns.

  Returns:
    A pair `([first, second], [half, half])` where the observed counts are in
    the order (Israel, Palestine) or (IDF, Hamas), and `half` is half of their
    combined total (an even 50/50 split).

  Raises:
    ValueError: If `entity` or `voice_type` is not one of the accepted values.
  """
  voice_columns = {'active': 'Active Voice Count', 'passive': 'Passive Voice Count'}
  entity_pairs = {'states': ('Israel', 'Palestine'), 'actors': ('IDF', 'Hamas')}

  # Fail loudly on typos: an unrecognised voice type must not be silently
  # counted as passive, and an unrecognised entity must not return None.
  if voice_type not in voice_columns:
    raise ValueError(f"voice_type must be 'active' or 'passive', got {voice_type!r}")
  if entity not in entity_pairs:
    raise ValueError(f"entity must be 'states' or 'actors', got {entity!r}")

  voice_col = voice_columns[voice_type]
  site_rows = df[df['News Site'] == newssite]

  observed = [
      site_rows.loc[site_rows['Entity'] == name, voice_col].sum()
      for name in entity_pairs[entity]
  ]
  expected_freq = (observed[0] + observed[1]) / 2

  return observed, [expected_freq, expected_freq]

In [None]:
def run_chisquared_voice(newssite, entity, voice_type, df, alpha=0.01):
  """Test whether a voice count is split 50/50 within an entity pair at one site.

  Args:
    newssite: News site to restrict `df` to.
    entity: 'states' (Israel vs. Palestine) or 'actors' (IDF vs. Hamas).
    voice_type: 'active' or 'passive'.
    df: Frame of per-site voice counts.
    alpha: Significance level; the result is significant when pval < alpha.
      NOTE(review): the default here is 0.01 while the word-level tests use
      0.05 - confirm the stricter threshold is intentional.

  Returns:
    `(pval, significant, observed)`. When both observed counts are zero,
    `pval` is NaN and `significant` is False.
  """
  observed, expected = generate_ap_observed_expected_freqs(newssite, entity, voice_type, df)

  # With no counts at all the expected frequencies are zero and the test is
  # undefined; return NaN directly instead of letting scipy divide by zero.
  if sum(observed) == 0:
    return math.nan, False, observed

  pval = calc_pval(observed, expected)

  # bool() keeps the flag a plain Python bool rather than a numpy bool.
  return pval, bool(pval < alpha), observed

In [None]:
# Run the four voice tests (states/actors x passive/active) for every news site.
newssites = df_apvoice['News Site'].unique()

# pvals_ap[news site][test label] -> (pval, significant, observed).
pvals_ap = defaultdict(lambda : defaultdict(int))

for newssite in newssites:
  # Passive tests are stored before active ones, states before actors.
  for voice in ('passive', 'active'):
    for group in ('states', 'actors'):
      pvals_ap[newssite][group + ' ' + voice] = run_chisquared_voice(newssite, group, voice, df_apvoice)

In [None]:
# Flatten pvals_ap (news site -> test label -> result tuple) into one record per test.
voice_data_ = [
    {
        'News Site' : site,
        'Type' : label,
        'pval' : outcome[0],
        'significant' : outcome[1],
        'observed' : outcome[2]
    }
    for site, site_results in pvals_ap.items()
    for label, outcome in site_results.items()
]

df_voice_sigtests = pd.DataFrame(voice_data_)

# Show the 'states passive' results for every news site, ordered by site name.
is_states_passive = df_voice_sigtests['Type'].str.contains('states passive')
df_voice_sigtests[is_states_passive].sort_values(by='News Site')

Unnamed: 0,News Site,Type,pval,significant,observed
0,AP News,states passive,0.0,True,"[2946, 616]"
4,BBC,states passive,3.376888e-252,True,"[1735, 231]"
8,CNN,states passive,3.010641e-267,True,"[1674, 173]"
16,Dailymail,states passive,0.0,True,"[6026, 444]"
12,Fox News,states passive,1.351932e-201,True,"[1292, 144]"
20,Guardian,states passive,0.0,True,"[11465, 2576]"
24,Hindustan Times,states passive,3.497058e-264,True,"[1954, 304]"
32,Independent,states passive,0.0,True,"[3213, 730]"
28,India,states passive,3.5098899999999996e-30,True,"[220, 37]"
36,Indian Express,states passive,1.328236e-149,True,"[1450, 346]"


# Active vs Passive Voice Overall

In [None]:
# Overall voice counts per entity, summed over every row of df_apvoice.

def _voice_total(entity, column):
  # Sum one voice-count column over all rows belonging to `entity`.
  return df_apvoice.loc[df_apvoice['Entity'] == entity, column].sum()

palestine_active_voice = _voice_total('Palestine', 'Active Voice Count')
palestine_passive_voice = _voice_total('Palestine', 'Passive Voice Count')

israel_active_voice = _voice_total('Israel', 'Active Voice Count')
israel_passive_voice = _voice_total('Israel', 'Passive Voice Count')

idf_active_voice = _voice_total('IDF', 'Active Voice Count')
idf_passive_voice = _voice_total('IDF', 'Passive Voice Count')

hamas_active_voice = _voice_total('Hamas', 'Active Voice Count')
hamas_passive_voice = _voice_total('Hamas', 'Passive Voice Count')



In [None]:
def generate_observed_expected_freqs(observed):
  """Pair two observed counts with the expected counts of an even split.

  Returns `observed` unchanged together with `[half, half]`, where `half` is
  half of the sum of the first two entries of `observed`.
  """
  first, second = observed[0], observed[1]
  half = (first + second) / 2

  return observed, [half, half]

In [None]:
def run_chisquared_voice_overall(observed):
  """Test whether two overall counts are split 50/50.

  `observed` holds the two counts to compare. Returns
  `(pval, significant, observed)`, with `significant` true when the
  chi-squared p-value is below 0.05.
  """
  counts, even_split = generate_observed_expected_freqs(observed)

  pval = calc_pval(counts, even_split)

  # bool() keeps the flag a plain Python bool rather than a numpy bool.
  return pval, bool(pval < 0.05), counts

In [None]:
# Overall active-voice counts: Palestine vs. Israel.
run_chisquared_voice_overall([palestine_active_voice, israel_active_voice])

(np.float64(0.0), True, [np.int64(8814), np.int64(266009)])

In [None]:
# Overall passive-voice counts: Palestine vs. Israel.
run_chisquared_voice_overall([palestine_passive_voice, israel_passive_voice])

(np.float64(0.0), True, [np.int64(6446), np.int64(37935)])

In [None]:
# Overall active-voice counts: IDF vs. Hamas.
run_chisquared_voice_overall([idf_active_voice, hamas_active_voice])

(np.float64(0.0), True, [np.int64(24590), np.int64(15969)])

In [None]:
# Overall passive-voice counts: IDF vs. Hamas.
run_chisquared_voice_overall([idf_passive_voice, hamas_passive_voice])

(np.float64(0.0), True, [np.int64(4322), np.int64(1320)])