# Calculating PMI

For a given target word t="crime", and all other co-occurring words w (dictionary), need to compute:

P(t) = (total # of emails w/ "crime" tokens)/(total # of emails in corpus)
- len of crime subset / len of corpus

P(w) = (total # of emails w/ w tokens)/(total # of emails in corpus)
- keyword search for w in full corpus -> len of w subset / len of corpus

P(t, w) = (# of emails with "crime" and w in the same email)/(total # of emails in entire corpus)
- keyword search for w in crime subset -> keep count or len of t,w subset / len of corpus

In [1]:
import pandas as pd
# pd.set_option('display.max_rows', None)
# import yaml
import pickle
import math

## Primary candidates PMI

In [2]:
primary_candidates_corpus = pd.read_csv('../corpus_sample/primary_candidates_corpus.csv')

In [3]:
# leprimary_candidates_corpusus
primary_corpus_length = len(primary_candidates_corpus)

In [4]:
primary_crime_subset = pd.read_csv('../corpus_sample/primary_candidates_crime_subset.csv')

In [5]:
# calculating P(t)
primary_p_t = len(primary_crime_subset) / len(primary_candidates_corpus)
primary_p_t

0.008154201352375469

In [None]:
pmi = pd.read_csv('pmi_primary.csv', index_col=0)

In [None]:
pmi = pmi.loc[pmi['p_w'] > 0]

In [13]:
import numpy as np

In [None]:
pmi['p(tw)/p(t)p(w)'] = pmi['p_tw'] / (pmi['p_w'] * primary_p_t)

In [None]:
pd.set_option('display.float_format', lambda x: '%0.5f' % x)

In [None]:
pmi['pmi'] = np.log(pmi['p(tw)/p(t)p(w)'])

In [None]:
sort = pmi.sort_values('pmi', ascending = False)

In [None]:
sort.head(50)

In [None]:
sort = sort.rename(columns={'p_w': 'p(w)', 'p_tw': 'p(t,w)', 'p(tw)/p(t)p(w)': 'p(t,w)/p(t)p(w)'})
sort.loc[~sort['w'].str.contains('crime')].head(10)

# Primary candidates PMI (original case)

In [6]:
# generate list of w
with open('../pickle/primary_candidates_crime_tokens_original_case', 'rb') as c:
    original_case_crime_tokens = pickle.load(c)

# word_count maps tokens with their frequencies in crime subset
original_case_w_count = {}

for email in original_case_crime_tokens:
    for word in email:
        original_case_w_count[word] = 1 + original_case_w_count.get(word, 0)

In [None]:
# initialize PMI dataframe with w's
w_list = [w for w in original_case_w_count if original_case_w_count[w] > 20]
original_case_pmi = pd.DataFrame(w_list, columns=['w'])

In [None]:
original_case_pmi

In [None]:
original_case_pmi['corpus_w_count'] = original_case_pmi['w'].apply(count_keyword_primary)    

In [None]:
original_case_pmi['p(w)'] = original_case_pmi['corpus_w_count'] / primary_corpus_length

In [None]:
original_case_pmi['p(t,w)'] = original_case_pmi['w'].apply(calculate_cooccurrence_primary)

In [None]:
original_case_pmi.to_csv('new_pmi.csv')

In [7]:
new_pmi = pd.read_csv('pmi_calculations.csv', index_col=0)

In [8]:
new_pmi

Unnamed: 0,w,corpus_w_count,p(w),"p(t,w)"
0,hear,27544,0.161815,269
1,soon,6815,0.040037,54
2,asked,5014,0.029456,56
3,Democrats,61219,0.359648,581
4,stand,45426,0.266868,621
...,...,...,...,...
1943,Sandy Smith,107,0.000629,13
1944,Butterfield,109,0.000640,13
1945,G.K. Butterfield,93,0.000546,13
1946,Doug Collins,92,0.000540,17


In [9]:
new_pmi['p(t,w)/p(t)p(w)'] = new_pmi['p(t,w)'] / ((new_pmi['p(w)'] * primary_p_t))


In [10]:
pd.set_option('display.float_format', lambda x: '%0.5f' % x)

In [14]:
new_pmi['pmi'] = np.log(new_pmi['p(t,w)/p(t)p(w)'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [15]:
new_pmi = new_pmi.sort_values('pmi', ascending=False).loc[new_pmi['corpus_w_count'] > 20]

In [17]:
new_pmi = new_pmi.head(100)

In [18]:
new_pmi

Unnamed: 0,w,corpus_w_count,p(w),"p(t,w)","p(t,w)/p(t)p(w)",pmi
1468,“Unsure,37,0.00022,37,20875005.73560,16.85406
1185,war crimes,36,0.00021,36,20875005.73560,16.85406
1474,an urgent public opinion poll,33,0.00019,33,20875005.73560,16.85406
1873,the fine values,32,0.00019,32,20875005.73560,16.85406
1874,our President Donald J Trump,32,0.00019,32,20875005.73560,16.85406
...,...,...,...,...,...,...
1301,Roger Stone,163,0.00096,45,5763038.39326,15.56698
1075,abusing,227,0.00133,62,5701543.41677,15.55625
1136,looters,122,0.00072,33,5646517.94487,15.54655
1903,Joe's campaign chip,167,0.00098,44,5500001.51117,15.52026


# Keyword analysis to find out unique # of senders

In [19]:
def unique_senders(w):
    senders = set()
    for index, row in primary_candidates_corpus.iterrows():
        if w in str(row['body_text']):
            senders.add(row['name'])
    return len(senders)

In [20]:
new_pmi['unique_senders'] = new_pmi['w'].apply(unique_senders)
filtered_pmi = new_pmi.loc[new_pmi['unique_senders'] > 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_pmi['unique_senders'] = new_pmi['w'].apply(unique_senders)


In [22]:
def list_senders(w):
    senders = set()
    for index, row in primary_candidates_corpus.iterrows():
        if w in str(row['body_text']):
            senders.add(row['name'])
    return senders

Unnamed: 0,w,corpus_w_count,p(w),"p(t,w)","p(t,w)/p(t)p(w)",pmi,unique_senders
1468,“Unsure,37,0.00022,37,20875005.73560,16.85406,7
1185,war crimes,36,0.00021,36,20875005.73560,16.85406,11
1474,an urgent public opinion poll,33,0.00019,33,20875005.73560,16.85406,7
829,their crimes,30,0.00018,30,20875005.73560,16.85406,20
1072,Trump’s crimes,30,0.00018,30,20875005.73560,16.85406,17
...,...,...,...,...,...,...,...
1301,Roger Stone,163,0.00096,45,5763038.39326,15.56698,60
1075,abusing,227,0.00133,62,5701543.41677,15.55625,92
1136,looters,122,0.00072,33,5646517.94487,15.54655,41
1903,Joe's campaign chip,167,0.00098,44,5500001.51117,15.52026,2


In [23]:
filtered_pmi['senders'] = filtered_pmi['w'].apply(list_senders)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pmi['senders'] = filtered_pmi['w'].apply(list_senders)


In [24]:
filtered_pmi.loc[~filtered_pmi['w'].str.contains('crime')].head(50)

Unnamed: 0,w,corpus_w_count,p(w),"p(t,w)","p(t,w)/p(t)p(w)",pmi,unique_senders,senders
1468,“Unsure,37,0.00022,37,20875005.7356,16.85406,7,"{Marie Newman, Sima Ladjevardian, Brynne Kenne..."
1474,an urgent public opinion poll,33,0.00019,33,20875005.7356,16.85406,7,"{Marie Newman, Sima Ladjevardian, Brynne Kenne..."
1462,her administration’s Department of Justice,27,0.00016,27,20875005.7356,16.85406,7,"{Marie Newman, Sima Ladjevardian, Brynne Kenne..."
1137,ANTIFA members,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}"
1138,our nation’s cities,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}"
1142,our border control,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}"
1144,our great nation’s principles,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}"
1553,misdemeanors,21,0.00012,21,20875005.73559,16.85406,18,"{William Weld, Lee Zeldin, Mary Jennings ""M.J...."
1824,my Complete & Total Endorsement,41,0.00024,40,20365859.25424,16.82937,2,"{Steve Bullock, Bill Hagerty}"
1864,-Nikki Haley,38,0.00022,37,20325663.47939,16.82739,2,"{Alek Skarlatos, Bill Hagerty}"


In [26]:
def count_email_addresses(w):
    senders = set()
    for index, row in primary_candidates_corpus.iterrows():
        if w in str(row['body_text']):
            senders.add(row['from_address'])
    return len(senders)

In [27]:
filtered_pmi['unique_emails'] = filtered_pmi['w'].apply(count_email_addresses)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_pmi['unique_emails'] = filtered_pmi['w'].apply(count_email_addresses)


In [30]:
filtered_pmi.loc[~filtered_pmi['w'].str.contains('crime')].head(50)

Unnamed: 0,w,corpus_w_count,p(w),"p(t,w)","p(t,w)/p(t)p(w)",pmi,unique_senders,senders,unique_emails
1468,“Unsure,37,0.00022,37,20875005.7356,16.85406,7,"{Marie Newman, Sima Ladjevardian, Brynne Kenne...",3
1474,an urgent public opinion poll,33,0.00019,33,20875005.7356,16.85406,7,"{Marie Newman, Sima Ladjevardian, Brynne Kenne...",2
1462,her administration’s Department of Justice,27,0.00016,27,20875005.7356,16.85406,7,"{Marie Newman, Sima Ladjevardian, Brynne Kenne...",3
1137,ANTIFA members,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}",10
1138,our nation’s cities,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}",10
1142,our border control,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}",10
1144,our great nation’s principles,21,0.00012,21,20875005.73559,16.85406,3,"{Debbie Lesko, Alek Skarlatos, Tony Gonzales}",10
1553,misdemeanors,21,0.00012,21,20875005.73559,16.85406,18,"{William Weld, Lee Zeldin, Mary Jennings ""M.J....",18
1824,my Complete & Total Endorsement,41,0.00024,40,20365859.25424,16.82937,2,"{Steve Bullock, Bill Hagerty}",2
1864,-Nikki Haley,38,0.00022,37,20325663.47939,16.82739,2,"{Alek Skarlatos, Bill Hagerty}",2


In [31]:
filtered_pmi.at[1553, 'senders']

{'Antonio Delgado',
 'Christina Finello',
 'Earl Blumenauer',
 'Jason Smith',
 'Joe Biden',
 'Joe Walsh',
 'John Cornyn',
 'Lee Zeldin',
 'Lindsey Boylan',
 'Lindsey Graham',
 'Lisa Scheller',
 'Mary Jennings "M.J." Hegar',
 'Mike Quigley',
 'Ritchie Torres',
 'Sara Jacobs',
 'Tom Cotton',
 'Wendy Davis',
 'William Weld'}