In [1]:
import re
import json
import pickle
import Algorithmia
import numpy as np
import pandas as pd
import recordlinkage
from afinn import Afinn
import tfidf_matcher as tm
import plotly.express as px
from urllib.request import Request, urlopen
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# contains correct muckrack urls with either google scraping, tfidf matching or brute force approach.
with open('../Data/all_muckrack_links.pkl', 'rb') as f:
    author_df = pickle.load(f)

# original news dataset containing article info 
with open('../Data/news.pkl', 'rb') as f:
    news_df = pickle.load(f)

# emails dataset
emails_df = pd.read_csv("../Data/compression_test.txt.gz", compression='gzip',sep = '\t')
emails_df = emails_df.fillna("")
emails_df = emails_df[['email_address', 'first_name_df', 'second_name_df']]

In [3]:
# make a first name column for blocking using recordlinkage
def first_name(url):
    return url[21:].split('-')[0]

# get the outlet the author most commonly works for and has the most articles for
def author_outlet(author):
    return news_df[news_df.author == author].site_name.value_counts().index[0]

# concatenate above 2 fields to compare it with potential email matches
def concatenate_fields(author, outlet):
    return author + ' ' + outlet

In [33]:
def match_disambiguate_deduplicate(authors, emails, key):
    
    authors['first_name_df'] = authors.apply(lambda x: first_name(x['request_url']), axis=1)
    authors['common_outlet'] = authors.apply(lambda x: author_outlet(x['author']), axis=1)
    authors['author + outlet'] = authors.apply(lambda x: concatenate_fields(x['author'], x['common_outlet']), axis=1)
    
    # create and indexer and block columns by the first name
    indexer = recordlinkage.Index()
    indexer.block('first_name_df')
    candidate_links = indexer.index(authors, emails)
    print("candidate_links:", len(candidate_links))
    
    # compare the author + outlet string from authors dataframe with the email string from the emails dataframe
    compare_cl = recordlinkage.Compare()
    compare_cl.string('author + outlet', 'email_address', label = 'match')
    features = compare_cl.compute(candidate_links, authors, emails)
    
    # find all unique author indexes for which a potential email match is found
    author_indexes = list(set([i[0] for i in features[features.match >= 0.5].index]))

    # to test, take only rows where the match confidence is >= 0.5
    features = features[features.match >= 0.5]
    
    # create an empty dataframe to store matches
    matches = pd.DataFrame (columns = ['author_index', 'email_index', 'match_threshold', 'scraper_response'])

    # for every author, having a variety of matching email indexes, take the one with highest match confidence
    for i in author_indexes:
        temp = features.loc[i].sort_values('match', ascending = False).head(1)
        row = pd.Series(list([i, temp.index[0], round(temp.match.values[0], 3), '-']), index = matches.columns)
        matches = matches.append(row, ignore_index = True) 
    
    # do a google search for every matching email IF they have a valid muckrack url found before, else we cannot confirm
    
    client = Algorithmia.client(key)
    algo = client.algo('specrom/Google_scraper/0.1.4')
    algo.set_options(timeout=300)
    
    for i in range(len(matches)):
        if authors.iloc[int(matches.iloc[i].author_index)].method == 'no match':
            matches.loc[i, 'scraper_response'] = 'invalid'
            print("no valid muckrack url to compare with", matches.iloc[i].author_index)
        else:
            email_to_google = emails.iloc[int(matches.iloc[i].email_index)].email_address
            query = {"query": f"{email_to_google} muckrack"}
            results = algo.pipe(query).result
            matches.at[i, 'scraper_response'] = results
            print(i)
        
    return matches

In [34]:
email_matches = match_disambiguate_deduplicate(author_df, emails_df, 'simn//4kdV/plVdXzEM8Y+vtLnc1')

candidate_links: 1263952
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
2

In [57]:
email_matches

Unnamed: 0,author_index,email_index,match_threshold,scraper_response
0,1,2080352,0.552,"[{'position': 1, 'snippet': 'Find James Fowler..."
1,2054,1764154,0.560,"[{'position': 1, 'snippet': 'newsday.com — Kev..."
2,11,205311,0.500,"[{'position': 1, 'snippet': 'Find Jenna Ciccot..."
3,2074,312243,0.615,"[{'position': 1, 'snippet': 'Find Paul Doyle o..."
4,33,38933,0.533,"[{'position': 1, 'snippet': 'Find Matt Cooper ..."
...,...,...,...,...
401,1989,6443,0.538,[]
402,2011,785848,0.581,[]
403,2022,809326,0.625,[]
404,2023,5612,0.528,[]


In [82]:
def found_match(author_index, email_index, scraper_response):
    if not scraper_response or scraper_response == 'invalid':
        return 'Invalid'
    else:
        print(author_df.iloc[author_index].redirect_url, scraper_response[0]['url'])
        print()
        return author_df.iloc[author_index].redirect_url == scraper_response[0]['url']

In [83]:
email_matches['email_checks_out'] = email_matches.apply(lambda x: found_match(x['author_index'], x['email_index'], x['scraper_response']), axis=1)

https://muckrack.com/james-rodger https://muckrack.com/james-fowler-5

https://muckrack.com/david-hall https://muckrack.com/davidjcriblez

https://muckrack.com/jenna-ciccotelli https://muckrack.com/jenna-ciccotelli

https://muckrack.com/paul-doyle https://muckrack.com/paul-doyle-1

https://muckrack.com/matthew-cooper-3 https://muckrack.com/matt-cooper-2

https://muckrack.com/helen-kelly https://muckrack.com/helen-barnes-1

https://muckrack.com/stuart-ballard https://muckrack.com/stuart-patterson

https://muckrack.com/claire-anderson https://muckrack.com/claire-churchard/articles

https://muckrack.com/beth-mishler-elmore https://muckrack.com/beth-mishler-elmore

https://muckrack.com/sophie-harris https://muckrack.com/sophie-bird

https://muckrack.com/alex-putterman https://muckrack.com/stuart-n-brotman/articles

https://muckrack.com/niomi-harris https://muckrack.com/niomi-harris

https://muckrack.com/jessica-schladebeck https://muckrack.com/jessica-chen

https://muckrack.com/brian-maziq

In [89]:
email_matches[email_matches.email_checks_out == True].match_threshold.mean()

0.6078484848484849

In [90]:
email_matches[email_matches.email_checks_out == False].match_threshold.mean()

0.5383629032258065

In [92]:
email_matches.email_checks_out.value_counts()

Invalid    216
False      124
True        66
Name: email_checks_out, dtype: int64

In [93]:
email_matches.to_pickle('email_matches.pkl')