In [1]:
import pandas as pd
import re
import pickle
import tfidf_matcher as tm
pd.options.mode.chained_assignment = None  # default='warn'
from urllib.request import Request, urlopen

In [2]:
# Load the pickled news table. Later cells read its 'author' and
# 'article_count' columns.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted pipeline.
with open('../Data/news.pkl', 'rb') as f:
    news = pickle.load(f)
    # Original author's note: 3844 unique authors (not re-verified here).

# Table of already-known muckrack profiles; its 'url' column is later used as
# the lookup list for fuzzy matching.
muckrack = pd.read_csv('../Data/muckrack_persons_fetchlist.csv')

In [3]:
# Characters stripped from a byline. The hyphen is deliberately absent so that
# double-barrelled names keep their hyphen. Written as a raw string: the
# original literal relied on the invalid escape "\," which newer Python
# versions warn about; the resulting character set is unchanged.
_BYLINE_PUNCTUATION = r'''!()[]{};:'"\,<>./?#$%^&*_~'''
_STRIP_BYLINE_PUNCTUATION = str.maketrans('', '', _BYLINE_PUNCTUATION)

# Single tokens dropped from a byline (outlet names, job titles, filler words).
# NOTE(review): 'editorial board' and 'wire reports' contain a space and
# '--hamburg' contains '--', so none of the three can ever equal a token at the
# point where this set is consulted. They are kept verbatim so the generated
# URLs do not change; decide separately whether they should be made effective.
_BYLINE_KEYWORDS = frozenset([
    'sa', 'tulsa', 'editor', 'writer', 'world-herald', 'news', 'world', 'richmond', 'times-dispatch', 'new',
    'hampshire', 'union', 'leader', 'for', 'the', 'state', 'journal', 'correspondant', 'sfgate', 'special',
    'from', 'the', 'gazette', 'times', 'staff', 'senior', 'dr', 'correspondent', 'by', 'editorial board',
    'research', 'wire reports', 'security', 'real', 'estate', 'to', 'post', 'and', 'courier', 'policy',
    'commercial', 'bureau', 'political', 'roanoke', 'college', 'football', 'editorial', 'democrat-gazette',
    'arizona', 'daily', 'star', '--hamburg', 'column', 'lincoln', 'managing', 'backstage', 'with', 'sports',
    'ii', 'iii', 'capitol', 'media', 'services',
])


def _author_to_url(author):
    """Turn one raw byline string into a guessed muckrack.com profile URL."""
    slug = author.lower()
    slug = re.sub(r'\|.*$', '', slug)  # drop everything from the first '|'
    slug = re.sub(r'\,.*$', '', slug)  # drop everything from the first ','
    slug = slug.translate(_STRIP_BYLINE_PUNCTUATION)  # punctuation except hyphen
    slug = re.sub(r'(\d+)', '', slug)  # digits
    slug = re.sub(r'(--)', '', slug)
    # '.' has already been stripped, so an address looks like 'name@hostcom' here.
    slug = re.sub(r'(\w+@\w+)', '', slug)
    slug = ' '.join(slug.split())  # collapse runs of whitespace
    # Drop ' and ' plus up to two following words (a second author's name).
    slug = re.sub(r'(\s+)(and)(\s+)(\w*)(\s*)(\w*)', '', slug)
    slug = ' '.join(word for word in slug.split() if word not in _BYLINE_KEYWORDS)
    slug = re.sub(r'(@)', '', slug)  # leftover '@' from handles
    return 'https://muckrack.com/' + slug.replace(' ', '-').strip('-')


def form_urls(news_df=None):
    """Build one candidate muckrack URL per author.

    Parameters
    ----------
    news_df : pandas.DataFrame, optional
        Frame with 'author' and 'article_count' columns. When omitted, the
        notebook-level ``news`` frame is used, which keeps the original
        zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        One row per distinct author, sorted by descending count, with columns
        'author', 'article_count' (number of non-null 'article_count' rows for
        that author), 'request_url' and 'redirect_url' (both set to the guessed
        URL), 'method' (set to 'none') and 'confidence' (set to '-').
    """
    if news_df is None:
        news_df = news  # notebook-level frame loaded in an earlier cell

    per_author = (
        news_df[['author', 'article_count']]
        .groupby('author')
        .count()
        .sort_values(['article_count'], ascending=False)
        .reset_index()
    )

    guessed = [_author_to_url(name) for name in per_author['author']]
    per_author['request_url'] = guessed
    per_author['redirect_url'] = guessed
    per_author['method'] = 'none'
    per_author['confidence'] = '-'
    return per_author

In [4]:
# Build the working table: one guessed muckrack URL per author in `news`.
data = form_urls()

In [5]:
# Display the table before any URL has been requested (method is still 'none').
data

Unnamed: 0,author,article_count,request_url,redirect_url,method,confidence
0,Neil Shaw,523,https://muckrack.com/neil-shaw,https://muckrack.com/neil-shaw,none,-
1,James Rodger,512,https://muckrack.com/james-rodger,https://muckrack.com/james-rodger,none,-
2,Jack Davis,251,https://muckrack.com/jack-davis,https://muckrack.com/jack-davis,none,-
3,Adam Wells,223,https://muckrack.com/adam-wells,https://muckrack.com/adam-wells,none,-
4,Sophie McCoid,218,https://muckrack.com/sophie-mccoid,https://muckrack.com/sophie-mccoid,none,-
...,...,...,...,...,...,...
3833,Levi Sumagaysay,10,https://muckrack.com/levi-sumagaysay,https://muckrack.com/levi-sumagaysay,none,-
3834,Ben Ashford,10,https://muckrack.com/ben-ashford,https://muckrack.com/ben-ashford,none,-
3835,Craig Brown,10,https://muckrack.com/craig-brown,https://muckrack.com/craig-brown,none,-
3836,Courtney Connley,10,https://muckrack.com/courtney-connley,https://muckrack.com/courtney-connley,none,-


In [6]:
# Settings shared by every fuzzy lookup below.
lookup = list(muckrack.url)  # candidate URLs taken from the muckrack table
k_matches = 1                # forwarded to tm.matcher as its third argument
ngram_length = 3             # forwarded to tm.matcher as its fourth argument


def match_with_tfdif(wrong_url):
    """Find the closest entry in ``lookup`` for a guessed URL.

    The URL is wrapped in a one-element list and handed to ``tm.matcher``
    together with ``lookup``, ``k_matches`` and ``ngram_length``.

    Returns a ``(match_confidence, right_url)`` tuple read from the first row
    of the matcher result ('Match Confidence' and 'Lookup 1' columns).
    """
    matches = tm.matcher([wrong_url], lookup, k_matches, ngram_length)
    return tuple(matches[column].iloc[0] for column in ('Match Confidence', 'Lookup 1'))

In [7]:
# Smoke test of the matcher: returns a (confidence, closest known URL) tuple.
match_with_tfdif('https://muckrack.com/ryan-taylor')

(0.89, 'https://muckrack.com/ryan-taylor-3')

In [None]:
# Resolve every guessed URL in `data`:
#   1. request the guessed URL directly ("brute force");
#   2. if that fails, request the closest known URL from match_with_tfdif;
#   3. if that fails as well, mark the row as 'no match'.
# On success 'redirect_url' stores the URL the response finally came from.
#
# Changes from the earlier version of this cell:
#   * `except Exception` replaces the bare `except`, so Ctrl-C / kernel
#     interrupt stops the crawl instead of being treated as "page not found";
#   * responses are closed via `with` instead of being leaked;
#   * a timeout keeps one stalled connection from hanging the whole run;
#   * the page body is no longer downloaded, since it was never used;
#   * rows are read and written by label, so both sides address the same row.
# NOTE(review): any failure, including a timeout or other transient network
# error, still falls through to the next strategy, exactly like a missing page.
REQUEST_TIMEOUT = 30  # seconds allowed per request

for j in data.index:
    url = data.at[j, 'request_url']
    try:
        # `req` stays a notebook-level name; a later scratch cell reuses it.
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req, timeout=REQUEST_TIMEOUT) as res:
            final_url = res.url
        data.loc[j, 'method'] = 'brute force'
        data.loc[j, 'redirect_url'] = final_url
        print(j, "found brute force", url)
    except Exception:
        # Direct guess failed: retry with the closest known profile URL.
        confidence, url = match_with_tfdif(url)
        try:
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            with urlopen(req, timeout=REQUEST_TIMEOUT) as res:
                final_url = res.url
            data.loc[j, 'method'] = 'tfidf match'
            data.loc[j, 'confidence'] = confidence
            data.loc[j, 'request_url'] = url
            data.loc[j, 'redirect_url'] = final_url
            print(j, "found tfidf", url)
        except Exception:
            print(j, "found nothing", url)
            data.loc[j, 'method'] = 'no match'
            # TODO (original note): google scraper
    print()

0 found brute force https://muckrack.com/neil-shaw

1 found brute force https://muckrack.com/james-rodger

2 found brute force https://muckrack.com/jack-davis

3 found brute force https://muckrack.com/adam-wells

4 found brute force https://muckrack.com/sophie-mccoid

5 found brute force https://muckrack.com/jack-otway

6 found brute force https://muckrack.com/joseph-zucker

7 found brute force https://muckrack.com/simon-duke

8 found brute force https://muckrack.com/maren-estrada

9 found brute force https://muckrack.com/dave-johnson

10 found brute force https://muckrack.com/brett-gibbons

11 found brute force https://muckrack.com/jenna-ciccotelli

12 found brute force https://muckrack.com/kipp-jones

13 found brute force https://muckrack.com/rianne-addo

14 found brute force https://muckrack.com/scott-polacek

15 found brute force https://muckrack.com/clark-schultz

16 found brute force https://muckrack.com/lewis-winter

17 found brute force https://muckrack.com/tom-bull

18 found b

146 found brute force https://muckrack.com/will-twigger

147 found brute force https://muckrack.com/jack-wright

148 found brute force https://muckrack.com/joe-tansey

149 found brute force https://muckrack.com/daniel-murphy

150 found brute force https://muckrack.com/joe-bray

151 found brute force https://muckrack.com/sophie-bateman

152 found brute force https://muckrack.com/chris-kitching

153 found brute force https://muckrack.com/joshua-smith

154 found brute force https://muckrack.com/jenn-gidman

155 found brute force https://muckrack.com/michelle-marshall

156 found brute force https://muckrack.com/rebecca-davison

157 found brute force https://muckrack.com/verity-sulway

158 found brute force https://muckrack.com/kristopher-knox

159 found brute force https://muckrack.com/ben-hooper

160 found brute force https://muckrack.com/rachel-russell

161 found brute force https://muckrack.com/holly-fleet

162 found brute force https://muckrack.com/paul-kasabian

163 found brute force 

288 found brute force https://muckrack.com/emily-hodgkin

289 found brute force https://muckrack.com/joe-pinkstone

290 found brute force https://muckrack.com/jonathan-adams

291 found brute force https://muckrack.com/emer-scully

292 found brute force https://muckrack.com/erin-coates

293 found brute force https://muckrack.com/luke-kenton

294 found brute force https://muckrack.com/sophie-haslett

295 found brute force https://muckrack.com/josh-challies

296 found brute force https://muckrack.com/peter-sblendorio

297 found brute force https://muckrack.com/abbie-llewelyn

298 found brute force https://muckrack.com/tom-davidson

299 found brute force https://muckrack.com/kurt-zindulka

300 found brute force https://muckrack.com/geoff-earle

301 found brute force https://muckrack.com/monica-greep

302 found brute force https://muckrack.com/julie-delahaye

303 found tfidf https://muckrack.com/andy-medici

304 found brute force https://muckrack.com/jessica-mcbride

305 found brute force h

432 found brute force https://muckrack.com/john-binder

433 found brute force https://muckrack.com/monique-friedlander

434 found brute force https://muckrack.com/dade-hayes

435 found brute force https://muckrack.com/matthew-dresch

436 found brute force https://muckrack.com/marlene-lenthang

437 found brute force https://muckrack.com/mary-kekatos

438 found brute force https://muckrack.com/thomas-bristow

439 found brute force https://muckrack.com/connor-oneill

440 found brute force https://muckrack.com/paige-holland

441 found brute force https://muckrack.com/adam-jones

442 found brute force https://muckrack.com/glen-williams

443 found brute force https://muckrack.com/catherine-murphy

444 found brute force https://muckrack.com/breitbart-london

445 found brute force https://muckrack.com/john-hayward

446 found brute force https://muckrack.com/kelly-ashmore

447 found brute force https://muckrack.com/warren-muggleton

448 found brute force https://muckrack.com/cortney-obrien

449

575 found brute force https://muckrack.com/jack-montgomery

576 found brute force https://muckrack.com/david-ng

577 found brute force https://muckrack.com/ross-ibbetson

578 found brute force https://muckrack.com/jem-aswad

579 found brute force https://muckrack.com/daniel-morrow

580 found brute force https://muckrack.com/dan-carden

581 found brute force https://muckrack.com/jane-herz

582 found brute force https://muckrack.com/emily-scrivener

583 found brute force https://muckrack.com/stephen-johnson

584 found brute force https://muckrack.com/tita-smith

585 found brute force https://muckrack.com/frances-martel

586 found brute force https://muckrack.com/ted-johnson

587 found brute force https://muckrack.com/bridie-pearson-jones

588 found brute force https://muckrack.com/brooks-hays

589 found brute force https://muckrack.com/vandana-singh

590 found brute force https://muckrack.com/katie-harris

591 found brute force https://muckrack.com/edgar-thompson

592 found brute force h

718 found tfidf https://muckrack.com/aodhan-gregory

719 found brute force https://muckrack.com/helena-vesty

720 found brute force https://muckrack.com/danyal-hussain

721 found brute force https://muckrack.com/graeme-young

722 found brute force https://muckrack.com/robin-fambrough

723 found brute force https://muckrack.com/kirsty-feerick

724 found brute force https://muckrack.com/heather-waugh

725 found tfidf https://muckrack.com/amanda-c-kooser

726 found brute force https://muckrack.com/jane-lavender

727 found brute force https://muckrack.com/kathleen-speirs

728 found brute force https://muckrack.com/adam-s-levy

729 found tfidf https://muckrack.com/stephen-brown

730 found brute force https://muckrack.com/christy-wheeland

731 found brute force https://muckrack.com/matt-dennien

732 found brute force https://muckrack.com/katie-sands

733 found brute force https://muckrack.com/laura-clements

734 found brute force https://muckrack.com/sam-mcevoy

735 found brute force https:/

862 found brute force https://muckrack.com/ieuan-ivett

863 found brute force https://muckrack.com/thomas-tracy

864 found brute force https://muckrack.com/samreen-ahmad

865 found brute force https://muckrack.com/jessica-sager

866 found brute force https://muckrack.com/paige-oldfield

867 found brute force https://muckrack.com/brooks-kubena

868 found tfidf https://muckrack.com/matt-bauer

869 found brute force https://muckrack.com/david-lee

870 found brute force https://muckrack.com/michael-lynch

871 found brute force https://muckrack.com/jessica-odonnell

872 found brute force https://muckrack.com/kate-mcgreavy

873 found brute force https://muckrack.com/oriana-gonzalez

874 found brute force https://muckrack.com/tara-fitzpatrick

875 found brute force https://muckrack.com/devon-ivie

876 found brute force https://muckrack.com/chisanga-malata

877 found brute force https://muckrack.com/eamon-quinn

878 found brute force https://muckrack.com/katie-collings

879 found brute force h

1005 found brute force https://muckrack.com/alex-bowmer

1006 found brute force https://muckrack.com/dave-stubbings

1007 found brute force https://muckrack.com/alexa-philippou

1008 found brute force https://muckrack.com/elizabeth-denton

1009 found brute force https://muckrack.com/stuart-layt

1010 found brute force https://muckrack.com/thomas-molloy

1011 found brute force https://muckrack.com/hayley-halpin

1012 found tfidf https://muckrack.com/tom-collins

1013 found brute force https://muckrack.com/tanay-hudson

1014 found brute force https://muckrack.com/abby-gardner

1015 found brute force https://muckrack.com/elana-lyn-gross

1016 found brute force https://muckrack.com/utpal-bhaskar

1017 found brute force https://muckrack.com/katie-gagliano

1018 found brute force https://muckrack.com/pat-hill

1019 found brute force https://muckrack.com/mark-dunphy

1020 found tfidf https://muckrack.com/ophelia-buckleton

1021 found brute force https://muckrack.com/liam-heylin

1022 found br

1147 found brute force https://muckrack.com/elsa-keslassy

1148 found brute force https://muckrack.com/dan-kay

1149 found brute force https://muckrack.com/rachael-dexter

1150 found brute force https://muckrack.com/kevin-landrigan

1151 found brute force https://muckrack.com/rhik-kundu

1152 found brute force https://muckrack.com/jeremy-schultz

1153 found brute force https://muckrack.com/damien-fisher

1154 found brute force https://muckrack.com/jason-guerrasio

1155 found brute force https://muckrack.com/rory-cassidy

1156 found tfidf https://muckrack.com/larry-di-giovanni

1157 found tfidf https://muckrack.com/ravindra-sonavane

1158 found brute force https://muckrack.com/alison-medley

1159 found brute force https://muckrack.com/rick-karlin

1160 found brute force https://muckrack.com/elizabeth-rayne

1161 found tfidf https://muckrack.com/jess-thomson

1162 found brute force https://muckrack.com/rosaleen-fenton

1163 found brute force https://muckrack.com/holden-walter-warner

116

1287 found tfidf https://muckrack.com/orla-ryan

1288 found brute force https://muckrack.com/conor-mcmahon

1289 found tfidf https://muckrack.com/dom-amato

1290 found brute force https://muckrack.com/david-smith

1291 found brute force https://muckrack.com/dede-biles

1292 found brute force https://muckrack.com/canice-leung

1293 found brute force https://muckrack.com/erik-beaston

1294 found brute force https://muckrack.com/brendan-pierson

1295 found brute force https://muckrack.com/callum-carson

1296 found brute force https://muckrack.com/alex-raskin

1297 found brute force https://muckrack.com/corinne-reichert

1298 found brute force https://muckrack.com/bill-bostock

1299 found brute force https://muckrack.com/carley-lanich

1300 found brute force https://muckrack.com/daniel-politi

1301 found brute force https://muckrack.com/grace-ho

1302 found brute force https://muckrack.com/eduardo-medina

1303 found brute force https://muckrack.com/alex-shultz

1304 found brute force https

1427 found brute force https://muckrack.com/sara-merken

1428 found brute force https://muckrack.com/james-whaling

1429 found brute force https://muckrack.com/anju-ann-mathew

1430 found brute force https://muckrack.com/kieren-williams

1431 found brute force https://muckrack.com/dylan-haas

1432 found tfidf https://muckrack.com/james-dunn

1433 found brute force https://muckrack.com/chris-hachey

1434 found brute force https://muckrack.com/allana-akhtar

1435 found brute force https://muckrack.com/sam-phillips

1436 found brute force https://muckrack.com/mark-sparrow

1437 found brute force https://muckrack.com/steven-m-sipple

1438 found brute force https://muckrack.com/tom-dare

1439 found brute force https://muckrack.com/victor-dasgupta

1440 found brute force https://muckrack.com/glen-owen

1441 found brute force https://muckrack.com/amanda-hancock

1442 found brute force https://muckrack.com/akash-podishetty

1443 found brute force https://muckrack.com/martin-domin

1444 found b

1568 found brute force https://muckrack.com/jenni-evans

1569 found brute force https://muckrack.com/ffion-lewis

1570 found brute force https://muckrack.com/paul-hirst

1571 found brute force https://muckrack.com/prashant-k-nanda

1572 found brute force https://muckrack.com/mike-mcgrath-bryan

1573 found brute force https://muckrack.com/sharon-liptrott

1574 found brute force https://muckrack.com/joseph-flaherty

1575 found brute force https://muckrack.com/aaron-flanagan

1576 found brute force https://muckrack.com/yun-li

1577 found brute force https://muckrack.com/alex-seabrook

1578 found brute force https://muckrack.com/tom-howard

1579 found brute force https://muckrack.com/jasper-jolly

1580 found brute force https://muckrack.com/yuen-sin

1581 found brute force https://muckrack.com/mark-walker

1582 found brute force https://muckrack.com/matt-donnelly

1583 found brute force https://muckrack.com/kristie-ackert

1584 found tfidf https://muckrack.com/eric-walsh

1585 found tfidf 

1709 found brute force https://muckrack.com/daniel-varghese

1710 found brute force https://muckrack.com/claire-osborn

1711 found brute force https://muckrack.com/andy-greene

1712 found brute force https://muckrack.com/bram-de-haas

1713 found brute force https://muckrack.com/heather-mcneill

1714 found tfidf https://muckrack.com/zara-wong

1715 found brute force https://muckrack.com/tim-levin

1716 found brute force https://muckrack.com/wong-shiying

1717 found brute force https://muckrack.com/stephen-norris

1718 found brute force https://muckrack.com/henry-winter

1719 found brute force https://muckrack.com/herbert-soden

1720 found brute force https://muckrack.com/fleming-smith

1721 found brute force https://muckrack.com/alfred-konuwa

1722 found brute force https://muckrack.com/jack-morse

1723 found brute force https://muckrack.com/eplunus-colvin

1724 found brute force https://muckrack.com/nick-vivarelli

1725 found brute force https://muckrack.com/wayne-cole

1726 found brut

1850 found brute force https://muckrack.com/rob-smyth

1851 found brute force https://muckrack.com/matthew-gault

1852 found brute force https://muckrack.com/kaveel-singh

1853 found brute force https://muckrack.com/richard-percival

1854 found brute force https://muckrack.com/lisa-lockwood

1855 found brute force https://muckrack.com/dave-prentice

1856 found brute force https://muckrack.com/michael-balsamo

1857 found brute force https://muckrack.com/kate-kelland

1858 found brute force https://muckrack.com/linda-navarro

1859 found brute force https://muckrack.com/chris-roling

1860 found brute force https://muckrack.com/martha-brennan

1861 found tfidf https://muckrack.com/richard-sima

1862 found brute force https://muckrack.com/cory-stieg

1863 found brute force https://muckrack.com/russell-myers

1864 found brute force https://muckrack.com/dale-ellis

1865 found brute force https://muckrack.com/alex-lowe

1866 found tfidf https://muckrack.com/tom-cannavan

1867 found brute force

1991 found brute force https://muckrack.com/lalmani-verma

1992 found brute force https://muckrack.com/steve-baltin

1993 found brute force https://muckrack.com/fabian-koh

1994 found brute force https://muckrack.com/thomas-george

1995 found brute force https://muckrack.com/steven-heaney

1996 found brute force https://muckrack.com/ian-sherr

1997 found brute force https://muckrack.com/chris-smyth

1998 found brute force https://muckrack.com/vicky-mckeever

1999 found brute force https://muckrack.com/maria-chutchian

2000 found brute force https://muckrack.com/aisling-kiernan

2001 found tfidf https://muckrack.com/annaroseiovine

2002 found brute force https://muckrack.com/ashley-collman

2003 found brute force https://muckrack.com/sandeep-a-ashar

2004 found brute force https://muckrack.com/wilson-alexander

2005 found tfidf https://muckrack.com/dougmaccash

2006 found brute force https://muckrack.com/sarah-butler

2007 found brute force https://muckrack.com/natalia-penza

2008 found

2132 found brute force https://muckrack.com/jordan-moreau

2133 found brute force https://muckrack.com/jon-freeman

2134 found brute force https://muckrack.com/kellie-ell

2135 found brute force https://muckrack.com/alex-lawson

2136 found brute force https://muckrack.com/timothy-goh

2137 found tfidf https://muckrack.com/daveress1

2138 found brute force https://muckrack.com/annie-palmer

2139 found brute force https://muckrack.com/jeff-mclane

2140 found brute force https://muckrack.com/amanda-blanco

2141 found brute force https://muckrack.com/jon-doel

2142 found tfidf https://muckrack.com/michael-folkson

2143 found brute force https://muckrack.com/anulekha-ray

2144 found brute force https://muckrack.com/john-moritz

2145 found brute force https://muckrack.com/childs-walker

2146 found brute force https://muckrack.com/john-jeffay

2147 found brute force https://muckrack.com/marty-obrien

2148 found brute force https://muckrack.com/john-gibson

2149 found brute force https://muckr

2273 found brute force https://muckrack.com/jack-roskopp

2274 found brute force https://muckrack.com/kamaldeep-singh-brar

2275 found tfidf https://muckrack.com/kaunain-sheriff

2276 found brute force https://muckrack.com/jeff-amy

2277 found brute force https://muckrack.com/robyn-sidersky

2278 found brute force https://muckrack.com/william-sanders

2279 found brute force https://muckrack.com/damien-robbins

2280 found brute force https://muckrack.com/fraser-wilson

2281 found brute force https://muckrack.com/henry-zeffman

2282 found brute force https://muckrack.com/ford-turner

2283 found brute force https://muckrack.com/ajay-jadhav

2284 found brute force https://muckrack.com/dani-di-placido

2285 found brute force https://muckrack.com/daniel-boffey

2286 found brute force https://muckrack.com/sareena-dayaram

2287 found brute force https://muckrack.com/faaez-samadi

2288 found brute force https://muckrack.com/frank-green

2289 found brute force https://muckrack.com/saurabh-prasha

2414 found brute force https://muckrack.com/rob-reischel

2415 found tfidf https://muckrack.com/graeme-leach

2416 found brute force https://muckrack.com/andrew-cain

2417 found brute force https://muckrack.com/grace-dickinson

2418 found brute force https://muckrack.com/liam-corless

2419 found tfidf https://muckrack.com/naseemmiller

2420 found brute force https://muckrack.com/nancy-tartaglione

2421 found brute force https://muckrack.com/elizabeth-gravier

2422 found brute force https://muckrack.com/max-gorden

2423 found brute force https://muckrack.com/elizabeth-ammon

2424 found brute force https://muckrack.com/elena-ferrarin

2425 found tfidf https://muckrack.com/deniseodonoghue

2426 found brute force https://muckrack.com/shivani-kumaresan

2427 found brute force https://muckrack.com/samantha-hutchinson

2428 found brute force https://muckrack.com/denis-walsh

2429 found brute force https://muckrack.com/jennifer-williams

2430 found brute force https://muckrack.com/sreenivas-ja

2555 found tfidf https://muckrack.com/eric-harrison

2556 found brute force https://muckrack.com/angelica-mari

2557 found tfidf https://muckrack.com/wayne-roustan

2558 found brute force https://muckrack.com/jonathan-mattise

2559 found brute force https://muckrack.com/gareth-walker

2560 found tfidf https://muckrack.com/afarsh

2561 found brute force https://muckrack.com/erin-vanderhoof

2562 found tfidf https://muckrack.com/greg-gilligan

2563 found brute force https://muckrack.com/sangmi-cha

2564 found brute force https://muckrack.com/veronika-kero

2565 found brute force https://muckrack.com/molly-crane-newman

2566 found brute force https://muckrack.com/eleanor-hayward

2567 found brute force https://muckrack.com/callum-booth

2568 found brute force https://muckrack.com/sergio-carmona

2569 found brute force https://muckrack.com/caitlin-owens

2570 found brute force https://muckrack.com/gabriela-baczynska

2571 found brute force https://muckrack.com/selene-san-felice

2572 found

2696 found brute force https://muckrack.com/shashank-nayar

2697 found brute force https://muckrack.com/ruki-sayid

2698 found brute force https://muckrack.com/cormac-okeeffe

2699 found brute force https://muckrack.com/sasha-lekach

2700 found brute force https://muckrack.com/brian-gilmartin

2701 found brute force https://muckrack.com/tim-reynolds

2702 found brute force https://muckrack.com/tim-schwartz

2703 found brute force https://muckrack.com/dan-gentile

2704 found tfidf https://muckrack.com/fergushunter

2705 found brute force https://muckrack.com/malibongwe-dayimani

2706 found brute force https://muckrack.com/alex-golden

2707 found brute force https://muckrack.com/laurence-hammack

2708 found brute force https://muckrack.com/matthew-daly

2709 found brute force https://muckrack.com/seanna-adcox

2710 found brute force https://muckrack.com/paul-rees

2711 found brute force https://muckrack.com/dave-skretta

2712 found tfidf https://muckrack.com/paul-newman-2

2713 found bru

2838 found brute force https://muckrack.com/arriana-mclymore

2839 found brute force https://muckrack.com/chris-doyle

2840 found brute force https://muckrack.com/jabari-young

2841 found brute force https://muckrack.com/sarah-mitroff

2842 found brute force https://muckrack.com/judy-bergeron

2843 found brute force https://muckrack.com/joe-harris

2844 found brute force https://muckrack.com/john-simerman

2845 found brute force https://muckrack.com/mark-long

2846 found brute force https://muckrack.com/mark-long

2847 found brute force https://muckrack.com/aparajita-saxena

2848 found brute force https://muckrack.com/matt-binder

2849 found brute force https://muckrack.com/tom-brueggemann

2850 found brute force https://muckrack.com/james-picerno

2851 found brute force https://muckrack.com/barry-collins

2852 found tfidf https://muckrack.com/rjfoley

2853 found brute force https://muckrack.com/vivek-deshpande

2854 found brute force https://muckrack.com/james-manso

2855 found brute 

2979 found tfidf https://muckrack.com/isabelhardman

2980 found brute force https://muckrack.com/joe-mclean

2981 found brute force https://muckrack.com/em-holter

2982 found brute force https://muckrack.com/jane-hamilton

2983 found brute force https://muckrack.com/abhinav-rajput

2984 found brute force https://muckrack.com/abhinav-kaul

2985 found brute force https://muckrack.com/abdel-jimenez

2986 found brute force https://muckrack.com/emily-andrews

2987 found brute force https://muckrack.com/james-hurley

2988 found brute force https://muckrack.com/ethan-millman

2989 found brute force https://muckrack.com/eric-kohn

2990 found brute force https://muckrack.com/estelle-shirbon

2991 found brute force https://muckrack.com/justin-ong

2992 found brute force https://muckrack.com/joanna-partridge

2993 found brute force https://muckrack.com/tristan-greene

2994 found brute force https://muckrack.com/david-hambling

2995 found brute force https://muckrack.com/deborah-horn

2996 found b

3120 found brute force https://muckrack.com/emily-gosden

3121 found brute force https://muckrack.com/ananya-bhattacharya

3122 found brute force https://muckrack.com/ishita-ayan-dutt

3123 found brute force https://muckrack.com/wahyudi-soeriaatmadja

3124 found brute force https://muckrack.com/emma-gritt

3125 found brute force https://muckrack.com/sarah-brazendale

3126 found tfidf https://muckrack.com/jacob-carah

3127 found brute force https://muckrack.com/aby-sam-thomas

3128 found brute force https://muckrack.com/sarah-hodgson

3129 found brute force https://muckrack.com/eoin-english

3130 found brute force https://muckrack.com/peter-fitzsimons

3131 found brute force https://muckrack.com/aaron-sheldrick

3132 found brute force https://muckrack.com/sarah-lumley

3133 found brute force https://muckrack.com/georgia-diebelius

3134 found brute force https://muckrack.com/stephanie-kelly

3135 found brute force https://muckrack.com/sarah-obrien

3136 found brute force https://muckrack

3261 found brute force https://muckrack.com/kirsty-needham

3262 found brute force https://muckrack.com/mike-jones

3263 found brute force https://muckrack.com/lameez-omarjee

3264 found brute force https://muckrack.com/roisin-kelly

3265 found brute force https://muckrack.com/callum-keown

3266 found brute force https://muckrack.com/ashna-butani

3267 found brute force https://muckrack.com/daniel-jones

3268 found brute force https://muckrack.com/ashley-harding

3269 found brute force https://muckrack.com/kristen-lee

3270 found brute force https://muckrack.com/greg-wood

3271 found brute force https://muckrack.com/heather-osbourne

3272 found brute force https://muckrack.com/nandagopal-rajan

3273 found brute force https://muckrack.com/danyelle-khmara

3274 found brute force https://muckrack.com/christopher-tan

3275 found brute force https://muckrack.com/sindhu-sundar

3276 found brute force https://muckrack.com/cheryl-teh

3277 found tfidf https://muckrack.com/marissarothkopf

3278

In [10]:
# Persist the table with the resolved URLs so the crawl does not have to be repeated.
data.to_pickle('../Data/all_muckrack_links.pkl')

In [11]:
# Display the table again now that 'method' and 'redirect_url' have been filled in.
data

Unnamed: 0,author,article_count,request_url,redirect_url,method,confidence
0,Neil Shaw,523,https://muckrack.com/neil-shaw,https://muckrack.com/neil-shaw-1,brute force,-
1,James Rodger,512,https://muckrack.com/james-rodger,https://muckrack.com/james-rodger,brute force,-
2,Jack Davis,251,https://muckrack.com/jack-davis,https://muckrack.com/jack-davis,brute force,-
3,Adam Wells,223,https://muckrack.com/adam-wells,https://muckrack.com/adam-wells,brute force,-
4,Sophie McCoid,218,https://muckrack.com/sophie-mccoid,https://muckrack.com/sophie-mccoid,brute force,-
...,...,...,...,...,...,...
3833,Levi Sumagaysay,10,https://muckrack.com/levi-sumagaysay,https://muckrack.com/levi-sumagaysay,brute force,-
3834,Ben Ashford,10,https://muckrack.com/ben-ashford,https://muckrack.com/ben-ashford,brute force,-
3835,Craig Brown,10,https://muckrack.com/craig-brown,https://muckrack.com/craig-brown,brute force,-
3836,Courtney Connley,10,https://muckrack.com/courtney-connley,https://muckrack.com/courtney-connley,brute force,-


In [16]:
# Rows whose final URL differs from the URL that was requested.
data.loc[data['request_url'] != data['redirect_url']]

Unnamed: 0,author,article_count,request_url,redirect_url,method,confidence
0,Neil Shaw,523,https://muckrack.com/neil-shaw,https://muckrack.com/neil-shaw-1,brute force,-
7,Simon Duke,214,https://muckrack.com/simon-duke,https://muckrack.com/simonduketimes,brute force,-
13,Rianne Addo,168,https://muckrack.com/rianne-addo,https://muckrack.com/rianneaddo,brute force,-
15,SA Editor Clark Schultz,161,https://muckrack.com/clark-schultz,https://muckrack.com/clarkschultz,brute force,-
16,Lewis Winter,160,https://muckrack.com/lewis-winter,https://muckrack.com/lewisawinter,brute force,-
...,...,...,...,...,...,...
3802,Colin Kruger,10,https://muckrack.com/colin-kruger,https://muckrack.com/colinjkruger,brute force,-
3804,Rafael Olmeda,10,https://muckrack.com/rafael-olmeda,https://muckrack.com/rolmeda,brute force,-
3814,Rachel Koning Beals,10,https://muckrack.com/rachel-koning-beals,https://muckrack.com/rachelkbeals,brute force,-
3817,Corey Jones Tulsa World,10,https://muckrack.com/corey-jones,https://muckrack.com/coreyhjones_,brute force,-


In [None]:
# Scratch check: re-open the most recently built request and show its final URL.
# NOTE(review): depends on `req` left over from the crawl loop, so it fails on a
# fresh kernel, and the response it opens is never closed.
urlopen(req).url

In [None]:
# Redirected rows among the first 100 authors. The mask is computed from the
# 100-row slice itself; applying a mask built from the full table to the slice
# makes pandas warn that the boolean Series key is being reindexed.
data[:100].loc[lambda head: head.request_url != head.redirect_url]