In [1]:
import pickle
import random
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import ftfy
import humanize
from tabulate import tabulate

In [2]:
# Article records; later cells select columns from this object and merge it on 'author'.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('../Data/articles.pkl', 'rb') as articles_file:
    articles = pickle.load(articles_file)

# Muckrack lookup table; later merged with the articles on 'author' and filtered on 'Beat'.
with open('../Data/all_muckrack_links.pkl', 'rb') as muckrack_file:
    data = pickle.load(muckrack_file)

In [3]:
# Restrict the articles frame to the author, site and text columns.
wanted_columns = ['author', 'site_name', 'title', 'description', 'full_text']
articles = articles[wanted_columns]

In [4]:
# Load the pickled IAB artefacts used by iab_taxonomy_v2. Each file is opened in
# a `with` block so its handle is closed as soon as the object is unpickled
# (the bare `pickle.load(open(...))` form leaves the handles open).
# NOTE(review): pickle.load can execute arbitrary code — only load artefacts
# that come from a trusted source.
with open('../Data/IAB/IAB_classifier.p', 'rb') as classifier_file:
    iab_mod = pickle.load(classifier_file)   # called as iab_mod.predict(...)

with open('../Data/IAB/IAB_vectorizer.p', 'rb') as vectorizer_file:
    iab_vec = pickle.load(vectorizer_file)   # called as iab_vec.transform([...])

with open('../Data/IAB/IAB_binarizer.p', 'rb') as binarizer_file:
    iab_bin = pickle.load(binarizer_file)    # called as iab_bin.inverse_transform(...)



In [5]:
# Fetch the NLTK corpora needed by clean(): the stopword list and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Module-level resources consumed by clean().
stop = set(stopwords.words('english'))   # English stopwords, as a set for fast membership tests
exclude = set(string.punctuation)        # ASCII punctuation characters to strip
lemma = WordNetLemmatizer()              # shared lemmatizer instance

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def iab_taxonomy_v2(text):
    """Run the IAB classifier pipeline on a single piece of text.

    The text is normalised with ``clean``, vectorised with ``iab_vec``,
    classified with ``iab_mod`` and the prediction is mapped back through
    ``iab_bin.inverse_transform``.

    Parameters
    ----------
    text : str
        Raw text to classify.

    Returns
    -------
    The value produced by ``iab_bin.inverse_transform`` for the one-row
    prediction; the caller in this notebook takes element ``[0]`` of it.
    """
    cleaned_text = clean(text)
    # The vectorizer is given a one-element list, i.e. a single document.
    features = iab_vec.transform([cleaned_text])
    prediction = iab_mod.predict(features)
    return iab_bin.inverse_transform(prediction)

In [7]:
def clean(doc):
    """Normalise raw text before it is vectorised.

    Steps, in order: repair text with ``ftfy.fix_text``, lowercase and drop
    whitespace-separated tokens found in ``stop``, remove every character in
    ``exclude``, lemmatize each remaining token, delete digit runs, and
    collapse whitespace to single spaces.

    Parameters
    ----------
    doc : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text with tokens separated by single spaces.
    """
    repaired = ftfy.fix_text(doc)

    # Stopwords are filtered BEFORE punctuation is stripped, so a token such as
    # "the," is not matched against the stopword set. Keep this order.
    kept_tokens = [token for token in repaired.lower().split() if token not in stop]
    without_stopwords = " ".join(kept_tokens)

    without_punctuation = ''.join(filter(lambda ch: ch not in exclude, without_stopwords))

    lemmas = [lemma.lemmatize(token) for token in without_punctuation.split()]
    without_digits = re.sub(r"\d+", "", " ".join(lemmas))

    # Removing digits can leave empty tokens behind; split/join squeezes them out.
    return ' '.join(without_digits.split())

In [8]:
# Attach the Muckrack fields to every article by matching on the author name.
articles_with_beats = pd.merge(articles, data, on='author')

# Drop rows whose Beat is the '-' placeholder. reset_index() (without drop)
# renumbers the rows and keeps the previous labels in an 'index' column.
has_beat = articles_with_beats['Beat'] != '-'
all_news = articles_with_beats[has_beat].reset_index()

In [9]:
# Draw a random sample of articles to evaluate the classifier on.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42

# A dedicated Random instance makes the draw repeatable across kernel restarts
# without touching the global `random` state. range(len(all_news)) covers every
# row label including 0 (starting the range at 1 could never select the first row).
# random.sample raises ValueError if all_news has fewer than SAMPLE_SIZE rows.
sampled_labels = random.Random(SAMPLE_SEED).sample(range(len(all_news)), SAMPLE_SIZE)

# Keep the sampled rows in their original order, renumber them from 0, and drop
# the 'index' column that all_news carries from its earlier reset_index().
random_news = (
    all_news[all_news.index.isin(sampled_labels)]
    .reset_index(drop=True)
    .drop(columns=['index'])
)

In [10]:
# Preview the sampled articles (the rendered table elides the middle rows).
random_news

Unnamed: 0,author,site_name,title,description,full_text,article_count,request_url,redirect_url,method,confidence,Beat
0,David Matthews,nydailynews.com,U.S. teen who broke Cayman quarantine gets red...,The Cayman Islands has reduced the prison sent...,The Cayman Islands has reduced the prison sen...,113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
1,David Matthews,nydailynews.com,"70-year-old Taco Bell worker gets $6,000 gift ...",A 70-year-old Taco Bell employee received more...,"70-year-old Taco Bell worker gets $6,000 gift...",113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
2,David Matthews,sun-sentinel.com,John Mulaney checks into rehab: reports - Sout...,Comedian John Mulaney has checked into rehab b...,"Mulaney, 38, who joined the writing staff of ...",113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
3,David Matthews,nydailynews.com,Utah hiker survives 100 foot fall from cliff -...,A Utah hiker survived after slipping and falli...,Utah hiker survives 100-foot fall from cliff ...,113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
4,Denis Slattery,nydailynews.com,"Cuomo confident in Biden win, coronavirus aid ...","The governor, who backed the fellow Democrat’s...",ALBANY — Gov. Cuomo isn’t surprised that Pres...,36,https://muckrack.com/denis-slattery,https://muckrack.com/denis-slattery,brute force,-,"Politics, U.S."
...,...,...,...,...,...,...,...,...,...,...,...
995,Sruthi Shankar,U.S.,European stocks slide after virus fears knock ...,European stocks fell on Thursday following sha...,European stocks slide after virus fears knock...,13,https://muckrack.com/sruthi-shankar,https://muckrack.com/sruthi-shanker,brute force,-,Business and Finance
996,Mary Kekatos,Mail Online,COVID-19 Texas: El Paso posts grim ad for temp...,The El Paso County Medical Examiner's Office p...,The El Paso County Medical Examiner's Office ...,10,https://muckrack.com/mary-kekatos,https://muckrack.com/marykekatos,brute force,-,"Health, Metro New York"
997,Jennifer A. Kingson,Axios,The urban bathroom shortage worsens - Axios,People are calling on municipal leaders to reo...,The urban bathroom shortage worsens With libr...,11,https://muckrack.com/jenniferking,https://muckrack.com/jenniferking,no match,0.62,Australia
998,John Collett,Brisbane Times,Inner-city apartments face 10 per cent price s...,The recovery in property prices picked up spee...,Inner-city apartments face 10 per cent price ...,13,https://muckrack.com/john-collett,https://muckrack.com/jcollett_money,brute force,-,Business and Finance


In [11]:
# Number of sampled articles per author.
random_news.author.value_counts()

James Rodger         6
Rob Goldberg         4
Alaa Elassar, CNN    4
Rianne Addo          4
David Matthews       4
                    ..
Eric McGowan         1
Wendy Liberatore     1
James Dutton         1
Geoff Ziezulewicz    1
Elana Lyn Gross      1
Name: author, Length: 766, dtype: int64

In [12]:
# Number of sampled articles per Muckrack beat string (a beat may list several comma-separated topics).
random_news.Beat.value_counts()

United Kingdom                                            138
Sports, United Kingdom                                     64
Arts and Entertainment, United Kingdom                     47
Arts and Entertainment                                     45
Sports                                                     39
                                                         ... 
Politics, Singapore                                         1
Transportation, U.S.                                        1
Arts and Entertainment, Food and Dining, U.S. Regional      1
Arts and Entertainment, Metro Chicago, Sports, U.S.         1
Military, U.S., U.S. Regional                               1
Name: Beat, Length: 254, dtype: int64

In [13]:
# Accumulates one [author beat, predicted tag, article text] row per classified article.
test = []

In [14]:
# How many times the title and description are repeated in the classifier input
# (presumably to give them more weight than the body text — confirm).
TITLE_WEIGHT = 4
DESCRIPTION_WEIGHT = 2

# (Re)create the result list in this cell so that re-running it does not append
# a second copy of every row.
test = []

# Iterate over whatever rows the sample actually holds instead of a hard-coded count.
for _, row in random_news.iterrows():
    # Column access uses brackets so a column name can never be shadowed by a
    # pandas attribute of the same name.
    weighted_text = ' '.join([
        (row['title'] + ' ') * TITLE_WEIGHT,
        (row['description'] + ' ') * DESCRIPTION_WEIGHT,
        row['full_text'],
    ])
    predicted_beat = iab_taxonomy_v2(weighted_text)
    # Row layout: author's Muckrack beat, first entry of the prediction, raw article text.
    test.append([row['Beat'], predicted_beat[0], row['full_text']])

In [15]:
# Display the sample again; the classification loop only reads from random_news.
random_news

Unnamed: 0,author,site_name,title,description,full_text,article_count,request_url,redirect_url,method,confidence,Beat
0,David Matthews,nydailynews.com,U.S. teen who broke Cayman quarantine gets red...,The Cayman Islands has reduced the prison sent...,The Cayman Islands has reduced the prison sen...,113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
1,David Matthews,nydailynews.com,"70-year-old Taco Bell worker gets $6,000 gift ...",A 70-year-old Taco Bell employee received more...,"70-year-old Taco Bell worker gets $6,000 gift...",113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
2,David Matthews,sun-sentinel.com,John Mulaney checks into rehab: reports - Sout...,Comedian John Mulaney has checked into rehab b...,"Mulaney, 38, who joined the writing staff of ...",113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
3,David Matthews,nydailynews.com,Utah hiker survives 100 foot fall from cliff -...,A Utah hiker survived after slipping and falli...,Utah hiker survives 100-foot fall from cliff ...,113,https://muckrack.com/david-matthews,https://muckrack.com/david-matthews,brute force,-,"Education, Science"
4,Denis Slattery,nydailynews.com,"Cuomo confident in Biden win, coronavirus aid ...","The governor, who backed the fellow Democrat’s...",ALBANY — Gov. Cuomo isn’t surprised that Pres...,36,https://muckrack.com/denis-slattery,https://muckrack.com/denis-slattery,brute force,-,"Politics, U.S."
...,...,...,...,...,...,...,...,...,...,...,...
995,Sruthi Shankar,U.S.,European stocks slide after virus fears knock ...,European stocks fell on Thursday following sha...,European stocks slide after virus fears knock...,13,https://muckrack.com/sruthi-shankar,https://muckrack.com/sruthi-shanker,brute force,-,Business and Finance
996,Mary Kekatos,Mail Online,COVID-19 Texas: El Paso posts grim ad for temp...,The El Paso County Medical Examiner's Office p...,The El Paso County Medical Examiner's Office ...,10,https://muckrack.com/mary-kekatos,https://muckrack.com/marykekatos,brute force,-,"Health, Metro New York"
997,Jennifer A. Kingson,Axios,The urban bathroom shortage worsens - Axios,People are calling on municipal leaders to reo...,The urban bathroom shortage worsens With libr...,11,https://muckrack.com/jenniferking,https://muckrack.com/jenniferking,no match,0.62,Australia
998,John Collett,Brisbane Times,Inner-city apartments face 10 per cent price s...,The recovery in property prices picked up spee...,Inner-city apartments face 10 per cent price ...,13,https://muckrack.com/john-collett,https://muckrack.com/jcollett_money,brute force,-,Business and Finance


In [16]:
# Write the collected rows straight to CSV. Building the frame directly avoids
# rendering the rows to an HTML table and parsing that HTML back, which needs an
# HTML parser and lets tabulate/read_html re-interpret the cell contents.
# The default index column is still written, as before.
results = pd.DataFrame(test, columns=["Muckrack beat", "Predicted tag", "Article content"])
results.to_csv("test.csv")