# data

> Web scraping and tools for data collection and processing

In [None]:
#| default_exp data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sbavery/pseudometer/blob/main/nbs/01_data.ipynb)

In [None]:
#| hide
## Google Colab / Enchant Library Install for Dictionaries
#!apt update
#!apt install enchant-2 --fix-missing
#!apt install -qq enchant-2

In [None]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

In [None]:
#| hide
## Utility Function to Check GPU Status
def check_gpu():
    """Print whether CUDA is available and, per visible device, its name and memory use.

    Memory figures are converted from bytes to GB (1024**3) and rounded to one decimal.
    Nothing is returned; the report goes to stdout.
    """
    bytes_per_gb = 1024**3
    print("CUDA Available: ", torch.cuda.is_available())
    # range() over a zero device count simply yields nothing, so no explicit guard is needed.
    for idx in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(idx)
        allocated_gb = round(torch.cuda.memory_allocated(idx)/bytes_per_gb, 1)
        cached_gb = round(torch.cuda.memory_reserved(idx)/bytes_per_gb, 1)
        print("Device", idx, "|", name,
              "| Allocated:", allocated_gb, "GB",
              "| Cached:", cached_gb, "GB")

# Ask PyTorch to release any unused cached GPU memory before the status report below.
torch.cuda.empty_cache()

In [None]:
#| hide
# Baseline GPU status at the start of the notebook.
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.0 GB | Cached: 0.0 GB


## Web Scraper

In [None]:
#| export
class Webpage:
    """One web page: download, text/link extraction, word cleaning and on-disk caching.

    Attributes (filled in incrementally by the methods):
        url: address the page was created with.
        hash: SHA3-256 hex digest of `url`, used by callers as a cache file stem.
        requested: True once `get_page_html` has downloaded and parsed the page.
        page_text: raw decoded response body.
        html: parsed `BeautifulSoup` document ("" until the page is requested).
        links: hrefs collected by `get_html_anchors`.
        text: text snippets collected by `get_html_text`.
        cleaned_text: words kept by `clean_html_text`.
        most_common_words: `(word, count)` pairs produced by `k_common_words`.
    """

    # User-Agent strings; one is picked at random for every download.
    _USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
        'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
    ]

    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size, timeout=30):
        """Download `self.url` and return its decoded body, or None if the size is out of range.

        Args:
            headers: HTTP headers sent with the request.
            min_size: minimum accepted length of the decoded text, in characters.
            max_size: maximum accepted body size, in bytes.
            timeout: seconds `requests` waits for the server before raising.

        Returns:
            The response text, or None when the body exceeds `max_size` bytes or the
            decoded text is shorter than `min_size` characters.
        """
        # The context manager releases the streamed connection on every exit path,
        # including the early "too large" returns.
        with requests.get(self.url, stream=True, headers=headers, timeout=timeout) as r:
            try:
                content_length = int(r.headers.get('Content-Length', 0))
            except ValueError:
                # Malformed header: fall back to counting the streamed bytes.
                content_length = 0
            if content_length > max_size:
                return None

            data = []
            length = 0
            for chunk in r.iter_content(1024):
                data.append(chunk)
                length += len(chunk)
                if length > max_size:
                    return None
            # Hand the manually streamed bytes back to the response so that `r.text`
            # can apply its usual encoding detection.
            r._content = b''.join(data)
            page_text = r.text
        if len(page_text) < min_size: return None
        return page_text

    def get_page_html(self, min_size=1000, max_size=2000000):
        """Download the page with a random User-Agent and parse it into `self.html`.

        Raises:
            ValueError: if the page was rejected by the `min_size`/`max_size` limits.
        """
        headers = {'User-Agent': random.choice(self._USER_AGENTS)}
        page_text = self.get_page(headers, min_size, max_size)
        if page_text is None:
            # Fail with an explicit message instead of letting the parser choke on None.
            raise ValueError("%s: page is larger than %s bytes or shorter than %s characters"
                             % (self.url, max_size, min_size))
        self.page_text = page_text
        self.html = BeautifulSoup(page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        """Return the SHA3-256 hex digest of `self.url` followed by `inp`."""
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        """Append to `self.links` every non-empty anchor href that contains `keyword`."""
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if link and keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=("p",)):
        """Append to `self.text` the stripped, non-empty text of every element in `tags`."""
        for tag in tags:
            for element in self.html.find_all(tag):
                element_text = element.get_text().strip()
                if element_text:
                    self.text.append(element_text)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=(), rx="[^a-zA-Z ]+", min_word_len=2):
        """Lower-case, strip and filter `self.text` into `self.cleaned_text`.

        Args:
            max_words: stop once `self.cleaned_text` holds this many words.
            enchant_dict: enchant dictionary tag used to spell-check words;
                "" disables the dictionary check.
            ignore: words to drop.
            rx: regular expression whose matches are removed from the text.
            min_word_len: shortest word length that is kept.
        """
        all_text = ' '.join(self.text).lower()
        words = re.sub(rx, '', all_text).split()
        ignored = set(ignore)  # constant-time membership tests
        checker = enchant.Dict(enchant_dict) if enchant_dict != "" else None
        for word in words:
            if len(self.cleaned_text) >= max_words: break
            if word in ignored or len(word) < min_word_len: continue
            if checker is None or checker.check(word):
                self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=()):
        """Store the `k` most frequent words in `self.most_common_words`.

        Words come from `self.cleaned_text`, or from `self.text` when no cleaned
        text exists yet; words listed in `ignore` are not counted.
        """
        # `cleaned_text` is a list, so test for emptiness rather than comparing to "".
        text = self.cleaned_text if self.cleaned_text else self.text
        ignored = set(ignore)
        words = ' '.join(text).lower().split()
        counts = Counter(word for word in words if word not in ignored)
        self.most_common_words = counts.most_common(k)

    @staticmethod
    def _dump(obj, path, fname):
        """Pickle `obj` to the file `path+fname`."""
        with open(path+fname, 'wb') as file:
            pickle.dump(obj, file)

    @staticmethod
    def _load(path, fname):
        """Unpickle and return the object stored in `path+fname`."""
        # NOTE(review): pickle.load can execute arbitrary code; only load cache files
        # this notebook wrote itself.
        with open(path+fname, 'rb') as file:
            return pickle.load(file)

    def save_text(self, path, fname):
        """Pickle `self.text` to `path+fname` (plain concatenation, so `path` needs its trailing separator)."""
        self._dump(self.text, path, fname)

    def load_text(self, path, fname):
        """Replace `self.text` with the list pickled at `path+fname`."""
        self.text = self._load(path, fname)

    def save_links(self, path, fname):
        """Pickle `self.links` to `path+fname`."""
        self._dump(self.links, path, fname)

    def load_links(self, path, fname):
        """Replace `self.links` with the list pickled at `path+fname`."""
        self.links = self._load(path, fname)

In [None]:
#| hide
# Download a plain-text list of common English words (one word per line) and
# turn it into a lower-cased Python list.
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_page_html(min_size=1000)
english_words = common_english.html.getText().lower().split('\n')
print(len(english_words),"most common English words")
#english_words

1000 most common English words


In [None]:
#| hide
url = "http://www.ageofautism.com/"
# Cache directory for pickled page text/links. The trailing slash matters:
# Webpage.save_*/load_* build file names by plain string concatenation.
path = os.getcwd()+'/data/'
# Idempotent: safe to re-run the cell whether or not the directory already exists.
os.makedirs(path, exist_ok=True)

In [None]:
#| hide
# Smoke test of the full Webpage pipeline on one site; the calls are order-dependent.
test_page = Webpage(url)
test_page.get_page_html()       # download + parse
test_page.get_html_text()       # collect text (default tags: "p" only)
test_page.get_html_anchors()    # collect links containing "http"
# Keep at most 500 dictionary words, dropping the first 50 common English words.
test_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
# Cache the raw text and links under the URL hash so later cells can reload them.
test_page.save_text(path, test_page.hash+'.text')
test_page.save_links(path, test_page.hash+'.links')

In [None]:
# Load the page text and links from the on-disk cache when present;
# otherwise scrape them and write the cache files.
new_page = Webpage(url)
fname_text = new_page.hash+'.text'
fname_links = new_page.hash+'.links'
if os.path.isfile(path+fname_text): 
    new_page.load_text(path, fname_text)
    print("Loading Text")
else:
    new_page.get_page_html()
    new_page.get_html_text(tags=["p","h1","h2","h3","span"])
    new_page.save_text(path, fname_text)

if os.path.isfile(path+fname_links): 
    new_page.load_links(path, fname_links)
    print("Loading Links")
else:
    # Download only if the text branch above has not already fetched the page,
    # so a cold cache costs one request instead of two.
    if not new_page.requested: new_page.get_page_html()
    new_page.get_html_anchors()
    new_page.save_links(path, fname_links)
new_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
new_page.k_common_words(k=5,ignore=english_words[:50])
print(len(new_page.cleaned_text))
' '.join(new_page.cleaned_text)[:500]

Loading Text
Loading Links
500


'few extra minutes myself morning been hoping extra time my week get few things done sorting through paperwork catching laundry writing major things adding having extra time get least thing completed exactly hoped got morning first breakfast while eating decided scroll through felt awful after because ate because read longtime advocate posted link caught my eye usually read strangers obituaries sad glad decided click if feel moved shared please forward link learned witnessing family will continue'

In [None]:
#| export
def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
    """Scrape (or load from cache) one page and return it as a processed `Webpage`.

    Args:
        url: page address.
        k: number of most common words to compute.
        max_words: maximum number of cleaned words to keep.
        ignore_text: words dropped while cleaning the text.
        ignore_common: words excluded from the most-common-word count.
        path: cache directory prefix (concatenated directly with the file name);
            when None, nothing is read from or written to disk.

    Returns:
        The `Webpage` with `text`, `links`, `cleaned_text` and `most_common_words` filled in.
    """
    text_tags = ["p","h1","h2","h3","span"]
    page = Webpage(url)
    text_file = page.hash+'.text'
    links_file = page.hash+'.links'
    caching = not (path == None)

    # Text: reuse the cached copy if there is one, otherwise download and extract.
    if caching and os.path.isfile(path+text_file):
        page.load_text(path, text_file)
    else:
        page.get_page_html()
        page.get_html_text(tags=text_tags)
        if caching: page.save_text(path, text_file)

    # Links: same idea; with caching on, the page is fetched here only if the
    # text step above did not already download it.
    if caching and os.path.isfile(path+links_file):
        page.load_links(path, links_file)
    else:
        if caching and page.html == "": page.get_page_html()
        page.get_html_anchors()
        if caching: page.save_links(path, links_file)

    if page.text is not None:
        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
        page.k_common_words(k=k, ignore=ignore_common)
    return page

def get_all_links(url, dict, category, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    """Scrape `url` and the pages it links to, storing the results in `dict`.

    Each stored value is `[cleaned_words, most_common_words, category]`, keyed by URL.

    Args:
        url: primary page to scrape.
        dict: mapping updated in place (the name shadows the builtin; kept for callers).
        category: label stored for the primary page and for sufficiently long linked pages.
        k: number of most common words computed per page.
        min_words: linked pages with fewer cleaned words are labelled 'unknown'.
        max_words: maximum number of cleaned words kept per page.
        ignore_text: words dropped while cleaning page text.
        ignore_common: words excluded from the most-common-word counts.
        ignore_filenames: links containing any of these substrings are not visited.
        max_links: maximum number of links to follow; "" means all of them.
        path: cache directory prefix passed through to `get_page_all`.
    """
    primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
    # Truthiness test: `x is not []` compares identity and is always True.
    if not primary_page.cleaned_text:
        print(url,"returned None, Skipping...")
        return

    dict[url] = [primary_page.cleaned_text, primary_page.most_common_words, category]
    total_links = len(primary_page.links)
    if max_links == "" or max_links > total_links: max_links = total_links

    for count, link in enumerate(primary_page.links[:max_links]):
        if all(x not in link for x in ignore_filenames):
            try:
                page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
            except Exception:
                # Best effort: an unreachable or unparsable link is simply rejected.
                # (KeyboardInterrupt is no longer swallowed, so a crawl can be stopped.)
                page = None
            if page is not None and page.cleaned_text:
                words = page.cleaned_text[:max_words]
                # Per-link label: a short page must not relabel the links that follow it.
                link_category = category if len(words) >= min_words else 'unknown'
                # Stored values carry a third element (the category), so compare only
                # the words and the common-word list when looking for duplicates.
                is_duplicate = any(entry[0] == words and entry[1] == page.most_common_words
                                   for entry in dict.values())
                if not is_duplicate:
                    dict[link] = [words, page.most_common_words, link_category]
        if link in dict:
            res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
        else:
            res = "Rejected"
        # Trailing padding overwrites leftovers of a longer previous message on the same line.
        progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 200*' ')
        sys.stdout.write("\r" + progress_message)
        sys.stdout.flush()

In [None]:
# Seed URLs for the training set, grouped by the label assigned to each source.
categories = {
    "unknown": [
        "https://www.huffpost.com/",
        "https://www.wired.com/",
        "https://www.theguardian.com/us",
        "https://www.goodgopher.com",
    ],
    "pseudoscience": [
        "http://www.ageofautism.com/",
        "http://www.naturalnews.com",
        "https://foodbabe.com/starthere/",
        "http://www.chopra.com",
        "https://www.mercola.com/",
        "https://www.history.com/",
        "https://doctoroz.com/",
        "https://www.disclose.tv/",
        "https://nationalreport.net/",
        "https://heartland.org/",
        "https://www.dailymail.co.uk/",
        "https://www.motherjones.com/",
    ],
    "science": [
        "https://sciencebasedmedicine.org/",
        "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
        "https://www.bbc.com/news/science_and_environment",
        "https://www.nature.com/",
        "https://www.science.org/",
        "https://www.snopes.com/top/",
        "https://quackwatch.org/",
        "https://www.skepdic.com/",
        "http://scibabe.com/",
        "http://pandasthumb.org/",
        "https://skepticalscience.com/",
        "https://www.cdc.gov/",
        "https://apnews.com/",
        "https://www.economist.com/",
        "https://www.livescience.com/",
        "https://www.newscientist.com/",
    ],
}

In [None]:
# Crawl / cleaning parameters shared by the cells below.
k = 30 # number of most common words computed per page
min_words = 50 # linked pages with fewer cleaned words are labelled 'unknown'; also the minimum for prediction
max_words = 450 # maximum number of cleaned words kept per page
max_links = 50 # maximum number of links followed per source
# Words removed from the page text before cleaning.
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on'] 
# Words excluded from the most-common-word counts (first 50 entries of the downloaded word list).
ignore_common = english_words[:50]
# Links containing any of these substrings are not visited.
ignore_filenames = [".mp3",".jpg",".png",".mp4",".jfif","facebook.com","twitter.com"]

In [None]:
#| hide
# d_dl: url -> [cleaned words, most common words, category], filled by get_all_links.
# d_train: filled in a later cell from d_dl.
d_dl = {}
d_train = {}
# Cache directory; the trailing slash is required because file names are appended directly.
path = os.getcwd()+'/data/'
# Idempotent: safe to re-run the cell whether or not the directory already exists.
os.makedirs(path, exist_ok=True)

for category, sources in categories.items():
    # "unknown" sources contribute only their front page: no links are followed.
    max_l = 0 if category == "unknown" else max_links
    for source in sources:
        get_all_links(source, d_dl, category, k, min_words, max_words, ignore_text, ignore_common,
                      ignore_filenames, max_l, path)


https://www.newscientist.com/ link   32/  33 | https://www.pinterest.com/newscientist/ = 14 words | [('new', 3), ('scientist', 1), ('opens', 1)]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [None]:
# Flatten every scraped entry into plain strings for the DataFrame:
# [space-joined cleaned words, space-joined common words, category].
# setdefault leaves any entry that is already present in d_train untouched.
for link, entry in d_dl.items():
    joined_text = ' '.join(entry[0])
    joined_common = ' '.join(pair[0] for pair in entry[1])
    d_train.setdefault(link, [joined_text, joined_common, entry[2]])
d_train

{'https://www.huffpost.com/': ['by entering your email clicking sign up agreeing let us send you customized marketing messages about us our advertising partners you are also agreeing our terms service privacy policy do you have info share with reporters how main menu news politics opinion entertainment life voices special projects personal video horoscopes from our partners newsletters international follow us trending latest news whats happening life you may like shopping you may like miss its personal personal politics entertainment life shopping communities you may like morning email got tip white supremacist nick filmed food fight at burger man charged with threatening doctor over transgender care noodle bones day pug dies at age hospital doctor charged with sex crimes eliminates us world cup parsons reflect love story behind spoiler alert celebrity gifts will make you famous with your friends actor lee dead at company seeks permission test brain implants people gifts give if you wa

In [None]:
#| hide
# Sum the per-page most-common-word counts into one Counter per category.
d_counts = {}
for entry in d_dl.values():
    page_counter = Counter(dict(entry[1]))
    label = entry[2]
    if label not in d_counts:
        d_counts[label] = page_counter
    else:
        d_counts[label] += page_counter

for category, counter in d_counts.items():
    print("####",category,k,"Most Common Words ####\n",counter.most_common(k),"\n\n")

#### unknown 30 Most Common Words ####
 [('our', 697), ('health', 479), ('medicine', 415), ('more', 414), ('about', 325), ('care', 304), ('will', 273), ('climate', 268), ('science', 266), ('has', 257), ('research', 256), ('food', 244), ('no', 242), ('us', 241), ('internal', 223), ('their', 216), ('general', 216), ('its', 206), ('new', 205), ('division', 195), ('my', 193), ('email', 192), ('heartland', 192), ('menu', 184), ('if', 179), ('information', 170), ('who', 168), ('policy', 158), ('economist', 156), ('share', 155)] 


#### pseudoscience 30 Most Common Words ####
 [('our', 443), ('mother', 243), ('health', 196), ('donate', 154), ('access', 153), ('twitter', 146), ('oz', 139), ('us', 138), ('more', 132), ('technical', 128), ('storage', 128), ('purpose', 124), ('trump', 115), ('subscribe', 115), ('policy', 112), ('meditation', 111), ('my', 109), ('preferences', 104), ('marketing', 98), ('user', 96), ('senate', 91), ('its', 87), ('privacy', 86), ('will', 84), ('news', 81), ('informa

## Data Preparation

In [None]:
#| hide
# Training table: one row per URL (the index) holding its cleaned text,
# its most common words and its category label.
df = pd.DataFrame.from_dict(d_train, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
https://www.huffpost.com/,by entering your email clicking sign up agreeing let us send you customized marketing messages about us our advertising partners you are also agreeing our terms service privacy policy do you have info share with reporters how main menu news politics opinion entertainment life voices special projects personal video horoscopes from our partners newsletters international follow us trending latest news whats happening life you may like shopping you may like miss its personal personal politics entertainment life shopping communities you may like morning email got tip white supremacist nick film...,us may like trump our life personal charged story dead constitution actually email agreeing about partners news politics entertainment shopping white man doctor over care dies love gifts actor want,unknown
https://www.wired.com/,revisit this article visit my profile saved stories revisit this article select my account saved stories eve hoover eve hall may our biggest stories delivered your inbox every day see all newsletters by signing up you agree our user agreement including class action waiver arbitration provisions our privacy policy cookie statement receive marketing emails from wired you can unsubscribe at any time more from wired contact all rights reserved use this site constitutes acceptance our user agreement privacy policy cookie statement your privacy rights wired may earn portion sales from products a...,stories our twitter wired most biggest part my may privacy more site hunt dark webs kingpin should galaxy its revisit article saved eve user agreement policy cookie statement rights used,unknown
https://www.theguardian.com/us,from voting rights climate collapse reproductive freedom guardian relentlessly reports truth uncovers injustice exposes misinformation fearless independent journalism is essential building better world we are raising fund our journalism support guardian from as little as help us reach our goal thank you wake up global view get guardians top stories best reads one hit sign up first thing read latest here privacy notice newsletters may contain info about charities online ads content funded by outside parties more information click here our privacy policy we operate google protect our website...,our us home guardian world new after better privacy now farewell artist who climate truth journalism support help global guardians first thing latest here about more policy google protect news,unknown
https://www.goodgopher.com,instructions chrome opera safari edge gopher mail can also be linked your account if you have one once created good gopher mail account just plug your new account info into all set independent media academia remember me,account gopher mail instructions chrome opera safari edge also linked if once created good just plug new info into set independent media academia remember me,unknown
http://www.ageofautism.com/,by had few extra minutes myself morning been hoping some extra time my week get few things done sorting through some paperwork catching up laundry writing out some major things but they are adding up having extra time get at least one thing completed was exactly what hoped got morning but first breakfast while eating decided scroll through felt awful after not because what ate but because what read longtime advocate had posted link caught my eye usually read strangers obituaries as sad as was glad decided click if you feel moved by what he shared please forward his link with all learned wi...,our who my deaths age autism lies those extra been read posted better health so during people percent few morning time get things through least decided because link feel family,pseudoscience


In [None]:
# Build fastai text DataLoaders from the table: 'text' is the input column,
# 'label' the target, batch size 16.
dls = TextDataLoaders.from_df(df, bs=16, text_col='text', label_col='label')
# Preview three samples from a batch.
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos this blog ran from march has been replaced by no armed xxunk xxunk no raising smart kids ct no xxunk xxunk no xxunk design no autism no xxunk cold reading no jun experiences xxunk enemies reason xxunk xxunk parks mystery park closed no may peter evolution bacterial xxunk chiropractors run xxunk cam aids soldiers forced work as male xxunk st no report tam xxunk skeptical journalists no science religion politics no mar xxunk begins filming what will become its hit show paranormal state ted hypocrisy psychic xxunk without clue critical thinking ct xxunk politics science young earth creationists poll reveals xxunk mercury health healing prayer studies find people who pray are talking themselves mice no march mostly republican war science no march abortion zoo takes intelligent design association advancement science xxunk educational practices no xxunk over cartoons no bizarre case tale torture murder xxunk xxunk satanic xxunk xxunk films,unknown
1,xxbos heartland institute submitted public comments proposal repeal clean power plan environmental protect agency issued an advanced notice proposed titled repeal carbon dioxide xxunk guidelines existing xxunk sources electric utility generating units clean power plan heartland institute senior fellow peter research fellow submitted comprehensive extensively documented more than figures more than xxunk public comment support repeal they note comment addresses following topics about heartland institute clean power plan is based an xxunk interpretation section clean air xxunk there is no legal authority must be costs have already vastly exceeded even expected benefits ii fossil fuels are essential prosperity worldwide hundreds years since industrial revolution fossil fuel use is has been associated with higher economic growth xxunk wages health life xxunk population reduced even after decades government xxunk xxunk alternative energy sources such as solar wind play only niche role us energy official us government projections show fossil fuels will be,unknown
2,xxbos march moment science ca nt be only one taken good long at society thought fuck gone berserk gon na go xxunk llamas well moment science berserk llama syndrome four species include llamas alpacas genetic testing indicates llamas alpacas were likely domesticated from respectively walking xxunk are capable breeding with each other which can result fertile offspring are mostly found wild while llamas alpacas are off living xxunk lives xxunk being bred their renewable supply fuzzy socks distant cousin xxunk these pack animals are west coast south bred wool six continents baby llamas called weigh lbs at birth typically grow be about lb adults are around same size while adult alpacas are lb range wool texture color can vary greatly from one pack next tempting as may be go hardcore procuring your xxunk supplies gon na throw out there maybe its not merely because an enemy joy though often linked,science


In [None]:
#| hide
# Ask PyTorch to release any unused cached GPU memory before training.
torch.cuda.empty_cache()

In [None]:
# AWD_LSTM text classifier on the DataLoaders above, reporting accuracy on the validation set.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
# Fine-tune for 4 epochs with a base learning rate of 1e-2
# (the log below shows one initial epoch followed by the 4 requested ones).
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.021731,0.823727,0.646341,00:14


epoch,train_loss,valid_loss,accuracy,time
0,0.757276,0.884916,0.597561,00:24
1,0.653238,0.712408,0.664634,00:24
2,0.542174,0.701765,0.707317,00:24
3,0.397527,0.703672,0.707317,00:24


In [None]:
#| hide
# GPU status after training, for comparison with the baseline at the top of the notebook.
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.4 GB | Cached: 2.1 GB


In [None]:
# Show sample texts with their actual (category) and predicted (category_) labels.
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos by food babe whenever enter conventional grocery store get heart xxunk you might think am kidding about this but am not my body gets heated my face starts xxunk end up saying about times before leave store think by now would have my emotions under control but think why am so passionate about xxunk this food system know you are too because you keep emailing commenting xxunk me all xxunk up things you are seeing out there ca nt thank you enough this investigation is one you have been asking so here is if you are new food babe you definitely want read this before you ever go deli counter again received emails comments social media expressing been led believe boars head deli meat than other brands wanted know if really is better or if just got really good marketing xxunk elsewhere sure makes sound like they add cheap,unknown,unknown
1,xxbos by had few extra minutes myself morning been hoping some extra time my week get few things done sorting through some paperwork catching up laundry writing out some major things but they are adding up having extra time get at least one thing completed was exactly what hoped got morning but first breakfast while eating decided scroll through felt awful after not because what ate but because what read longtime advocate had posted link caught my eye usually read strangers obituaries as sad as was glad decided click if you feel moved by what he shared please forward his link with all learned with all were witnessing with all this family will continue suffer its least we can do is contributing editor age autism lies my govt told me better future coming health guide down lies light why we came believe them posted by age autism at am comments,unknown,pseudoscience
2,xxbos ca nt get enough my writing want find out what else been up around internet well here talks appearances guest appearances detection fall food babe interview with thinking atheist when good science sounds agenda agenda avoiding food myths bad agenda ten questions food agenda things xxunk us reason rally experience dumb food myths debunked by this should vaccines be freedom report fake news survival guide hosted by atheist community business being atheists national convention how live life atheists national convention everything is killing you marketing guide bad science skeptical runs skeptical eye over alternative national science week people need hear from agriculture food babe vs vs princess brings scientific xxunk freedom report vaccines skeptical libertarian xxunk read between headlines xxunk sponsored by communicating about food myths social media age animal xxunk alliance food glorious with logic global warming evolution vaccines more beer xxunk national science week promo with xxunk,science,unknown
3,xxbos candidate machine learning digital twin aerospace industry valid from language interdisciplinary security reliability number fixed term full hours hours per start doctoral university seeks hire outstanding researchers at its interdisciplinary security reliability trust team under prof is carrying out interdisciplinary research secure reliable information communication technologies systems services often collaboration with industrial xxunk or is active several international research projects funded by national international research further information you may check looking people driven by excellence excited about innovation looking make difference if this sounds like you come aerospace industry is committed improving vehicles life cycle management due long vehicle life cycle more than years manufacturing service digital twin is by xxunk company have potential life cycle management tools by providing an xxunk process integrates all different stages air vehicles from operation maintenance om until their end life although aerospace industry is data is currently not digital twin platforms thus,science,unknown
4,xxbos when you purchase through links our site we may earn an affiliate commission how works minerals were found inside slice meteorite which was found two minerals have never been seen before earth have been discovered inside massive meteorite they could hold important clues how xxunk form two brand new minerals were found inside single xxunk xxunk slice taken from ton metric tons meteorite which was found scientists named minerals after xxunk after opens new tab managing director state university xxunk initiative principal investigator upcoming xxunk mission which will send probe investigate xxunk asteroid evidence how our solar systems planets formed whenever you find new mineral means actual geological conditions chemistry rock was different than what s been found before herd opens new tab professor department earth atmospheric sciences at university said statement opens new tab what makes this exciting this particular meteorite you have two officially described minerals are,science,science
5,xxbos stock ready ship verified customer team member xxunk ever since we launched our small company have wanted make snack bar would be happy eat you see always been type person reward myself at end long day with little treat was ritual i d grab my favorite cup tea treat then i d sit xxunk enjoy was little me moment would help me xxunk from day problem sometimes would eat piece chocolate other times was another snack would feel good moment horrible later needed something better why set out create something better wanted snack bar snack bar could xxunk eat at end long day feel good about only had one question what makes great snack bar had rules ever eat snack bar seemingly xxunk all xxunk out your mouth as if you spent an eternity desert or worse bar so sticky you feel like you need wash your hands immediately,unknown,unknown
6,xxbos google has declared war independent media has begun blocking emails from from getting our readers we recommend as free uncensored email receiving service or as free encrypted email send receive service okay continue with my address by tags apple regime big tech china xxunk xxunk deep state double standard musk free speech freedom xxunk globalist agenda hypocrisy john left cult liberty lies monitor private company xxunk article may contain statements reflect opinion author natural news once again regime is making up rules as goes along when comes how companies do not donate or side with democratic party are treated security strategic communications john retired us navy admiral responded reports apple is limiting its xxunk feature china by claiming regime business telling private companies how xxunk their initiatives he went then claim keeping an eye twitter is different thing after white house pledged do so earlier this week despite fact,unknown,unknown
7,xxbos its allies have agreed limit price oil barrel democrat will face republican walker after long at times bitter campaign as men finally go trial over attacks speaks victims families survive late scare against united states move into world cup quarterfinals with ruthless display finishing follow live text updates test match special commentary from day five first test between xxunk victory over set up world cup quarterfinal with was more proof sure touch team selection writes becomes top xxunk but he is upstaged by performance their last win over most amazing videos from why some jobs turn out be very different what was xxunk getting here easy but those who make are rewarded with perfect snow why soft girl xxunk is much more than social media trend mountain streams are helping make alpine villages energy china has vaccinated few old people making difficult abandon its policy its estimated plant animal,science,unknown
8,xxbos digital identity digital xxunk xxunk central bank digital currency social credit system human augmentation internet bodies these are all part xxunk future being rolled out by globalist cabal as solutions worlds problems has long been sold as way make us healthier eventually xxunk today its being promoted under great reset banner equity propaganda is everyone will benefit from human augmentation from richest xxunk reality is xxunk program xxunk name only anyone who thinks globalist cabal xxunk people planet intend allow xxunk xxunk who xxunk up their resources live even longer than we do already is sadly mistaken they have no such intent all talk health equity is smoke screen carrot xxunk people into going along with their plan what will ultimately be extermination billions some within this cabal world economic forum adviser being xxunk example are openly talking about elites plan society which is xxunk what sounds like world,unknown,unknown


## Making Model Predictions

In [None]:
# Sources used to spot-check the trained model, keyed by their expected label.
test_categories = {
    'unknown': [
        'https://www.huffpost.com/',
    ],
    'pseudoscience': [
        'https://www.foxnews.com/opinion',
        'https://newspunch.com/',
    ],
    'science': [
        'https://www.si.edu/explore/science',
        'https://www.theskepticsguide.org/about',
        'https://arstechnica.com/',
    ],
}

In [None]:
# Predict the category of every test source and score the prediction.
# d_pred: source -> [actual, predicted, probability, seen in training?, accuracy]
d_pred = {}

for category, sources in test_categories.items():
    print(category)
    for source in sources:
        # No cache path: test pages are always scraped fresh.
        page = get_page_all(source, k, max_words, ignore_text, ignore_common)
        length = len(page.cleaned_text)
        print(source,length)
        if length < min_words:
            print("ERROR:",source,length,"words")
            continue

        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            prediction = learn.predict(text)
        print(prediction)

        # prediction is (label, class index, per-class probabilities). The class index
        # follows the model's own class order, not the order of test_categories, so
        # the probability of the predicted class must be looked up with it.
        p = prediction[2][int(prediction[1])].item()
        train_source = source in d_train
        # Confidence of the predicted class if it is right, otherwise its complement.
        # Named source_accuracy so the fastai `accuracy` metric is not overwritten.
        source_accuracy = p if category == prediction[0] else 1-p
        d_pred[source] = [category, prediction[0], p, train_source, source_accuracy]

df = pd.DataFrame.from_dict(d_pred, orient='index', columns=['actual', 'prediction', 'probability', 'training source', 'accuracy'])

avg_accuracy = df['accuracy'].mean()
train_accuracy = df.loc[df['training source'] == True, 'accuracy'].mean()
test_accuracy = df.loc[df['training source'] == False, 'accuracy'].mean()

print("Average Accuracy =",avg_accuracy)
print("Train Source Accuracy =",train_accuracy)
print("Test Source Accuracy =",test_accuracy)
df.loc[df['training source'] == False]

unknown
https://www.huffpost.com/ 450
('unknown', tensor(2), tensor([0.2406, 0.0243, 0.7351]))
pseudoscience
https://www.foxnews.com/opinion 450
('unknown', tensor(2), tensor([0.2269, 0.0561, 0.7170]))
https://newspunch.com/ 397
('unknown', tensor(2), tensor([0.2764, 0.0212, 0.7024]))
science
https://www.si.edu/explore/science 450
('science', tensor(1), tensor([0.0206, 0.5065, 0.4730]))
https://www.theskepticsguide.org/about 450
('unknown', tensor(2), tensor([0.0311, 0.2885, 0.6804]))
https://arstechnica.com/ 450
('unknown', tensor(2), tensor([0.0894, 0.2398, 0.6708]))
Average Accuracy = 0.5475209802389145
Train Source Accuracy = 0.24057066440582275
Test Source Accuracy = 0.6089110434055328


Unnamed: 0,actual,prediction,probability,training source,accuracy
https://www.foxnews.com/opinion,pseudoscience,unknown,0.056059,False,0.943941
https://newspunch.com/,pseudoscience,unknown,0.021153,False,0.978847
https://www.si.edu/explore/science,science,science,0.472975,False,0.472975
https://www.theskepticsguide.org/about,science,unknown,0.680387,False,0.319613
https://arstechnica.com/,science,unknown,0.670821,False,0.329179


## Exporting and Loading the Model

In [None]:
#learn.export('models/2022.12.01 Model v1 88pct')

In [None]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)