# data

> Web scraping and tools for data collection and processing

In [1]:
#| default_exp data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sbavery/pseudometer/blob/main/nbs/01_data.ipynb)

In [2]:
#| hide
## Google Colab / Enchant Library Install for Dictionaries
#!apt update
#!apt install enchant-2 --fix-missing
#!apt install -qq enchant-2

In [3]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

In [4]:
#| hide
## Utility Function to Check GPU Status
def check_gpu():
    """Print whether CUDA is available, then one status line per visible CUDA device.

    Each device line shows the device index, its name, and the memory reported by
    `torch.cuda.memory_allocated` / `torch.cuda.memory_reserved`, divided by 1024**3
    and rounded to one decimal.
    """
    bytes_per_gb = 1024**3
    print("CUDA Available: ", torch.cuda.is_available())
    # range() over a zero count is simply empty, so no explicit "any devices?" guard is needed.
    for idx in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(idx)
        allocated_gb = round(torch.cuda.memory_allocated(idx)/bytes_per_gb, 1)
        reserved_gb = round(torch.cuda.memory_reserved(idx)/bytes_per_gb, 1)
        print("Device", idx, "|", name,
              "| Allocated:", allocated_gb, "GB",
              "| Cached:", reserved_gb, "GB")

torch.cuda.empty_cache()

In [5]:
#| hide
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.0 GB | Cached: 0.0 GB


## Web Scraper

In [6]:
#| export
class Webpage:
    """Fetch one URL and hold its parsed HTML, extracted text, links and word statistics.

    Attributes:
        url: address passed to the constructor.
        hash: SHA3-256 hex digest of `url` (see `get_hash_str`).
        requested: True once `get_page_html` has run.
        page_text: decoded response body; "" before fetching or when the page was rejected.
        html: BeautifulSoup tree built from `page_text`; the empty string until `get_page_html` runs.
        links: href strings collected by `get_html_anchors`.
        text: stripped text blocks collected by `get_html_text`.
        cleaned_text: words kept by `clean_html_text`.
        most_common_words: (word, count) pairs produced by `k_common_words`.
    """

    # One of these is picked at random for the User-Agent header of each request.
    _USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
        'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36',
    )

    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size):
        """Download `self.url` in 1024-byte chunks and return the decoded body.

        Returns None when the advertised Content-Length or the number of bytes actually
        received exceeds `max_size`, or when the decoded text is shorter than `min_size`
        characters. Exceptions raised by `requests` propagate to the caller.
        """
        # The context manager releases the streamed connection on every exit path,
        # including the early "too large" returns below.
        with requests.get(self.url, stream=True, headers=headers) as r:
            content_length = int(r.headers.get('Content-Length', 0))
            if content_length > max_size:
                return None

            data = []
            length = 0
            for chunk in r.iter_content(1024):
                data.append(chunk)
                length += len(chunk)
                if length > max_size:
                    return None
            # Hand the buffered bytes back to the response so `r.text` decodes them
            # with requests' own charset handling.
            r._content = b''.join(data)
            page_text = r.text
        if len(page_text) < min_size: return None
        return page_text

    def get_page_html(self, min_size=1000, max_size=2000000):
        """Fetch the page with a randomly chosen User-Agent and parse it into `self.html`.

        When `get_page` rejects the page (returns None), `page_text` is set to "" and
        `html` becomes an empty document, so `get_html_text` / `get_html_anchors`
        collect nothing instead of failing on a None body.
        """
        headers = {'User-Agent': random.choice(self._USER_AGENTS)}
        page_text = self.get_page(headers, min_size, max_size)
        self.page_text = "" if page_text is None else page_text
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        """Return the SHA3-256 hex digest of `self.url` concatenated with `inp`."""
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        """Append to `self.links` the href of every <a> tag whose href contains `keyword`."""
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            # Skips anchors with a missing or empty href.
            if link and keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=["p"]):
        """Append to `self.text` the stripped, non-empty text of every element named in `tags`."""
        for tag in tags:
            for element in self.html.find_all(tag):
                element_text = element.get_text().strip()
                if element_text:
                    self.text.append(element_text)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
        """Lower-case `self.text`, strip characters matching `rx`, and append the surviving words to `self.cleaned_text`.

        A word is kept when it is not in `ignore`, has at least `min_word_len` characters
        and, unless `enchant_dict` is "", passes the enchant dictionary check. Appending
        stops once `self.cleaned_text` holds `max_words` words.
        """
        all_text = ' '.join(self.text).lower()
        # Matches are deleted rather than replaced by a space, so "don't" becomes "dont".
        regex_text = re.sub(rx,'',all_text).strip()
        checker = enchant.Dict(enchant_dict) if enchant_dict != "" else None
        for word in regex_text.split():
            if len(self.cleaned_text) >= max_words: break
            if word in ignore or len(word) < min_word_len: continue
            if checker is None or checker.check(word):
                self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=[]):
        """Store in `self.most_common_words` the `k` most frequent words not listed in `ignore`.

        Counts are taken over `self.cleaned_text`, falling back to the raw `self.text`
        when no cleaned words are available.
        """
        # `cleaned_text` is a list, so test emptiness directly; comparing it with ""
        # is never true and would make the raw-text fallback unreachable.
        text = self.cleaned_text if self.cleaned_text else self.text
        words = ' '.join(text).lower().split()
        counts = Counter(word for word in words if word not in ignore)
        self.most_common_words = counts.most_common(k)

    @staticmethod
    def _dump(obj, path, fname):
        """Pickle `obj` to the file `path+fname`, closing the file even if pickling fails."""
        with open(path+fname, 'wb') as file:
            pickle.dump(obj, file)

    @staticmethod
    def _load(path, fname):
        """Return the object unpickled from the file `path+fname`."""
        # NOTE(security): pickle.load can execute arbitrary code; only load files this code wrote itself.
        with open(path+fname, 'rb') as file:
            return pickle.load(file)

    def save_text(self, path, fname):
        """Pickle `self.text` to `path+fname` (`path` is used as a plain string prefix)."""
        self._dump(self.text, path, fname)

    def load_text(self, path, fname):
        """Replace `self.text` with the object pickled at `path+fname`."""
        self.text = self._load(path, fname)

    def save_links(self, path, fname):
        """Pickle `self.links` to `path+fname` (`path` is used as a plain string prefix)."""
        self._dump(self.links, path, fname)

    def load_links(self, path, fname):
        """Replace `self.links` with the object pickled at `path+fname`."""
        self.links = self._load(path, fname)

In [7]:
#| hide
# Download a plain-text word list and split it into one entry per line; slices of it serve as
# ignore-lists further down. (Presumably ordered most-common-first, going by the file name — verify.)
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_page_html(min_size=1000)
english_words = common_english.html.getText().lower().split('\n')
print(len(english_words),"most common English words")
#english_words

1000 most common English words


In [8]:
#| hide
# Page used to exercise the scraper below, and the directory where pickled text/links are cached.
url = "http://www.ageofautism.com/"
path = os.getcwd()+'/data/'
# exist_ok makes this a no-op when the directory is already there and avoids the
# check-then-create race of a separate isdir() test.
os.makedirs(path, exist_ok=True)

In [9]:
#| hide
# Smoke-test the scraper on `url` and write its text and links to the cache directory.
test_page = Webpage(url)
test_page.get_page_html()
# Extract the same tag set that get_page_all uses: the cache file name depends only on the URL
# hash, so a <p>-only extraction saved here would later be loaded in place of the fuller one.
test_page.get_html_text(tags=["p","h1","h2","h3","span"])
test_page.get_html_anchors()
test_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
test_page.save_text(path, test_page.hash+'.text')
test_page.save_links(path, test_page.hash+'.links')

In [10]:
# Load the page's text and links from the cache when present; otherwise scrape and cache them.
new_page = Webpage(url)
fname_text = new_page.hash+'.text'
fname_links = new_page.hash+'.links'

if os.path.isfile(path+fname_text):
    new_page.load_text(path, fname_text)
    print("Loading Text")
else:
    new_page.get_page_html()
    new_page.get_html_text(tags=["p","h1","h2","h3","span"])
    new_page.save_text(path, fname_text)

if os.path.isfile(path+fname_links):
    new_page.load_links(path, fname_links)
    print("Loading Links")
else:
    # Download only if the text branch above has not already fetched the page
    # (`html` stays "" until get_page_html runs), so the page is never requested twice.
    if new_page.html == "": new_page.get_page_html()
    new_page.get_html_anchors()
    new_page.save_links(path, fname_links)

new_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
new_page.k_common_words(k=5,ignore=english_words[:50])
print(len(new_page.cleaned_text))
' '.join(new_page.cleaned_text)[:500]

Loading Text
Loading Links
500


'holy high fructose corn syrup health defenses defender has article exposes yet more twisted truth about academy pediatrics company once liked teach world sing perfect harmony funds pediatric healthcare confused health many years ago pediatrics appalled row vending machines loaded junk food soda glass soda back most ounces made sugar refreshing treat big food such food industry so intertwined healthcare its impossible separate them aside if watched wheres my recommend academy pediatrics great par'

In [11]:
#| export
def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
    """Build a `Webpage` for `url` with its text, links, cleaned words and common-word counts filled in.

    With `path` left as None the page is always downloaded. Otherwise `path` is used as a
    string prefix for two cache files named after the page hash (`.text` and `.links`):
    each is loaded when it exists, or produced by scraping and then saved.

    Args:
        url: page address.
        k: number of most-common words to keep.
        max_words: cap on the number of cleaned words.
        ignore_text: words dropped while cleaning the text.
        ignore_common: words excluded from the most-common-word counts.
        path: cache directory prefix, or None to disable caching.

    Returns:
        The populated `Webpage` instance.
    """
    text_tags = ["p","h1","h2","h3","span"]
    page = Webpage(url)
    fname_text, fname_links = page.hash+'.text', page.hash+'.links'

    if path is None:
        # No cache: scrape everything fresh.
        page.get_page_html()
        page.get_html_text(tags=text_tags)
        page.get_html_anchors()
    else:
        if not os.path.isfile(path+fname_text):
            page.get_page_html()
            page.get_html_text(tags=text_tags)
            page.save_text(path, fname_text)
        else:
            page.load_text(path, fname_text)

        if not os.path.isfile(path+fname_links):
            # `html` is still "" when the text came from the cache, i.e. nothing was downloaded yet.
            if page.html == "": page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)
        else:
            page.load_links(path, fname_links)

    if page.text is None:
        return page
    page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
    page.k_common_words(k=k, ignore=ignore_common)
    return page

def get_all_links(url, dict, category, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    """Scrape `url` and the pages it links to, recording each accepted page in `dict`.

    Every accepted page is stored in place as
    `dict[page_url] = [cleaned_text, most_common_words, category]`.

    The primary page is recorded only when it yields at least one cleaned word; otherwise a
    skip message is printed and none of its links are followed. A linked page is accepted when
    its address contains none of `ignore_filenames`, it could be fetched, it has at least
    `min_words` cleaned words (and is not empty), and no page with the same cleaned text and
    common words is already recorded. A one-line progress message is rewritten on stdout for
    every link examined.

    Args:
        url: primary page to scrape.
        dict: mapping updated in place with the results.
        category: label stored with every page recorded by this call.
        k: number of most-common words kept per page.
        min_words: minimum number of cleaned words for a linked page to be accepted.
        max_words: cap on the number of cleaned words stored per page.
        ignore_text: words dropped while cleaning page text.
        ignore_common: words excluded from the most-common-word counts.
        ignore_filenames: substrings that disqualify a link.
        max_links: maximum number of links to follow; "" or None means all of them.
        path: cache directory prefix forwarded to `get_page_all`, or None for no caching.
    """
    # `dict` shadows the builtin but is part of the public signature, so only alias it locally.
    results = dict
    primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
    # Test emptiness, not identity: `x is not []` is always True because `[]` is a fresh object.
    if not primary_page.cleaned_text:
        print(url,"returned None, Skipping...")
        return

    results[url] = [primary_page.cleaned_text, primary_page.most_common_words, category]
    total_links = len(primary_page.links)
    if max_links is None or max_links == "" or max_links > total_links: max_links = total_links

    for count, link in enumerate(primary_page.links[:max_links]):
        if all(x not in link for x in ignore_filenames):
            try:
                page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
            except Exception:
                # Best-effort crawl: a page that fails to download or parse is reported as
                # "Rejected" below. Catching Exception (not everything) keeps Ctrl-C working.
                page = None
            if page is not None and page.cleaned_text and len(page.cleaned_text) >= min_words:
                words = page.cleaned_text[:max_words]
                # Stored values carry the category as a third item, so compare only the first two.
                is_duplicate = any(list(entry[:2]) == [words, page.most_common_words]
                                   for entry in results.values())
                if not is_duplicate:
                    results[link] = [words, page.most_common_words, category]
        if link in results:
            res = str(len(results[link][0]))+" words | "+str(results[link][1][:3])
        else:
            res = "Rejected"
        # The trailing padding overwrites leftovers of a longer previous message on the same line.
        progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 500*' ')
        sys.stdout.write("\r" + progress_message)
        sys.stdout.flush()

In [12]:
"""
    "unknown":["https://www.huffpost.com/",
"https://www.wired.com/",
"https://www.theguardian.com/us",
"https://www.goodgopher.com"],
"""

# Seed URLs to crawl, grouped by the label assigned to every page collected from them.
categories = {
    "pseudoscience": [
        "http://www.ageofautism.com/",
        "http://www.naturalnews.com",
        "https://foodbabe.com/starthere/",
        "http://www.chopra.com",
        "https://www.mercola.com/",
        "https://www.history.com/",
        "https://doctoroz.com/",
        "https://www.disclose.tv/",
        "https://nationalreport.net/",
        "https://heartland.org/",
        "https://www.dailymail.co.uk/",
        "https://www.motherjones.com/",
    ],
    "science": [
        "https://sciencebasedmedicine.org/",
        "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
        "https://www.bbc.com/news/science_and_environment",
        "https://www.nature.com/",
        "https://www.science.org/",
        "https://www.snopes.com/top/",
        "https://quackwatch.org/",
        "https://www.skepdic.com/",
        "http://scibabe.com/",
        "http://pandasthumb.org/",
        "https://skepticalscience.com/",
        "https://www.cdc.gov/",
        "https://apnews.com/",
        "https://www.economist.com/",
        "https://www.livescience.com/",
        "https://www.newscientist.com/",
    ],
}

In [13]:
# Per-page limits used by the crawl and by the prediction cell.
k = 30           # number of most-common words kept per page
min_words = 100  # linked pages with fewer cleaned words are rejected
max_words = 450  # cap on cleaned words kept per page
max_links = 50   # links followed per source page

# Words removed while cleaning page text / excluded from the common-word counts.
ignore_text = 'the of to and a in it that for on'.split()
ignore_common = english_words[:50]

# Links containing any of these substrings are not followed.
ignore_filenames = [".mp3",".jpg",".png",".mp4",".jfif","facebook.com","twitter.com"]

In [14]:
#| hide
# d_dl collects the crawl results; d_train is filled from it in a later cell.
d_dl, d_train = {}, {}
path = os.getcwd()+'/data/'
if not os.path.isdir(path): os.mkdir(path)

# Crawl every source; sources labelled "unknown" contribute only their own page (no links followed).
for category, sources in categories.items():
    for source in sources:
        max_l = 0 if category == "unknown" else max_links
        get_all_links(source, d_dl, category, k, min_words, max_words, ignore_text,
                      ignore_common, ignore_filenames, max_l, path)


https://www.newscientist.com/ link   30/  33 | https://www.twitter.com/newscientist = Rejected                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [15]:
# Flatten each crawl record into strings: the cleaned words joined by spaces, the common
# words (without their counts) joined by spaces, and the category label.
for link, (words, word_counts, label) in d_dl.items():
    if link in d_train:
        continue
    d_train[link] = [' '.join(words), ' '.join(pair[0] for pair in word_counts), label]
#d_train

In [16]:
#| hide
# Sum the per-page common-word counts into one Counter per category.
d_counts = {}
for _words, word_counts, label in d_dl.values():
    page_counts = Counter(dict(word_counts))
    if label not in d_counts:
        d_counts[label] = page_counts
    else:
        d_counts[label] += page_counts

for category, totals in d_counts.items():
    print("####",category,k,"Most Common Words ####\n",totals.most_common(k),"\n\n")

#### pseudoscience 30 Most Common Words ####
 [('our', 671), ('health', 507), ('more', 292), ('food', 287), ('has', 247), ('mother', 241), ('us', 233), ('about', 228), ('policy', 220), ('its', 217), ('my', 216), ('their', 214), ('will', 211), ('heartland', 192), ('access', 169), ('news', 168), ('twitter', 168), ('trump', 164), ('information', 162), ('email', 158), ('donate', 151), ('subscribe', 147), ('privacy', 143), ('oz', 143), ('care', 135), ('purpose', 135), ('new', 133), ('if', 129), ('her', 128), ('technical', 128)] 


#### science 30 Most Common Words ####
 [('our', 893), ('medicine', 764), ('more', 755), ('science', 669), ('about', 599), ('new', 565), ('health', 489), ('menu', 405), ('us', 375), ('care', 348), ('research', 347), ('its', 343), ('will', 339), ('published', 332), ('johns', 286), ('news', 284), ('economist', 278), ('these', 255), ('has', 250), ('cookies', 250), ('information', 249), ('access', 240), ('world', 239), ('internal', 236), ('climate', 235), ('general', 

## Data Preparation

In [17]:
#| hide
# One row per crawled URL (the d_train keys become the index); the three list items of each
# record become the 'text', 'common_words' and 'label' columns.
df = pd.DataFrame.from_dict(d_train, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
http://www.ageofautism.com/,holy high fructose corn syrup health defenses defender has an article exposes yet more twisted truth about academy pediatrics company once liked teach he world sing perfect harmony funds pediatric healthcare not be confused with health many years ago was at pediatrics appalled by row vending machines loaded with junk food soda glass soda back when was at most ounces made with sugar was refreshing treat big food such as is food industry is so intertwined with healthcare its impossible separate them as an aside if you watched wheres my recommend academy pediatrics great partner ours new pape...,health public academy paper pediatrics conferences truth about nutrition events speakers researchers found food industry great new influence academic institutions uncovered biggest senior vice president alliance documents know conference has,pseudoscience
https://www.ageofautism.com/,by had few extra minutes myself morning been hoping some extra time my week get few things done sorting through some paperwork catching up laundry writing out some major things but they are adding up having extra time get at least one thing completed was exactly what hoped got morning but first breakfast while eating decided scroll through felt awful after not because what ate but because what read longtime advocate had posted link caught my eye usually read strangers obituaries as sad as was glad decided click if you feel moved by what he shared please forward his link with all learned wi...,our who my deaths age autism lies those extra been read posted better health so during people percent few morning time get things through least decided because link feel family,pseudoscience
https://www.ageofautism.com/exclusives.html,editorials from series by here come you had me at an elaborate fraud series deer special report what do epidemiological studies really tell us note from there are epidemiological studies here vaccines autism these studies represent most often cited papers by scientists public health officials members media when trying refute any evidence an association between vaccinations autism there are serious methodological limitations design flaws conflicts interest or other problems related each these studies these flaws have been pointed out by government officials other researchers medical review ...,autism studies health these series here public epidemiological vaccines officials limitations flaws study defense donate editorials come me elaborate fraud deer special report do really tell us note represent most,pseudoscience
https://www.ageofautism.com/science/,tom urged get their bivalent vaccine booster yesterday twitter kindly let him his followers know about week class starting this week through countermeasures injury compensation program this is program absolves corporations whose products harm during pandemic you can take course both live recorded version national vaccine injury compensation program countermeasures injury compensation program used emergency authorized this course students will learn structure function defects programs us created by congress award compensation adults children potentially actually harmed or killed by vaccines...,compensation injury program vaccine course high disease consequence no countermeasures us four their know week will john march health before been infectious has get class through products both live national,pseudoscience
https://www.ageofautism.com/a-welcome-from-dan-olmste.html,welcome age autism daily web newspaper autism epidemic donate please either use donate button right sidebar secure accepts all credit cards or send check autism age box ct donations are tax deductible our nonprofit is thank are published give voice those who believe autism is an environmentally induced illness is treatable children can recover most part major media united states interested point view they wont investigate causes possible biomedical treatments autism independently they listen most important people parents many whom have witnessed autistic regression medical illness after va...,autism comments donate health age our those epidemic who believe illness most people medical us defense public welcome daily web newspaper please either button right sidebar secure accepts credit cards,pseudoscience


In [18]:
# Build the text DataLoaders from the 'text' and 'label' columns. The seed fixes the random
# train/validation split so the same rows land in the validation set on every run
# (training itself is still not seeded here).
dls = TextDataLoaders.from_df(df, bs=16, text_col='text', label_col='label', seed=42)
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos children today are from previous generations proof is news coverage we see every day this site shows you what s happening schools around are increasingly disabled chronically ill education system has accommodate them long associated with autism like sensory issues repetitive behaviors lack social skills are now problems affecting mainstream students blame is predictably placed bad parenting otherwise known as trauma from home addressing mental health needs is as important as academics modern educators this is an xxunk disaster here are about children who ca nt learn or xxunk like children have always been expected what childhood has become is chilling xxunk future mankind home schools week plans check xxunk capacity secondary schools local authorities could then use updated numbers spectrum news ma new xxunk students with autism opens news calls changes special educational needs funding system news xxunk hub model autism funding leaves some kids behind premier announcement,pseudoscience
1,xxbos by food babe whenever enter conventional grocery store get heart xxunk you might think am kidding about this but am not my body gets heated my face starts xxunk end up saying about times before leave store think by now would have my emotions under control but think why am so passionate about xxunk this food system know you are too because you keep emailing commenting xxunk me all xxunk up things you are seeing out there ca nt thank you enough this investigation is one you have been asking so here is if you are new food babe you definitely want read this before you ever go deli counter again received emails comments social media expressing been led believe boars head deli meat than other brands wanted know if really is better or if just got really good marketing xxunk elsewhere sure makes sound like they add cheap,pseudoscience
2,xxbos where employees are corporations key assets workers greater power comes threatening walk out door when musk bought twitter he clearly know key assets he was buying lay twitters workers heads corporate balance sheets assets corporation are its xxunk equipment patents brand name workers considered assets they appear as costs fact xxunk are typically corporations total costs which is why companies often cut xxunk increase profits reason this is corporations have xxunk been viewed as production systems assets are things corporations own which turn xxunk labor xxunk materials components into xxunk products reduce costs these xxunk xxunk each product xxunk more profit or been traditional view yet today increasingly corporations just production systems systems directing people who work within them large growing part value corporation now lies heads its workers heads know how xxunk know what needs improvement know where strengths vulnerabilities are found know why corporation xxunk or these,science


In [19]:
#| hide
torch.cuda.empty_cache()

In [20]:
# Create a text classifier on the AWD_LSTM architecture, reporting accuracy, and fine-tune it
# with fine_tune(4, 1e-2); the recorded output shows one initial epoch followed by four more.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.575557,0.343493,0.935252,00:13


epoch,train_loss,valid_loss,accuracy,time
0,0.335934,0.281781,0.913669,00:23
1,0.317811,0.181477,0.935252,00:23
2,0.2117,0.136886,0.956835,00:24
3,0.164531,0.130294,0.956835,00:24


In [21]:
#| hide
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.4 GB | Cached: 2.2 GB


In [22]:
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos this blog ran from march has been replaced by no armed xxunk xxunk no raising smart kids ct no xxunk xxunk no xxunk design no autism no xxunk cold reading no jun experiences xxunk enemies reason xxunk xxunk parks mystery park closed no may peter evolution bacterial xxunk chiropractors run xxunk cam aids soldiers forced work as male xxunk st no report tam xxunk skeptical journalists no science religion politics no mar xxunk begins filming what will become its hit show paranormal state ted hypocrisy psychic xxunk without clue critical thinking ct xxunk politics science young earth creationists poll reveals xxunk mercury health healing prayer studies find people who pray are talking themselves mice no march mostly republican war science no march abortion zoo takes intelligent design association advancement science xxunk educational practices no xxunk over cartoons no bizarre case tale torture murder xxunk xxunk satanic xxunk xxunk films,science,science
1,xxbos organic vegan xxunk organic plant based protein powder with ingredients your body will thank you add scoop xxunk overnight xxunk or baking need recipe go just add water sweet smooth way prevent pm crash free shipping within us this was reaction when she tried chocolate protein how are we able get rid xxunk texture xxunk flavor you get from other protein powders simple we use unique blend plant proteins one main ingredients is organic pea protein but instead turning whole pea xxunk all into pea protein we xxunk first this makes our powder smooth delicious avoids additives good reason with only ingredients be wondering how does taste so good answer we use real food keep pure no fillers no xxunk no artificial colors no artificial sweeteners we also avoid instead we use organic xxunk fruit add just right amount xxunk view nutrition facts view nutrition facts going double check,pseudoscience,pseudoscience
2,xxbos heartland institute submitted public comments proposal repeal clean power plan environmental protect agency issued an advanced notice proposed titled repeal carbon dioxide xxunk guidelines existing xxunk sources electric utility generating units clean power plan heartland institute senior fellow peter research fellow submitted comprehensive extensively documented more than figures more than xxunk public comment support repeal they note comment addresses following topics about heartland institute clean power plan is based an xxunk interpretation section clean air xxunk there is no legal authority must be costs have already vastly exceeded even expected benefits ii fossil fuels are essential prosperity worldwide hundreds years since industrial revolution fossil fuel use is has been associated with higher economic growth xxunk wages health life xxunk population reduced even after decades government xxunk xxunk alternative energy sources such as solar wind play only niche role us energy official us government projections show fossil fuels will be,pseudoscience,pseudoscience
3,xxbos dissect your favorite foods learn truth about food health industry just take look at dollar companies changed free guide healthy xxunk anytime anywhere free grocery shopping guide navigate xxunk like pro know what eat your health with fast easy meal plans welcome my blog here you will find wealth information from food reports recipes so you too can lead food babe lifestyle orange pops recipe healthy swap orange were one my favorite treats as child since learned how bad ingredients are them really missed them thankfully figured out how recreate this xxunk staple with real food ingredients which are actually healthy you spicy chicken sandwich ingredients exposed see what s those fries biscuits too when came out with their spicy chicken sandwich couple years ago was unlike anything i d ever seen lines were around block demand was so insane they sold out these xxunk weeks my first thought,pseudoscience,pseudoscience
4,xxbos candidate machine learning digital twin aerospace industry valid from language interdisciplinary security reliability number fixed term full hours hours per start doctoral university seeks hire outstanding researchers at its interdisciplinary security reliability trust team under prof is carrying out interdisciplinary research secure reliable information communication technologies systems services often collaboration with industrial xxunk or is active several international research projects funded by national international research further information you may check looking people driven by excellence excited about innovation looking make difference if this sounds like you come aerospace industry is committed improving vehicles life cycle management due long vehicle life cycle more than years manufacturing service digital twin is by xxunk company have potential life cycle management tools by providing an xxunk process integrates all different stages air vehicles from operation maintenance om until their end life although aerospace industry is data is currently not digital twin platforms thus,science,science
5,xxbos when you purchase through links our site we may earn an affiliate commission how works minerals were found inside slice meteorite which was found two minerals have never been seen before earth have been discovered inside massive meteorite they could hold important clues how xxunk form two brand new minerals were found inside single xxunk xxunk slice taken from ton metric tons meteorite which was found scientists named minerals after xxunk after opens new tab managing director state university xxunk initiative principal investigator upcoming xxunk mission which will send probe investigate xxunk asteroid evidence how our solar systems planets formed whenever you find new mineral means actual geological conditions chemistry rock was different than what s been found before herd opens new tab professor department earth atmospheric sciences at university said statement opens new tab what makes this exciting this particular meteorite you have two officially described minerals are,science,science
6,xxbos north schedule latest international news from tests out if you can visit keep health condition latest international news from how son militia leader escaped his nick cave talks latest international news from speaks xxunk writer latest international news from xxunk three young women who left their homes latest international news from women follows two women who have been xxunk us latest international news from demonstrations china football latest international news from tests out if you can visit keep health condition latest international news from click explores cutting edge gene therapy helping patients with serious latest international news from all latest sports news results from around globe travels latest international news from tells extraordinary story newly formed freedom latest international news from as winter begins bite has got an energy strategy life after latest international news from after thousands xxunk wash up dead coast local xxunk look xxunk latest international,science,science
7,xxbos sign up read our regular email newsletters children are xxunk than we thought so why do so many them believe father xxunk question reveals lot about child psychology even more about adults by rituals we build around may be more our benefit than our photos rituals we build around may be more our benefit than our martin photos was when he started question existence every like many kids he had left out an apple carrot reindeer cold beer man himself every year he found snacks an empty glass alongside xxunk presents next day but had started having doubts with his growing he even xxunk plan check his parents receipts was beginning end or end beginning my belief he says two decades later now xxunk at university is investigating father again this time he is xxunk ways children tell fact from fiction he wants know why some kids are more,science,science
8,xxbos our global coverage offers distinctive insight incisive analysis start your day free trial week over subscribers trust economist help them make sense world news join them by downloading economist app includes daily selection best our journalism along with full weekly version economist newspaper read or listen subscriber economist simply install app sign with your economist login economist is an trusted filter global affairs offering analysis everything from business finance politics science technology economics covers every region world including united states xxunk at big topics from climate change sustainability future work economist is also available as weekly stories latest news analysis combined with highlights from weekly issue economist world brief short morning briefing days global economist weekly edition full access weekly version economist audio version weekly edition listen selection our podcasts including intelligence daily xxunk xxunk from economists worldwide network morning briefing day analysis worlds biggest news stories updated by,science,science


## Making Model Predictions

In [23]:
# Pages to classify with the trained model, keyed by the label each one is expected to get.
test_categories = {
    #'unknown':[],
    'pseudoscience': [
        'https://www.foxnews.com/opinion',
        'https://newspunch.com/',
        'https://www.huffpost.com/',
    ],
    'science': [
        'https://www.si.edu/explore/science',
        'https://www.theskepticsguide.org/about',
        'https://arstechnica.com/',
    ],
}

In [30]:
# Classify every test page and score the model against the expected label.
d_pred = {}

for category, sources in test_categories.items():
    for source in sources:
        page = get_page_all(source, k, max_words, ignore_text, ignore_common)
        length = len(page.cleaned_text)
        if length < min_words:
            print("ERROR:",source,length,"words")
            continue

        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            prediction = learn.predict(text)
        # Highest class probability, i.e. the probability of the predicted label.
        p = prediction[2].max().item()
        # Probability given to the expected label; `1-p` is only right for two classes.
        # Kept in `score` so the name `accuracy` (the metric passed to the learner) is not rebound.
        score = p if category == prediction[0] else 1-p
        d_pred[source] = [category, prediction[0], p, source in d_train, score]

# A separate name keeps the training DataFrame `df` intact for re-running earlier cells.
df_pred = pd.DataFrame.from_dict(d_pred, orient='index', columns=['actual', 'prediction', 'probability', 'training source', 'accuracy'])
is_train_source = df_pred['training source'].astype(bool)

avg_accuracy = df_pred['accuracy'].mean()
train_accuracy = df_pred.loc[is_train_source, 'accuracy'].mean()
test_accuracy = df_pred.loc[~is_train_source, 'accuracy'].mean()

print("Average Accuracy =",avg_accuracy)
print("Train Source Accuracy =",train_accuracy)
print("Test Source Accuracy =",test_accuracy)
df_pred.loc[~is_train_source]

https://www.foxnews.com/opinion
[('tucker', 6), ('twitter', 5), ('hunter', 5), ('laptop', 5), ('story', 5)]
('science', tensor(1), tensor([0.4723, 0.5277]))
https://newspunch.com/
[('news', 6), ('during', 4), ('surgery', 4), ('election', 4), ('new', 3)]
('science', tensor(1), tensor([0.2754, 0.7246]))
https://www.huffpost.com/
[('trump', 6), ('her', 6), ('us', 4), ('life', 4), ('new', 4)]
('pseudoscience', tensor(0), tensor([0.9845, 0.0155]))
https://www.si.edu/explore/science
[('science', 10), ('about', 8), ('our', 7), ('microscope', 6), ('panama', 6)]
('science', tensor(1), tensor([0.0116, 0.9884]))
https://www.theskepticsguide.org/about
[('science', 16), ('also', 10), ('skeptics', 9), ('guide', 8), ('skeptical', 8)]
('science', tensor(1), tensor([0.0304, 0.9696]))
https://arstechnica.com/
[('pm', 34), ('new', 8), ('its', 4), ('mole', 4), ('may', 3)]
('science', tensor(1), tensor([0.3732, 0.6268]))
Average Accuracy = 0.7195055882136027
Train Source Accuracy = nan
Test Source Accuracy

Unnamed: 0,actual,prediction,probability,training source,accuracy
https://www.foxnews.com/opinion,pseudoscience,science,0.527666,False,0.472334
https://newspunch.com/,pseudoscience,science,0.724625,False,0.275375
https://www.huffpost.com/,pseudoscience,pseudoscience,0.98449,False,0.98449
https://www.si.edu/explore/science,science,science,0.988414,False,0.988414
https://www.theskepticsguide.org/about,science,science,0.969587,False,0.969587
https://arstechnica.com/,science,science,0.626833,False,0.626833


## Exporting and Loading the Model

In [25]:
#learn.export('models/2022.12.01 Model v1 88pct')

In [26]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)