# data

> Web scraping and tools for data collection and processing

In [1]:
#| default_exp data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sbavery/pseudometer/blob/main/nbs/01_data.ipynb)

In [2]:
#| hide
## Google Colab / Enchant Library Install for Dictionaries
#!apt update
#!apt install enchant-2 --fix-missing
#!apt install -qq enchant-2

In [3]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

In [4]:
#| hide
## Utility Function to Check GPU Status
def check_gpu():
    """Print whether CUDA is available and, per visible device, its name and memory use.

    Memory figures are converted from bytes to GB (1024**3) and rounded to one decimal.
    Nothing is printed per device when no CUDA device is visible.
    """
    print("CUDA Available: ", torch.cuda.is_available())
    bytes_per_gb = 1024**3
    # range() over a zero device count is simply empty, so no explicit guard is needed.
    for idx in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(idx)
        allocated_gb = round(torch.cuda.memory_allocated(idx)/bytes_per_gb, 1)
        cached_gb = round(torch.cuda.memory_reserved(idx)/bytes_per_gb, 1)
        print("Device", idx, "|", name,
              "| Allocated:", allocated_gb, "GB",
              "| Cached:", cached_gb, "GB")

# Release cached GPU memory held by the allocator before the notebook starts working.
torch.cuda.empty_cache()

In [5]:
#| hide
# Print the GPU state at the start of the notebook (the same report is printed again after training).
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.0 GB | Cached: 0.0 GB


## Web Scraper

In [6]:
#| export
class Webpage:
    """A single web page: download it, extract links and text, and cache the results.

    Attributes
    ----------
    url : the address given to the constructor.
    hash : SHA3-256 hex digest of `url`, used by callers as a cache file stem.
    requested : True once `get_page_html` has fetched and parsed the page.
    page_text : raw response text of the last successful fetch.
    html : parsed BeautifulSoup document ("" until a fetch succeeds).
    links : hrefs collected by `get_html_anchors`.
    text : text fragments collected by `get_html_text`.
    cleaned_text : words kept by `clean_html_text`.
    most_common_words : `(word, count)` pairs produced by `k_common_words`.
    """

    # One of these is picked at random for every request.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
        'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
    ]

    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size):
        """Stream `self.url` and return the response text, or None if it violates the size limits.

        None is returned when the advertised Content-Length or the number of bytes
        actually received exceeds `max_size`, or when the decoded text is shorter
        than `min_size` characters. Network errors from `requests` propagate.
        """
        # The context manager releases the streamed connection on every exit path,
        # including the early "too large" returns.
        with requests.get(self.url, stream=True, headers=headers) as r:
            content_length = int(r.headers.get('Content-Length', 0))
            if content_length > max_size:
                return None

            data = []
            length = 0
            for chunk in r.iter_content(1024):
                data.append(chunk)
                length += len(chunk)
                # Content-Length may be absent or wrong, so the real size is enforced too.
                if length > max_size:
                    return None
            # Hand the collected bytes back to the response so `r.text` can decode them.
            r._content = b''.join(data)
            if len(r.text) < min_size:
                return None
            return r.text

    def get_page_html(self, min_size=1000, max_size=2000000):
        """Fetch the page with a random user agent and parse it into `self.html`.

        Raises
        ------
        ValueError
            If the page was rejected by the `min_size` / `max_size` limits. The
            object's state is left untouched in that case.
        """
        headers = {'User-Agent': random.choice(self.USER_AGENTS)}
        page_text = self.get_page(headers, min_size, max_size)
        if page_text is None:
            # Previously None was handed to BeautifulSoup, which failed with an
            # unrelated-looking error; fail with an explicit reason instead.
            raise ValueError("%s: page size is outside the allowed range (min_size=%s, max_size=%s)"
                             % (self.url, min_size, max_size))
        self.page_text = page_text
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        """Return the SHA3-256 hex digest of the URL concatenated with `inp`."""
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        """Append to `self.links` every non-empty anchor href that contains `keyword`."""
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if not link:
                continue
            if keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=("p",)):
        """Append the stripped, non-empty text of every element with a tag in `tags` to `self.text`."""
        for tag in tags:
            for element in self.html.find_all(tag):
                fragment = element.get_text().strip()
                if fragment:
                    self.text.append(fragment)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=(), rx="[^a-zA-Z ]+", min_word_len=2):
        """Normalise `self.text` into a list of words appended to `self.cleaned_text`.

        The text is lower-cased, characters matching `rx` are removed, words in
        `ignore` and words shorter than `min_word_len` are dropped, and (unless
        `enchant_dict` is "") only words accepted by that enchant dictionary are
        kept. Appending stops once `self.cleaned_text` holds `max_words` words.
        """
        all_text = ' '.join(self.text).lower()
        words = re.sub(rx, '', all_text).strip().split()
        ignored = set(ignore)  # constant-time membership tests
        checker = enchant.Dict(enchant_dict) if enchant_dict != "" else None
        for word in words:
            if len(self.cleaned_text) >= max_words:
                break
            if word in ignored or len(word) < min_word_len:
                continue
            if checker is None or checker.check(word):
                self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=()):
        """Store the `k` most frequent words as `(word, count)` pairs in `self.most_common_words`.

        Words come from `self.cleaned_text`, falling back to `self.text` when no
        cleaned text exists. Words listed in `ignore` are not counted.
        """
        # `cleaned_text` is a list, so the original `== ""` test never matched and
        # the fallback to the raw text was unreachable.
        source = self.cleaned_text if self.cleaned_text else self.text
        ignored = set(ignore)
        counts = Counter(word for word in ' '.join(source).lower().split() if word not in ignored)
        self.most_common_words = counts.most_common(k)

    @staticmethod
    def _dump(obj, path, fname):
        """Pickle `obj` to the file `path+fname`."""
        with open(path+fname, 'wb') as file:
            pickle.dump(obj, file)

    @staticmethod
    def _load(path, fname):
        """Unpickle and return the object stored in `path+fname`."""
        # NOTE(review): pickle.load can execute arbitrary code; only load cache
        # files this project wrote itself.
        with open(path+fname, 'rb') as file:
            return pickle.load(file)

    def save_text(self, path, fname):
        """Pickle `self.text` to `path+fname` (`path` is concatenated as-is)."""
        self._dump(self.text, path, fname)

    def load_text(self, path, fname):
        """Replace `self.text` with the object pickled in `path+fname`."""
        self.text = self._load(path, fname)

    def save_links(self, path, fname):
        """Pickle `self.links` to `path+fname` (`path` is concatenated as-is)."""
        self._dump(self.links, path, fname)

    def load_links(self, path, fname):
        """Replace `self.links` with the object pickled in `path+fname`."""
        self.links = self._load(path, fname)

In [7]:
#| hide
# Download a plain-text word list and split it on newlines into `english_words`.
# Later cells slice `english_words[:50]` to build their ignore lists.
# NOTE(review): the list is presumably ordered by word frequency — confirm against the gist.
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_page_html(min_size=1000)
# The response is plain text, so the parsed document's text is the file content itself.
english_words = common_english.html.getText().lower()
english_words = english_words.split('\n')
print(len(english_words),"most common English words")
#english_words

1000 most common English words


In [8]:
#| hide
# Page used by the scraper smoke tests below, and the directory holding cached page data.
url = "https://www.foxnews.com/opinion"
path = os.getcwd()+'/data/'
if not os.path.isdir(path):
    os.mkdir(path)

In [9]:
#| hide
# Smoke test of the full Webpage pipeline: fetch, extract text and links, clean, then cache.
test_page = Webpage(url)
test_page.get_page_html()
test_page.get_html_text()       # default tags: <p> only
test_page.get_html_anchors()    # default keyword: "http"
# Keep at most 500 words, dropping the first 50 entries of the common-word list.
test_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
# Cache files are named after the URL hash so a later Webpage(url) can find them.
test_page.save_text(path, test_page.hash+'.text')
test_page.save_links(path, test_page.hash+'.links')

In [10]:
# Demonstrate the on-disk cache: reuse pickled text/links when present, otherwise
# scrape the page and write the cache files.
new_page = Webpage(url)
fname_text = new_page.hash+'.text'
fname_links = new_page.hash+'.links'
if os.path.isfile(path+fname_text):
    new_page.load_text(path, fname_text)
    print("Loading Text")
else:
    new_page.get_page_html()
    new_page.get_html_text(tags=["p","h1","h2","h3","span"])
    new_page.save_text(path, fname_text)

if os.path.isfile(path+fname_links):
    new_page.load_links(path, fname_links)
    print("Loading Links")
else:
    # Only download here if the text branch above did not already fetch the page;
    # previously the page was requested twice when neither cache file existed.
    if not new_page.requested: new_page.get_page_html()
    new_page.get_html_anchors()
    new_page.save_links(path, fname_links)
new_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
new_page.k_common_words(k=5,ignore=english_words[:50])
print(len(new_page.cleaned_text))
' '.join(new_page.cleaned_text[:450])

Loading Text
Loading Links
500


'material may published broadcast rewritten redistributed fox news network rights reserved quotes displayed delayed least minutes market data provided powered implemented digital solutions legal statement mutual fund data provided social engineering tactics used limit exposure information did fit within narrative about musk release twitter files showing world campaign media worked together censor hunter laptop story internal documents reveal education association lost nearly percent its membership delves into medias narratives surrounding lawyer leading election light prison sentence tucker fox news host tucker weighs governments handling southern border crisis tucker tonight college consumers should wary forking over big bucks certain degrees believed too long only ticket life came college internal documents reveal education association lost nearly percent its membership musk release twitter files showing world campaign media worked together censor hunter laptop story social engineeri

In [11]:
#| export
def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
    """Scrape (or load from cache) one page and return it as a processed `Webpage`.

    Parameters
    ----------
    url : address of the page.
    k : number of most-common words to compute.
    max_words : maximum number of cleaned words to keep.
    ignore_text : words removed while cleaning the text.
    ignore_common : words excluded from the most-common-word count.
    path : cache directory prefix (concatenated directly with the file name), or
        None to always download and never touch the disk.

    Returns
    -------
    The `Webpage`, with `text`, `links`, `cleaned_text` and `most_common_words`
    filled in. Cleaning is skipped when no text was extracted, leaving the last
    two empty. Exceptions raised while fetching or parsing propagate to the caller.
    """
    text_tags = ["p","h1","h2","h3","span"]
    page = Webpage(url)
    fname_text = page.hash+'.text'
    fname_links = page.hash+'.links'

    if path is None:
        page.get_page_html()
        page.get_html_text(tags=text_tags)
        page.get_html_anchors()
    else:
        # Text and links are cached independently, so either may need a fetch.
        if os.path.isfile(path+fname_text):
            page.load_text(path, fname_text)
        else:
            page.get_page_html()
            page.get_html_text(tags=text_tags)
            page.save_text(path, fname_text)

        if os.path.isfile(path+fname_links):
            page.load_links(path, fname_links)
        else:
            # Reuse the document fetched for the text branch, if there was one.
            if not page.requested: page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)

    # `page.text` is always a list, so the former `is not None` test was always
    # true; test for content instead.
    if page.text:
        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
        page.k_common_words(k=k, ignore=ignore_common)
    return page

def get_all_links(url, dict, category, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    """Scrape `url` and the pages it links to, storing the results in `dict`.

    Parameters
    ----------
    url : primary page to scrape.
    dict : mapping updated in place; each accepted page is stored as
        `dict[page_url] = [cleaned_text, most_common_words, category]`.
    category : label stored with every accepted page.
    k, max_words, ignore_text, ignore_common, path : forwarded to `get_page_all`.
    min_words : linked pages with fewer cleaned words are rejected.
    ignore_filenames : a link containing any of these substrings is not fetched.
    max_links : maximum number of links to follow; "" (the default) or a value
        larger than the number of links means all of them.

    A primary page that yields no cleaned text is skipped together with its
    links. Failures on individual linked pages are swallowed and reported as
    "Rejected" in the progress line. Returns None.
    """
    primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
    # `x is not []` compares identity with a brand-new list and is always true,
    # which made the skip branch unreachable; test for content instead.
    if not primary_page.cleaned_text:
        print(url,"returned None, Skipping...")
        return

    dict[url] = [primary_page.cleaned_text, primary_page.most_common_words, category]
    if max_links == "" or max_links > len(primary_page.links): max_links=len(primary_page.links)

    for count, link in enumerate(primary_page.links[:max_links]):
        if all(x not in link for x in ignore_filenames):
            try:
                page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
                if len(page.cleaned_text) >= min_words:
                    if len(page.cleaned_text) > max_words: page.cleaned_text = page.cleaned_text[:max_words]
                    candidate = [page.cleaned_text, page.most_common_words]
                    # Stored values carry a third element (the category), so the
                    # old `candidate in dict.values()` could never match; compare
                    # against the first two elements only.
                    if not any(stored[:2] == candidate for stored in dict.values()):
                        dict[link] = [page.cleaned_text, page.most_common_words, category]
            except Exception:
                # Best effort: one unreachable or malformed page must not stop the
                # crawl. KeyboardInterrupt is no longer swallowed.
                pass
        if link in dict:
            res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
        else:
            res = "Rejected"
        # The trailing spaces overwrite leftovers of a longer previous progress line.
        progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 500*' ')
        sys.stdout.write("\r" + progress_message)
        sys.stdout.flush()

In [12]:
# The string below is a disabled "unknown" category kept for reference; as a bare
# expression it has no effect.
"""
    "unknown":["https://www.huffpost.com/",
"https://www.wired.com/",
"https://www.theguardian.com/us",
"https://www.goodgopher.com"],
"""

# Seed pages per label. Each page, and the pages it links to, is scraped and
# stored under its label.
categories = {
    "pseudoscience": [
        "http://www.ageofautism.com/",
        "http://www.naturalnews.com",
        "https://foodbabe.com/starthere/",
        "http://www.chopra.com",
        "https://www.mercola.com/",
        "https://www.history.com/",
        "https://doctoroz.com/",
        "https://www.disclose.tv/",
        "https://nationalreport.net/",
        "https://heartland.org/",
        "https://www.dailymail.co.uk/",
        "https://www.motherjones.com/",
        "https://www.foxnews.com/opinion",
    ],
    "science": [
        "https://sciencebasedmedicine.org/",
        "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
        "https://www.bbc.com/news/science_and_environment",
        "https://www.nature.com/",
        "https://www.science.org/",
        "https://www.snopes.com/top/",
        "https://quackwatch.org/",
        "https://www.skepdic.com/",
        "http://scibabe.com/",
        "http://pandasthumb.org/",
        "https://skepticalscience.com/",
        "https://www.cdc.gov/",
        "https://apnews.com/",
        "https://www.economist.com/",
        "https://www.livescience.com/",
        "https://www.newscientist.com/",
    ],
}

In [13]:
# Scraping configuration shared by the data-collection and prediction cells.
k = 30 # number of most-common words recorded per page
min_words = 50  # pages with fewer cleaned words are rejected
max_words = 450  # cleaned text is capped at this many words
max_links = 30  # links followed per seed page
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']  # removed while cleaning page text
ignore_common = english_words[:50]  # excluded from the most-common-word counts
ignore_filenames = [".mp3",".jpg",".png",".mp4",".jfif","facebook.com","twitter.com"]  # links containing any of these are not fetched

In [14]:
#| hide
# d_dl collects the scraped pages; d_train is filled from it in a later cell.
d_dl = {}
d_train = {}
path = os.getcwd()+'/data/'
if not os.path.isdir(path):
    os.mkdir(path)

# Crawl every seed page of every category. Seeds labelled "unknown" are scraped
# without following any of their links.
for category in categories:
    for source in categories[category]:
        max_l = 0 if category == "unknown" else max_links
        get_all_links(
            source, d_dl, category, k, min_words, max_words,
            ignore_text, ignore_common, ignore_filenames, max_l, path,
        )


https://www.newscientist.com/ link   28/  33 | https://www.facebook.com/newscientist = Rejected                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [15]:
# Flatten each scraped page into [text, common words, label] strings for the DataFrame.
# Links already present in d_train keep their existing row.
for link, entry in d_dl.items():
    text = entry[0]
    common_words = ' '.join(pair[0] for pair in entry[1])
    d_train.setdefault(link, [' '.join(text), common_words, entry[2]])
#d_train

In [16]:
#| hide
# Sum the per-page most-common-word counts into one Counter per category.
d_counts = {}
for link, entry in d_dl.items():
    page_counts = Counter(dict(entry[1]))
    label = entry[2]
    if label not in d_counts:
        d_counts[label] = page_counts
    else:
        d_counts[label] += page_counts

for category, totals in d_counts.items():
    print("####",category,k,"Most Common Words ####\n",totals.most_common(k),"\n\n")

#### pseudoscience 30 Most Common Words ####
 [('our', 482), ('health', 372), ('food', 261), ('more', 253), ('news', 248), ('ago', 199), ('fox', 188), ('my', 185), ('has', 174), ('about', 165), ('us', 163), ('its', 159), ('donate', 156), ('will', 149), ('email', 144), ('oz', 143), ('heartland', 142), ('mother', 134), ('twitter', 130), ('their', 129), ('policy', 123), ('new', 120), ('subscribe', 118), ('hours', 118), ('data', 113), ('trump', 113), ('comments', 102), ('meditation', 100), ('national', 99), ('senate', 98)] 


#### science 30 Most Common Words ####
 [('our', 679), ('more', 604), ('science', 512), ('about', 452), ('medicine', 432), ('health', 369), ('new', 348), ('published', 319), ('us', 291), ('news', 267), ('menu', 261), ('will', 260), ('care', 256), ('johns', 222), ('its', 216), ('world', 212), ('economist', 205), ('access', 200), ('information', 190), ('has', 183), ('research', 180), ('these', 159), ('cookies', 158), ('may', 157), ('if', 154), ('no', 147), ('twitter', 1

## Data Preparation

In [17]:
#| hide
# One row per scraped URL (the URL is the index); columns follow the order of the d_train lists.
df = pd.DataFrame.from_dict(d_train, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
http://www.ageofautism.com/,holy high fructose corn syrup health defenses defender has an article exposes yet more twisted truth about academy pediatrics company once liked teach he world sing perfect harmony funds pediatric healthcare not be confused with health many years ago was at pediatrics appalled by row vending machines loaded with junk food soda glass soda back when was at most ounces made with sugar was refreshing treat big food such as is food industry is so intertwined with healthcare its impossible separate them as an aside if you watched wheres my recommend academy pediatrics great partner ours new pape...,health public academy paper pediatrics conferences truth about nutrition events speakers researchers found food industry great new influence academic institutions uncovered biggest senior vice president alliance documents know conference has,pseudoscience
https://www.ageofautism.com/,by had few extra minutes myself morning been hoping some extra time my week get few things done sorting through some paperwork catching up laundry writing out some major things but they are adding up having extra time get at least one thing completed was exactly what hoped got morning but first breakfast while eating decided scroll through felt awful after not because what ate but because what read longtime advocate had posted link caught my eye usually read strangers obituaries as sad as was glad decided click if you feel moved by what he shared please forward his link with all learned wi...,our who my deaths age autism lies those extra been read posted better health so during people percent few morning time get things through least decided because link feel family,pseudoscience
https://www.ageofautism.com/donate.html,hello your donation autism age is tax now use secure online donations scroll down their easy use form you can always send us paper or electronic check as well email me at any time with ideas suggestions or gentle critiques our is thank you ed cause unknown epidemic sudden deaths health defense transcend fear blueprint mindful leadership public health jr real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate donate top,health donate defense public hello donation autism age tax now secure online donations scroll down their easy form always send us paper electronic check well email me any time ideas,pseudoscience
https://www.ageofautism.com/contact-us.html,autism age box ct ed cause unknown epidemic sudden deaths health defense transcend fear blueprint mindful leadership public health jr real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate contact us top,health defense public donate autism age box ct ed cause unknown epidemic sudden deaths transcend fear blueprint mindful leadership jr real bill gates big global war democracy click cover buy,pseudoscience
https://www.ageofautism.com/exclusives.html,editorials from series by here come you had me at an elaborate fraud series deer special report what do epidemiological studies really tell us note from there are epidemiological studies here vaccines autism these studies represent most often cited papers by scientists public health officials members media when trying refute any evidence an association between vaccinations autism there are serious methodological limitations design flaws conflicts interest or other problems related each these studies these flaws have been pointed out by government officials other researchers medical review ...,autism studies health these series here public epidemiological vaccines officials limitations flaws study defense donate editorials come me elaborate fraud deer special report do really tell us note represent most,pseudoscience


In [18]:
# Build fastai text DataLoaders from the frame: `text` is the input, `label` the target, batch size 16.
# NOTE(review): no random seed or explicit validation split is set here, so the
# train/validation split presumably changes between runs — confirm and seed if needed.
dls = TextDataLoaders.from_df(df, bs=16, text_col='text', label_col='label')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos this blog ran from march has been replaced by no armed xxunk xxunk no raising smart kids ct no xxunk xxunk no xxunk design no autism no xxunk cold reading no jun experiences xxunk xxunk reason xxunk xxunk parks mystery park closed no may peter evolution bacterial xxunk chiropractors run xxunk cam aids soldiers forced work as male xxunk st no report tam xxunk skeptical journalists no science religion politics no mar xxunk begins filming what will become its hit show paranormal state ted hypocrisy psychic xxunk without clue critical thinking ct xxunk politics science young earth creationists poll reveals xxunk mercury health healing prayer studies find people who pray are talking themselves mice no march mostly republican war science no march abortion zoo takes intelligent design association advancement science xxunk educational practices no xxunk over cartoons no bizarre case tale xxunk murder rape xxunk satanic xxunk xxunk films,science
1,xxbos public radio insight interview with xxwrep 3 interview begins minutes seconds into podcast skeptical world wide hoax institute radio interview with real audio skip first five minutes which are from another show get interview radio interview with jack spin mp interview with interview with xxunk mm interview with skeptic magazine interview with xxunk web magazine morning herald bee enterprise point inquiry token skeptic interview with consider this mp what is skeptics dictionary what s harm believing something if works you if believe by doing jumping xxunk row will be xxunk xxunk works what is bad about you seem think everyone who xxunk with you is xxunk ill what basis do you have saying xxunk gods xxunk astrology conspiracies or psychics are xxunk why you skeptical skepticism why do you think science has answer all our questions why do you think skeptics scientists should be atheists skepticism or atheism kind,science
2,xxbos where employees are corporations key assets workers greater power comes threatening walk out door when musk bought twitter he clearly know key assets he was buying xxunk twitters workers heads corporate balance sheets assets corporation are its xxunk equipment patents brand name workers considered assets they appear as costs fact xxunk are typically corporations total costs which is why companies often cut xxunk increase profits reason this is corporations have xxunk been viewed as production systems assets are things corporations own which turn xxunk labor xxunk materials components into xxunk products reduce costs these xxunk xxunk each product xxunk more profit or been traditional view yet today increasingly corporations just production systems systems directing people who work within them large growing part value corporation now lies heads its workers heads know how xxunk know what needs improvement know where strengths xxunk are found know why corporation xxunk or these,science


In [19]:
#| hide
# Release cached GPU memory before the learner is created and trained.
torch.cuda.empty_cache()

In [20]:
# AWD_LSTM text classifier on the scraped pages, reporting accuracy.
# fine_tune(4, 1e-2): the output below shows one initial epoch followed by four more.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.58998,0.503491,0.824561,00:11


epoch,train_loss,valid_loss,accuracy,time
0,0.425431,0.321936,0.842105,00:18
1,0.361045,0.402602,0.868421,00:18
2,0.260173,0.268871,0.894737,00:19
3,0.182726,0.25433,0.903509,00:18


In [21]:
#| hide
# Print the GPU state again after training to see how much memory the model holds.
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.4 GB | Cached: 2.1 GB


In [22]:
# Show sample texts with their actual label next to the predicted one.
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos march moment science ca nt be only one taken good long at society thought fuck gone berserk gon na go xxunk llamas well moment science berserk llama syndrome four species include llamas alpacas genetic testing indicates llamas alpacas were likely domesticated from respectively walking xxunk are capable breeding with each other which can result fertile offspring are mostly found wild while llamas alpacas are off living xxunk lives xxunk being bred their renewable supply xxunk socks distant cousin xxunk these pack animals are west coast south bred wool six continents baby llamas called weigh lbs at birth typically grow be about lb adults are around same size while adult alpacas are lb range wool texture color can vary greatly from one pack next tempting as may be go hardcore procuring your xxunk supplies gon na throw out there maybe its not merely because an enemy joy though often linked,science,science
1,xxbos free economy shipping orders over applicable only us learn more about our shipping policy view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all view all login see your shopping cart this is an important measure protect your email privacy check your inbox an email from may take up minutes message arrive click link email complete your newsletter subscription be xxunk page where you can access all free content is exclusive subscribers if you see email you may need check promotions tab or spam xxunk your inbox there is no need submit your email address again if you still,pseudoscience,science
2,xxbos stock ready ship verified customer team member xxunk ever since we launched our small company have wanted make snack bar would be happy eat you see always been type person reward myself at end long day with little treat was xxunk i d grab my favorite cup tea treat then i d sit xxunk enjoy was little me moment would help me xxunk from day problem sometimes would eat piece chocolate other times was another snack would feel good moment horrible later needed something better why set out create something better wanted snack bar snack bar could xxunk eat at end long day feel good about only had one question what makes great snack bar had rules ever eat snack bar seemingly xxunk all xxunk out your mouth as if you spent an xxunk desert or worse bar so sticky you feel like you need wash your hands immediately,pseudoscience,pseudoscience
3,xxbos its allies have agreed limit price oil barrel democrat will face republican walker after long at times bitter campaign as men finally go trial over attacks speaks victims families survive late scare against united states move into world cup quarterfinals with ruthless display finishing follow live text updates test match special commentary from day five first test between xxunk victory over set up world cup quarterfinal with was more proof sure touch team selection writes becomes top xxunk but he is upstaged by performance their last win over most amazing videos from why some jobs turn out be very different what was xxunk getting here easy but those who make are rewarded with perfect snow why soft girl xxunk is much more than social media trend mountain streams are helping make alpine villages energy china has vaccinated few old people making difficult abandon its policy its estimated plant animal,science,science
4,xxbos written by is renowned lifestyle editor been featured countless popular lifestyle publications websites is xxunk at handling editorial duties social media engagement partnerships edited by brings more than years journalism experience ensure articles reports have been edited overall clarity accuracy reader engagement scoop is an informational website founded by doctor who wanted make relevant medical information available people everywhere has spent over years researching effective healthcare practices now uses his extensive knowledge debunk health myths quack products couples can read through resources learn how spot misinformation maintain healthy lifestyle based proven science at is still an active competitive xxunk he set six state records his home state north won three gold xxunk world senior swimming regularly has enabled stay strong fit well into his he xxunk his longevity combination his xxunk swimming career his cooking throughout their marriage she has helped him keep his diet track lead long happy,science,science
5,xxbos holy high fructose corn syrup health xxunk defender has an article exposes yet more twisted truth about academy pediatrics company once liked teach he world sing perfect harmony funds pediatric healthcare not be confused with health many years ago was at pediatrics xxunk by row vending machines xxunk with junk food soda glass soda back when was at most xxunk made with sugar was xxunk treat big food such as is food industry is so intertwined with healthcare its impossible separate them as an aside if you watched where s my recommend academy pediatrics great partner ours new paper public health nutrition reveals influence public health conferences events speakers academic institutions public health groups including academy content article follows truth about how uncovered biggest lie day huff left his senior scientist vice president role at alliance was one happiest days his life due corruption he had witnessed at organization,pseudoscience,pseudoscience
6,xxbos skeptics dictionary hundreds definitions arguments essays occult topics ranging from acupuncture zombies skeptics dictionary is lively commonsense xxunk detailed information all things supernatural paranormal each entry gives skeptical definition or description most entries include both internal external links related material including comments from readers my here you will find brief description what skeptics dictionary is how came about complete list you will find an alphabetical listing all dictionary entries from acupuncture zombies complete list reader comments here you will find an alphabetical listing all reader comments dictionary entries xxunk introduction skeptics dictionary is book these are exactly what they xxunk be please read these before submitting comments translations is translation skeptics dictionary done xxunk by there are also many entries have been translated into by by has translated many entries into translation is also available paperback many entries have been translated into french by good people at several entries,science,science
7,xxbos as member food babe army you will be front lines this movement is sweeping food industry by storm creating better food system us all be part this change also be first know about major campaigns investigations along with future petitions news about great xxunk we are taking xxunk all delicious organic food my pantry wo nt sell your email address or send you spam only delicious organic living tips recipes just enter your email address below get started dissect your favorite foods learn truth about food health industry food babe army is powerful group people who are committed demanding actively creating change food industry we are millions people who sign petitions call major company headquarters share reports investigations with our friends use our voices make sure we are heard most importantly we vote with our dollars supporting companies are creating healthier food system all our mission is create healthier,pseudoscience,pseudoscience
8,xxbos by senior political reporter deputy us political editor published est updated est view comments baron xxunk trump rapper west during his performance at center honors night same evening made his dramatic first public appearance since he was attacked his xxunk xxunk trump having orange skin fat belly he also criticized antisemitic remarks during his xxunk rock band audience xxunk with laughter response actors xxunk am told president yours today is here where are you trump said as he began his xxunk as cameras xxunk president sitting box seat joked you look so good adding your big belly is your pretty orange skin has gone xxunk rest audience xxunk with laughter never breaking turned his attention seated next president but see you have new wife she is very xxunk must look away before get said first lady xxunk xxunk response kept up his performance as journalist as he went after,pseudoscience,pseudoscience


## Making Model Predictions

In [23]:
# Pages used to spot-check the trained model, keyed by their expected label.
# "https://www.foxnews.com/opinion" is also one of the training seed pages.
test_categories = {
    #'unknown':[],
    'pseudoscience': [
        'https://www.foxnews.com/opinion',
        'https://newspunch.com/',
        'https://www.huffpost.com/',
    ],
    'science': [
        'https://www.si.edu/explore/science',
        'https://www.theskepticsguide.org/about',
        'https://arstechnica.com/',
    ],
}

In [27]:
# Scrape each test page live (no cache), classify its cleaned text, and record the
# confidence the model assigned to the expected label.
d_pred = {}

for category in test_categories:
    for source in test_categories[category]:
        page = get_page_all(source, k, max_words, ignore_text, ignore_common)
        length = len(page.cleaned_text)
        if length < min_words:
            print("ERROR:",source,length,"words")
            continue

        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            prediction = learn.predict(text)
        # Probability of the predicted class (the largest entry of the probability tensor).
        p = prediction[2].max().item()

        train_source = source in d_train
        # Confidence given to the expected label. `1-p` is only the probability of
        # the other class because there are exactly two labels.
        # Named `score`, not `accuracy`, so the fastai `accuracy` metric used by the
        # training cell is not overwritten.
        score = p if category == prediction[0] else 1-p
        d_pred[source] = [category, prediction[0], p, train_source, score]

# Kept separate from `df`, which is the training frame the DataLoaders cell reads.
df_pred = pd.DataFrame.from_dict(d_pred, orient='index', columns=['actual', 'prediction', 'probability', 'training source', 'accuracy'])

avg_accuracy = df_pred['accuracy'].mean()
train_accuracy = df_pred.loc[df_pred['training source'] == True, 'accuracy'].mean()
test_accuracy = df_pred.loc[df_pred['training source'] == False, 'accuracy'].mean()

print("Average Accuracy =",avg_accuracy)
print("Train Source Accuracy =",train_accuracy)
print("Test Source Accuracy =",test_accuracy)
df_pred
#df_pred.loc[df_pred['training source'] == False]

Average Accuracy = 0.894449402888616
Train Source Accuracy = 0.9994723200798035
Test Source Accuracy = 0.8734448194503784


Unnamed: 0,actual,prediction,probability,training source,accuracy
https://www.foxnews.com/opinion,pseudoscience,pseudoscience,0.999472,True,0.999472
https://newspunch.com/,pseudoscience,pseudoscience,0.733288,False,0.733288
https://www.huffpost.com/,pseudoscience,pseudoscience,0.883438,False,0.883438
https://www.si.edu/explore/science,science,science,0.957423,False,0.957423
https://www.theskepticsguide.org/about,science,science,0.843022,False,0.843022
https://arstechnica.com/,science,science,0.950053,False,0.950053


## Exporting and Loading the Model

In [29]:
#learn.export('models/2022.12.07 Model v1 87pct')

In [26]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)