# data

> Web scraping and tools for data collection and processing

In [None]:
#| default_exp data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sbavery/pseudometer/blob/main/nbs/01_data.ipynb)

In [None]:
#| hide
## Google Colab / Enchant Library Install for Dictionaries
#!apt update
#!apt install enchant-2 --fix-missing
#!apt install -qq enchant-2

In [None]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

In [None]:
#| hide
## Utility Function to Check GPU Status
def check_gpu():
    """Print CUDA availability followed by one status line per visible GPU.

    Each device line shows the device index, its name, and the allocated and
    reserved ("Cached") memory in units of 1024**3 bytes, rounded to one decimal.
    Returns nothing; the report goes to stdout.
    """
    bytes_per_gb = 1024**3
    print("CUDA Available: ", torch.cuda.is_available())
    # range() over a zero device count simply yields nothing, so no guard is needed.
    for idx in range(torch.cuda.device_count()):
        allocated_gb = round(torch.cuda.memory_allocated(idx)/bytes_per_gb, 1)
        reserved_gb = round(torch.cuda.memory_reserved(idx)/bytes_per_gb, 1)
        print("Device", idx, "|", torch.cuda.get_device_name(idx),
              "| Allocated:", allocated_gb, "GB",
              "| Cached:", reserved_gb, "GB")

# Runs once when this cell executes: empty PyTorch's CUDA cache before reporting.
torch.cuda.empty_cache()

In [None]:
#| hide
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce GTX 1050 Ti | Allocated: 0.4 GB | Cached: 0.6 GB


## Web Scraper

In [None]:
#| export
class Webpage:
    """One web page, identified by its URL, with helpers to fetch, parse, clean and cache it.

    Attributes are filled in step by step by the methods below:
      url               -- the address given to the constructor
      hash              -- SHA3-256 hex digest of the URL (callers use it as a cache file stem)
      requested         -- True once `get_page_html` has completed successfully
      page_text         -- decoded body returned by `get_page`
      html              -- BeautifulSoup tree built from `page_text` ("" until fetched)
      links             -- hrefs collected by `get_html_anchors`
      text              -- raw text snippets collected by `get_html_text`
      cleaned_text      -- filtered words produced by `clean_html_text`
      most_common_words -- (word, count) pairs produced by `k_common_words`
    """

    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size):
        """Stream the page body and return it as text, or None if it is out of bounds.

        None is returned when the declared Content-Length or the number of bytes
        actually streamed exceeds `max_size`, or when the decoded text is shorter
        than `min_size` characters.
        """
        # The context manager closes the streamed connection on every exit path,
        # including the early returns taken for oversized pages.
        with requests.get(self.url, stream=True, headers=headers) as r:
            content_length = int(r.headers.get('Content-Length', 0))
            if content_length > max_size:
                return None

            data = []
            length = 0
            for chunk in r.iter_content(1024):
                data.append(chunk)
                length += len(chunk)
                if length > max_size:
                    return None
            # The stream has been consumed by hand, so hand the bytes back to the
            # response object in order to reuse its decoding via `r.text`.
            r._content = b''.join(data)
            page_text = r.text
        if len(page_text) < min_size: return None
        return page_text

    def get_page_html(self, min_size=1000, max_size=2000000):
        """Download the page with a randomly chosen User-Agent and parse it into `self.html`.

        Raises:
            ValueError: if `get_page` rejected the page for being smaller than
                `min_size` or larger than `max_size`.
        """
        user_agents = [ 
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36', 
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36', 
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148', 
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36' 
        ] 
        headers = {'User-Agent': random.choice(user_agents)}
        self.page_text = self.get_page(headers, min_size, max_size)
        if self.page_text is None:
            # Fail with a clear message instead of passing None on to the parser.
            raise ValueError(
                "%s: page is outside the accepted size range (min_size=%s, max_size=%s)"
                % (self.url, min_size, max_size))
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        """Return the SHA3-256 hex digest of the URL concatenated with `inp`."""
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        """Append to `self.links` the href of every <a> tag whose href contains `keyword`."""
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if not link:  # no href attribute, or an empty one
                continue
            if keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=("p",)):
        """Append to `self.text` the stripped, non-empty text of every element named in `tags`."""
        for tag in tags:
            for element in self.html.find_all(tag):
                snippet = element.getText().strip()
                if snippet:
                    self.text.append(snippet)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=(), rx="[^a-zA-Z ]+", min_word_len=2):
        """Lower-case and filter `self.text`, appending at most `max_words` words to `self.cleaned_text`.

        Characters matching the regex `rx` are removed first. A word is kept when it
        is not in `ignore` (an iterable of words), has at least `min_word_len`
        characters and, unless `enchant_dict` is "", passes the spell check of that
        enchant dictionary. `max_words` caps the total length of `self.cleaned_text`.
        """
        all_text = ' '.join(self.text).lower()
        regex_text = re.sub(rx,'',all_text)
        ignored = frozenset(ignore)
        # An empty dictionary name disables the spell check altogether.
        spell = enchant.Dict(enchant_dict) if enchant_dict != "" else None
        for word in regex_text.split():
            if len(self.cleaned_text) >= max_words: break
            if word in ignored or len(word) < min_word_len:
                continue
            if spell is None or spell.check(word):
                self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=()):
        """Store the `k` most frequent words as (word, count) pairs in `self.most_common_words`.

        Counts are taken over `self.cleaned_text`, or over the raw `self.text` when no
        cleaned text exists. Words listed in `ignore` are not counted.
        """
        # `cleaned_text` is a list, so emptiness must be tested by truthiness;
        # comparing it with "" never matched and the fallback to raw text never ran.
        text = self.cleaned_text if self.cleaned_text else self.text
        ignored = frozenset(ignore)
        words = ' '.join(text).lower().split()
        counts = Counter(word for word in words if word not in ignored)
        self.most_common_words = counts.most_common(k)

    def save_text(self, path, fname):
        """Pickle `self.text` to the file `path+fname`."""
        with open(path+fname, 'wb') as file:
            pickle.dump(self.text, file)

    def load_text(self, path, fname):
        """Replace `self.text` with the object pickled in `path+fname`."""
        # NOTE(review): pickle.load can execute arbitrary code; only load cache
        # files that this project wrote itself.
        with open(path+fname, 'rb') as file:
            self.text = pickle.load(file)

    def save_links(self, path, fname):
        """Pickle `self.links` to the file `path+fname`."""
        with open(path+fname, 'wb') as file:
            pickle.dump(self.links, file)

    def load_links(self, path, fname):
        """Replace `self.links` with the object pickled in `path+fname`."""
        # NOTE(review): pickle.load can execute arbitrary code; only load cache
        # files that this project wrote itself.
        with open(path+fname, 'rb') as file:
            self.links = pickle.load(file)

In [None]:
#| hide
# Download a plain-text list of common English words (one per line); slices of it
# are later passed as `ignore` lists.
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_page_html(min_size=1000)
english_words = common_english.html.getText().lower().split('\n')
print(len(english_words),"most common English words")
#english_words

1000 most common English words


In [None]:
# Seed URLs crawled for the 'pseudoscience' class.
pseudo_sources = [
    "http://www.ageofautism.com/",
    "http://www.naturalnews.com",
    "https://foodbabe.com/starthere/",
    "http://www.chopra.com",
    "https://www.mercola.com/",
    "https://www.history.com/",
    "https://doctoroz.com/",
    "https://www.disclose.tv/",
    "https://nationalreport.net/",
    "https://heartland.org/",
    "https://www.dailymail.co.uk/",
    "https://www.motherjones.com/",
]

# Seed URLs crawled for the 'science' class.
science_sources = [
    "https://sciencebasedmedicine.org/",
    "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
    "https://www.bbc.com/news/science_and_environment",
    "https://www.nature.com/",
    "https://www.science.org/",
    "https://www.snopes.com/top/",
    "https://quackwatch.org/",
    "https://www.skepdic.com/",
    "http://scibabe.com/",
    "http://pandasthumb.org/",
    "https://skepticalscience.com/",
    "https://www.cdc.gov/",
    "https://apnews.com/",
]

In [None]:
#| hide
# Pick one science source as the demo page and make sure the cache directory exists.
url = science_sources[7]
path = os.getcwd()+'/data/'
# exist_ok=True replaces the separate isdir check, so there is no window between
# testing for the directory and creating it.
os.makedirs(path, exist_ok=True)

In [None]:
#| hide
# End-to-end smoke run on a single page: fetch, extract text and links, clean the
# text, then cache both under `path` using the URL hash as the file stem.
test_page = Webpage(url)
test_page.get_page_html()
test_page.get_html_text()  # default tags, i.e. <p> elements only
test_page.get_html_anchors()
# Keep at most 500 words, dropping the first 50 entries of the common-English list.
test_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
test_page.save_text(path, test_page.hash+'.text')
test_page.save_links(path, test_page.hash+'.links')

In [None]:
# Same page again, this time preferring the cached text and links written above
# and only going to the network for whatever is missing from the cache.
new_page = Webpage(url)
fname_text = new_page.hash+'.text'
fname_links = new_page.hash+'.links'
if os.path.isfile(path+fname_text): 
    new_page.load_text(path, fname_text)
    print("Loading Text")
else:
    new_page.get_page_html()
    new_page.get_html_text(tags=["p","h1","h2","h3","span"])
    new_page.save_text(path, fname_text)

if os.path.isfile(path+fname_links): 
    new_page.load_links(path, fname_links)
    print("Loading Links")
else:
    # Reuse the HTML if the text branch above already downloaded it, instead of
    # requesting the same page a second time.
    if not new_page.requested: new_page.get_page_html()
    new_page.get_html_anchors()
    new_page.save_links(path, fname_links)
new_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
new_page.k_common_words(k=5,ignore=english_words[:50])
' '.join(new_page.cleaned_text)[:200]

Loading Text
Loading Links


'skeptics dictionary features definitions arguments essays hundreds strange beliefs amusing deceptions dangerous delusions also features dozens entries logical fallacies cognitive biases perception sci'

In [None]:
#| export
def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
    """Build a `Webpage` for `url` with its text, links, cleaned text and common words filled in.

    With `path` left as None everything is fetched from the network. Otherwise the
    pickled text and links stored under `path` (file names derived from the page
    hash) are loaded when present, and whatever is missing is fetched and then
    saved there.

    `max_words` and `ignore_text` are forwarded to `clean_html_text`; `k` and
    `ignore_common` are forwarded to `k_common_words`. Returns the `Webpage`.
    """
    text_tags = ["p","h1","h2","h3","span"]
    page = Webpage(url)

    if path is None:
        page.get_page_html()
        page.get_html_text(tags=text_tags)
        page.get_html_anchors()
    else:
        text_file = page.hash+'.text'
        links_file = page.hash+'.links'

        if not os.path.isfile(path+text_file):
            page.get_page_html()
            page.get_html_text(tags=text_tags)
            page.save_text(path, text_file)
        else:
            page.load_text(path, text_file)

        if not os.path.isfile(path+links_file):
            # Only download here if the text step above did not already do so.
            if page.html == "": page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, links_file)
        else:
            page.load_links(path, links_file)

    if page.text is None:
        return page
    page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
    page.k_common_words(k=k, ignore=ignore_common)
    return page

def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    """Scrape `url` and the pages it links to, recording the results in `dict`.

    `dict` is updated in place with `page_url -> [cleaned_text, most_common_words]`.
    The primary page is stored whenever it has any cleaned text. A linked page is
    stored only if its URL contains none of the `ignore_filenames` substrings, it
    has at least `min_words` cleaned words, and its content is not already present
    in `dict`. At most `max_links` links are followed; "" or None means all of them.

    `k`, `max_words`, `ignore_text`, `ignore_common` and `path` are passed through
    to `get_page_all`. A progress line is rewritten on stdout for every link.
    Returns None.
    """
    primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
    # Emptiness has to be tested by truthiness: `x is not []` compares identity
    # with a brand-new list and is therefore always true.
    if not primary_page.cleaned_text:
        print(url,"returned None, Skipping...")
        return

    dict[url] = [primary_page.cleaned_text, primary_page.most_common_words]
    if max_links is None or max_links == "" or max_links > len(primary_page.links):
        max_links = len(primary_page.links)
    links_to_visit = primary_page.links[:max_links]

    # Count from 1 and report against the number of links actually visited, so
    # the last progress line reads n/n.
    for count, link in enumerate(links_to_visit, start=1):
        if all(x not in link for x in ignore_filenames):
            try:
                page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
                entry = [page.cleaned_text, page.most_common_words]
                if len(page.cleaned_text) >= min_words and entry not in dict.values():
                    dict[link] = entry
            except Exception:
                # Best effort: a link that cannot be fetched or parsed is reported
                # as rejected below. KeyboardInterrupt is deliberately not caught,
                # so a long crawl can still be stopped.
                pass
        if link in dict:
            res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
        else:
            res = "Rejected"
        # The trailing spaces overwrite leftovers from a longer previous line.
        progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(links_to_visit), link, res, 200*' ')
        sys.stdout.write("\r" + progress_message)
        sys.stdout.flush()

In [None]:
# Crawl / dataset parameters shared by the cells below.
k = 30 # number of most-common words kept per page (forwarded to k_common_words)
min_words = 50 # pages with fewer cleaned words than this are rejected
max_words = 450 # cap on the number of cleaned words kept per page
max_links = 100 # maximum number of links followed per source page
# Words removed from the page text during cleaning.
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
# Words left out of the most-common-word counts: first 50 entries of the downloaded list.
ignore_common = english_words[:50]
# A link containing any of these substrings is not followed.
ignore_filenames = [".mp3",".jpg",".png",".mp4",".jfif","facebook.com","twitter.com"]

In [None]:
#| hide
# Crawl every seed source, caching pages per class under data/pseudoscience/ and
# data/science/ and collecting the results in d_pse / d_sci.
d_pse = {}
d_sci = {}
path = os.getcwd()+'/data/'
path_pse = path+'pseudoscience/'
path_sci = path+'science/'
# makedirs also creates the parent data/ directory; exist_ok=True replaces the
# three separate check-then-mkdir pairs.
os.makedirs(path_pse, exist_ok=True)
os.makedirs(path_sci, exist_ok=True)

for source in pseudo_sources:
    get_all_links(source, d_pse, k, min_words, max_words, ignore_text, ignore_common, 
    ignore_filenames, max_links, path_pse)
for source in science_sources:
    get_all_links(source, d_sci, k, min_words, max_words, ignore_text, ignore_common, 
    ignore_filenames, max_links, path_sci)


https://apnews.com/ link   21/  22 | https://www.ap.org/careers/ = 156 words | [('our', 11), ('us', 3), ('career', 3)]                                                                                                                                                                                                                                                                                                                                        cted                                                                                                                                                                                                         

In [None]:
#| hide
# Sum the per-page (word, count) lists into one tally per class.
count_pse = Counter()
for link in d_pse:
    count_pse.update(dict(d_pse[link][1]))

count_sci = Counter()
for link in d_sci:
    count_sci.update(dict(d_sci[link][1]))

print("#### Pseudoscience",k,"Most Common Words ####\n",count_pse.most_common(k),"\n\n")
print("#### Science",k,"Most Common Words ####\n",count_sci.most_common(k),"\n\n")

#### Pseudoscience 30 Most Common Words ####
 [('our', 1150), ('health', 715), ('their', 669), ('food', 621), ('more', 600), ('has', 597), ('us', 495), ('about', 469), ('its', 415), ('will', 397), ('mother', 391), ('my', 387), ('policy', 336), ('her', 313), ('who', 303), ('information', 285), ('news', 285), ('tweet', 279), ('if', 274), ('people', 258), ('new', 257), ('share', 250), ('email', 239), ('subscribe', 234), ('been', 232), ('heartland', 226), ('which', 222), ('no', 220), ('pin', 216), ('after', 204)] 


#### Science 30 Most Common Words ####
 [('medicine', 906), ('more', 808), ('our', 784), ('about', 760), ('science', 735), ('health', 619), ('menu', 485), ('climate', 460), ('research', 452), ('us', 426), ('information', 378), ('care', 373), ('access', 372), ('news', 368), ('has', 350), ('its', 334), ('new', 328), ('will', 328), ('nature', 325), ('johns', 324), ('these', 302), ('used', 295), ('their', 277), ('cookies', 275), ('if', 267), ('may', 249), ('storage', 233), ('techni

In [None]:
#| hide
# Print any URL that was scraped for both classes (no output means no overlap).
for link in filter(d_sci.__contains__, d_pse):
    print(link)

## Data Preparation

In [None]:
#| hide
# Merge both crawls into one labelled table: url -> [text, common_words, label].
# Pseudoscience pages go in first, so a URL present in both crawls keeps that label.
d_all = {}
for label, scraped in (('pseudoscience', d_pse), ('science', d_sci)):
    for link in scraped:
        # Slicing caps the text at max_words and is a no-op for shorter pages.
        text = scraped[link][0][:max_words]
        common_words = ' '.join(pair[0] for pair in scraped[link][1])
        if link in d_all:
            continue
        d_all[link] = [' '.join(text), common_words, label]

df = pd.DataFrame.from_dict(d_all, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
http://www.ageofautism.com/,thanks you we hit our goal can not thank you enough those who donated those who read us those who share our posts those who contributed over last years most all our benefactors who so generously matched ill let you secret we are what is known tax parlance as post card organization does not mean we vacation so often we send postcards weekly means we are small but mighty our tax return fits post card feel energized grateful losing was earth shattering he was our our journalism genius our kindhearted rapier witted rebel alliance leader knew instinctively story autism was profoundly important ...,our who twitter deaths those so during percent content autism lies age been cause unknown people than see policy goal thank us tax post card mean story even has aware,pseudoscience
https://www.ageofautism.com/donate.html,hello your donation autism age is tax now use secure online donations scroll down their easy use form you can always send us paper or electronic check as well email me at any time with ideas suggestions or gentle critiques our is thank you ed cause unknown epidemic sudden deaths health defense transcend fear blueprint mindful leadership public health jr real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate donate top,health donate defense public hello donation autism age tax now secure online donations scroll down their easy form always send us paper electronic check well email me any time ideas,pseudoscience
https://www.ageofautism.com/contact-us.html,autism age box ct ed cause unknown epidemic sudden deaths health defense transcend fear blueprint mindful leadership public health jr real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate contact us top,health defense public donate autism age box ct ed cause unknown epidemic sudden deaths transcend fear blueprint mindful leadership jr real bill gates big global war democracy click cover buy,pseudoscience
https://www.ageofautism.com/exclusives.html,editorials from series by here come you had me at an elaborate fraud series deer special report what do epidemiological studies really tell us note from there are epidemiological studies here vaccines autism these studies represent most often cited papers by scientists public health officials members media when trying refute any evidence an association between vaccinations autism there are serious methodological limitations design flaws conflicts interest or other problems related each these studies these flaws have been pointed out by government officials other researchers medical review ...,autism studies health these series here public epidemiological vaccines officials limitations flaws study defense donate editorials come me elaborate fraud deer special report do really tell us note represent most,pseudoscience
https://www.ageofautism.com/science/,tom urged get their bivalent vaccine booster yesterday twitter kindly let him his followers know about week class starting this week through countermeasures injury compensation program this is program absolves corporations whose products harm during pandemic you can take course both live recorded version national vaccine injury compensation program countermeasures injury compensation program used emergency authorized this course students will learn structure function defects programs us created by congress award compensation adults children potentially actually harmed or killed by vaccines...,compensation injury program vaccine course high disease consequence no countermeasures us four their know week will john march health before been infectious has get class through products both live national,pseudoscience


In [None]:
# Build the text DataLoaders from the scraped frame: `text` is the input column,
# `label` is the target, batch size 16.
# NOTE(review): no seed is passed here, so the train/validation split presumably
# differs from run to run — confirm, and consider passing `seed=` for reproducibility.
dls = TextDataLoaders.from_df(df, bs=16, text_col='text', label_col='label')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos this exchange consists seven posts each with comment by reply by plus one other post by is translation mostly by google translate from original posts comments have kindly xxunk translation you they are welcome comment at xxunk thumb keep mind posts these xxunk comments were written two years ago thus can not take more recent comments here into account we hope publish sections seven consecutive we chose day week without remembering its connection ancient days readers these authors are invited contribute further comments our comment system this weeks post starts with an introduction by followed by first his posts at comment by followed by xxunk by by two years ago professors informatics respectively published an article titled using statistical methods model molecular machines systems journal theoretical biology regular scientific intelligent design i d community publication was seen as breakthrough i d close reading paper however is not convincing seven,science
1,xxbos this is an additional post by made at site after seven posts his breakthrough intelligent design series were made he has requested we post here after second post series explain his position post what can be considered legitimate science central point discussion have with posts entitled breakthrough design concerns limits what can be considered legitimate science since his comments this matter are too long comments section have made post subject central question is how formulate hypothesis is suggested explanation hypotheses are not formulated an xxunk way you just grab something air hypothesis should be based prior knowledge should also be falsifiable must be possible show is incorrect scientific work then consists hypothesis whether proves be true or false something new has been learned thing here is hypothesis is formulated such way can be tested with result if you can not do you can not learn anything new if test,science
2,xxbos this blog ran from march has been replaced by no armed guards xxunk no raising smart kids ct no unwanted xxunk no xxunk design no autism no profiling cold reading no jun experiences xxunk enemies reason xxunk amusement parks mystery park closed no may peter evolution bacterial xxunk chiropractors run xxunk cam aids soldiers forced work as male xxunk st no report tam xxunk skeptical journalists no science religion politics no mar bigots begins filming what will become its hit show paranormal state ted hypocrisy psychic xxunk without clue critical thinking ct xxunk politics science young earth creationists poll reveals superstitions mercury health healing prayer studies find people who pray are talking themselves mice no march mostly republican war science no march abortion zoo takes intelligent design association advancement science xxunk educational practices no xxunk over cartoons no bizarre case tale torture murder rape xxunk satanic xxunk xxunk films,science


In [None]:
#| hide
torch.cuda.empty_cache()

In [None]:
# Text classifier on the AWD_LSTM architecture, tracking accuracy as the metric.
# fine_tune(4, 1e-2): the recorded output below shows one initial epoch followed
# by four further epochs.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.569296,0.391617,0.829384,00:50


epoch,train_loss,valid_loss,accuracy,time
0,0.429256,0.315014,0.876777,01:32
1,0.364461,0.34955,0.862559,01:32
2,0.239097,0.230481,0.914692,01:32
3,0.160992,0.202744,0.914692,01:32


In [None]:
#| hide
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce GTX 1050 Ti | Allocated: 0.4 GB | Cached: 2.1 GB


In [None]:
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos mother illustration meeting which explained trump planned declare victory election night regardless actual results was not supposed be big deal gathering was intended help associates his patron xxunk mogul plan election night coverage news one media companies but wanted talk he explained repeatedly trump would announce early he had won would suggest any apparent victory had resulted from fraud he s just gon na go this is just with like percent vote counted trumps just gon na walk go winner said also took credit xxunk trump into position execute strategy saying several times he had xxunk plan arrested momentum late campaign was leak material from hunter laptop was very close winning outright then shutting down trump said hard drive from hell stopped his momentum drove up his xxunk argued failure respond allegations about hunter including false ones had driven up negative perceptions former vice president compared hunter material release,pseudoscience,pseudoscience
1,xxbos walker speaks during campaign rally john not too long ago walker said he had experienced racism having been stopped harassed by police because he was black man he expressed his fear police will abuse his son because he s black yet during his campaign senate walker republican now runoff against democratic incumbent has repeatedly declared racism does not exist united states this is all part confusing series remarks about racism walker has made recent years year before he would announce his senate bid walker appeared podcast jr black conservative discussed assorted matters relating race referring his son walker noted he was afraid if police stop him because he s black moreover walker recalled his own xxunk xxunk with law enforcement officers been stopped by police been harassed by police before discussing one incident when he was pulled over by cops he said why world are you gon na stop,pseudoscience,science
2,xxbos organic vegan xxunk organic plant based protein powder with ingredients your body will thank you add scoop smoothies overnight xxunk or baking need recipe go just add water sweet smooth way prevent pm crash free shipping within us this was reaction when she tried chocolate protein how are we able get rid xxunk texture xxunk flavor you get from other protein powders simple we use unique blend plant proteins one main ingredients is organic pea protein but instead turning whole pea pod all into pea protein we xxunk first this makes our powder smooth delicious avoids additives good reason with only ingredients be wondering how does taste so good answer we use real food keep pure no fillers no preservatives no artificial colors no artificial sweeteners we also avoid instead we use organic xxunk fruit add just right amount sweetness view nutrition facts view nutrition facts going double check,pseudoscience,pseudoscience
3,xxbos by daily mail est updated est as one most anticipated events global fashion calendar fashion councils annual awards ceremony is xxunk biggest names industry but with only few days left before this years xxunk event at royal hall dark disturbing cloud has been cast over proceedings scandal which no doubt has left both guests xxunk at their xxunk xxunk among xxunk famous xxunk xxunk scoop gong designer year no less is none other than creative director luxury fashion brand which this week was midst scandal over its use images children references child pornography controversy was sparked by its advertising campaign which showed xxunk clutching bags disguised as teddy bears what has been described as bondage gear as if this creepy enough separate spring campaign featuring xxunk neither whom had any creative input features xxunk placed top legal documents relating child pornography shot there is also book about controversial artist,pseudoscience,pseudoscience
4,xxbos stock ready ship verified customer team member hi ever since we launched our small company have wanted make snack bar would be happy eat you see always been type person reward myself at end long day with little treat was ritual i d grab my favorite cup tea treat then i d sit couch enjoy was little me moment would help me unwind from day problem sometimes would eat piece chocolate other times was another snack would feel good moment horrible later needed something better why set out create something better wanted snack bar snack bar could xxunk eat at end long day feel good about only had one question what makes great snack bar had rules ever eat snack bar seemingly xxunk all xxunk out your mouth as if you spent an eternity desert or worse bar so sticky you feel like you need wash your hands immediately,pseudoscience,pseudoscience
5,xxbos about this rating we received reader mail asked if copying pasting text from post could help show more content from friends also remove ads posts which we refer as claimed posting text would upgrade system help regain friends your news feed get rid ads however this was hoax most posts we found looked like this posts read as follows regain friends your news feed get rid ads hold your finger anywhere this post click copy go your page where says what s your mind tap your finger anywhere blank field click paste this upgrades system hello new old friends we previously reported other variations this same hoax one such post from claimed copying pasting its text could help avoid hearing from same friends nobody else another one promised copying pasting characters could circumvent algorithm make old friends appear users feeds neither these posts was true published an article about,science,science
6,xxbos march moment science my focus is somewhere neighborhood xxunk today probably because made mistake reading news about xxunk at everything anyway spirit not being able xxunk here are random xxunk science keep me xxunk at night moment science hey did you know we still have widespread agreement if virus something can be dead is technically alive first place xxunk infectious proteins can cause neurological disorders are simultaneously not alive damn near impossible kill largest living organism is mushroom makes blue xxunk look like xxunk genetic tests conducted samples from all over ton fungus covering xxunk forest confirmed was indeed one is considered delicious if not mildly poisonous when cooked xxunk we know what color dinosaurs were do with information what you will but xxunk xxunk xxunk xxunk xxunk looks pretty when its about feet below sea level bring surface where has no business hanging out looks well way you,science,science
7,xxbos xxunk jr xxunk onto stage at southern church radiating confidence xxunk standing xxunk crowd with his xxunk blue bobby eyes then he launched into an rant democrats drank he told people assembled far right conference branded as standing health freedom is criminal medical xxunk give child one these vaccines xxunk according video event one his many assertions ignored or went against legal scientific public health consensus then xxunk his book if just attendees amazon night he told crowd would land bestseller list they could stick amazon all profits he said would go his charity health defense while many nonprofits businesses have struggled during pandemic group has xxunk an investigation by associated press finds health defense has xxunk funding followers as used his star power as member one most famous families open doors raise money lend his group credibility filings with charity regulators show revenue more than doubled million since,science,science
8,xxbos who we are our website address what personal data we collect why we collect comments when visitors leave comments site we collect data shown comments form also visitors address browser user agent string help spam detection an string created from your email address also called xxunk may be provided service see if you are using service privacy policy is available here after approval your comment your profile picture is visible public context your comment media though this is not something generally happens with visitors this website bears noting if you upload images website you should avoid xxunk images with embedded location data included visitors website can download extract any location data from images website contact forms cookies if you leave comment our site you may saving your name email address website cookies these are your convenience so you do not have fill your details again when you leave another,science,science


## Making Model Predictions

In [None]:
# Held-out sources with their expected label.
test_sources = {
    'https://infowarslife.com/': 'pseudoscience',
    'https://www.si.edu/explore/science': 'science',
    'https://www.foxnews.com/opinion': 'pseudoscience',
    'https://www.theskepticsguide.org/about': 'science',
    'https://www.huffpost.com/': 'pseudoscience',
    'https://arstechnica.com/': 'science',
    'https://newspunch.com/': 'pseudoscience',
}

# The training seed URLs are appended so both groups are scored in one pass.
# Note that `sources` is the same dict object as `test_sources`, not a copy.
pseudo_train_sources = {source: 'pseudoscience' for source in pseudo_sources}
science_train_sources = {source: 'science' for source in science_sources}
test_sources.update(pseudo_train_sources)
test_sources.update(science_train_sources)
sources = test_sources

In [None]:
# Score every source with the trained model.
# d_pred maps url -> [actual label, predicted label, probability of the predicted
# label, whether the url was in the training data, probability given to the actual label].
d_pred = {}

for source in sources:
    page = get_page_all(source, k, max_words, ignore_text, ignore_common)
    length = len(page.cleaned_text)
    if length < min_words:
        print("ERROR:",source,length,"words")
        continue

    text = ' '.join(page.cleaned_text)
    with learn.no_bar(), learn.no_logging():
        prediction = learn.predict(text)
    # prediction[2] holds the class probabilities; pick the predicted class's one.
    if prediction[0] == "science":
        p = prediction[2][1].item()
    else:
        p = prediction[2][0].item()

    train_source = source in d_all
    # Named `label_prob` rather than `accuracy`, so that the `accuracy` metric
    # passed to text_classifier_learner is not overwritten in the notebook namespace.
    label_prob = p if sources[source] == prediction[0] else 1-p
    d_pred[source] = [sources[source], prediction[0], p, train_source, label_prob]

# Kept separate from `df` so the training frame stays available when earlier
# cells are re-run.
df_pred = pd.DataFrame.from_dict(d_pred, orient='index', columns=['actual', 'prediction', 'probability', 'training source', 'accuracy'])

avg_accuracy = df_pred['accuracy'].mean()
train_accuracy = df_pred.loc[df_pred['training source'] == True, 'accuracy'].mean()
test_accuracy = df_pred.loc[df_pred['training source'] == False, 'accuracy'].mean()

print("Average Accuracy =",avg_accuracy)
print("Train Source Accuracy =",train_accuracy)
print("Test Source Accuracy =",test_accuracy)
df_pred.loc[df_pred['training source'] == False]

Average Accuracy = 0.9211404025554657
Train Source Accuracy = 0.9328858351707459
Test Source Accuracy = 0.8791924289294651


Unnamed: 0,actual,prediction,probability,training source,accuracy
https://infowarslife.com/,pseudoscience,pseudoscience,0.964744,False,0.964744
https://www.si.edu/explore/science,science,science,0.997675,False,0.997675
https://www.foxnews.com/opinion,pseudoscience,pseudoscience,0.967668,False,0.967668
https://www.theskepticsguide.org/about,science,science,0.998813,False,0.998813
https://www.huffpost.com/,pseudoscience,pseudoscience,0.979651,False,0.979651
https://arstechnica.com/,science,pseudoscience,0.753343,False,0.246657
https://newspunch.com/,pseudoscience,pseudoscience,0.999139,False,0.999139


## Exporting and Loading the Model

In [None]:
#learn.export('models/2022.12.01 Model v1 88pct')

In [None]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)