# data

> Web scraping and tools for data collection and processing

In [None]:
#| default_exp data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sbavery/pseudometer/blob/main/nbs/01_data.ipynb)

In [None]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

In [None]:
## Utility Function to Check GPU Status
def check_gpu():
    """Print CUDA availability and, per visible device, its name and memory use.

    Memory figures come from ``torch.cuda.memory_allocated`` and
    ``torch.cuda.memory_reserved``, converted from bytes to GB (1024**3) and
    rounded to one decimal place. Nothing is returned.
    """
    bytes_per_gb = 1024**3
    print("CUDA Available: ", torch.cuda.is_available())
    # With no devices the range is empty and the loop body never runs.
    for idx in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(idx)
        allocated_gb = round(torch.cuda.memory_allocated(idx)/bytes_per_gb, 1)
        reserved_gb = round(torch.cuda.memory_reserved(idx)/bytes_per_gb, 1)
        print("Device", idx, "|", name,
              "| Allocated:", allocated_gb, "GB",
              "| Cached:", reserved_gb, "GB")

torch.cuda.empty_cache()

In [None]:
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.0 GB | Cached: 0.0 GB


## Web Scraper

In [None]:
#| export
class Webpage:
    """A single web page: fetch it, extract links and text, clean the text, cache results.

    Attributes:
        url: Address of the page.
        hash: SHA3-256 hex digest of ``url`` (used in this notebook as the
            stem of the cache file names).
        requested: True once ``get_page_html`` has fetched and parsed the page.
        page_text: Decoded response body ("" until fetched).
        html: Parsed ``BeautifulSoup`` document ("" until fetched).
        links: hrefs collected by ``get_html_anchors``.
        text: Text fragments collected by ``get_html_text``.
        cleaned_text: Words kept by ``clean_html_text``.
        most_common_words: ``(word, count)`` pairs set by ``k_common_words``.
    """

    # Browser-like User-Agent strings; one is chosen at random for each request.
    _USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
        'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36',
    ]

    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size, timeout=30):
        """Download ``self.url`` and return its decoded text, or None if out of bounds.

        Args:
            headers: HTTP headers sent with the request.
            min_size: Minimum number of decoded characters required.
            max_size: Maximum number of body bytes accepted.
            timeout: Seconds passed to ``requests.get`` so a stalled server
                cannot hang the caller indefinitely.

        Returns:
            The decoded body, or None when the declared or downloaded size
            exceeds ``max_size`` or the text is shorter than ``min_size``.
        """
        # The context manager closes the streamed connection on every exit
        # path, including the early size-limit returns below.
        with requests.get(self.url, stream=True, headers=headers, timeout=timeout) as r:
            content_length = int(r.headers.get('Content-Length', 0))
            if content_length > max_size:
                return None

            data = []
            length = 0
            # Read in 1 KiB chunks so an oversized body is abandoned early,
            # even when the server did not declare Content-Length.
            for chunk in r.iter_content(1024):
                data.append(chunk)
                length += len(chunk)
                if length > max_size:
                    return None
            # Hand the collected bytes back to the response so ``r.text``
            # decodes them the way requests normally would.
            r._content = b''.join(data)
            page_text = r.text
        if len(page_text) < min_size:
            return None
        return page_text

    def get_page_html(self, min_size=1000, max_size=2000000):
        """Fetch the page with a random User-Agent and parse it into ``self.html``.

        Raises:
            ValueError: if ``get_page`` rejected the page for its size. The
                original code failed here with an opaque TypeError raised
                inside BeautifulSoup.
        """
        headers = {'User-Agent': random.choice(self._USER_AGENTS)}
        page_text = self.get_page(headers, min_size, max_size)
        if page_text is None:
            raise ValueError(
                "%s: page size is outside the accepted range (min_size=%s, max_size=%s)"
                % (self.url, min_size, max_size))
        self.page_text = page_text
        self.html = BeautifulSoup(page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        """Return the SHA3-256 hex digest of ``self.url`` followed by ``inp``."""
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        """Append to ``self.links`` every non-empty ``<a href>`` containing ``keyword``."""
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if link and keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=("p",)):
        """Append the stripped, non-empty text of every element in ``tags`` to ``self.text``."""
        for tag in tags:
            for element in self.html.find_all(tag):
                fragment = element.get_text().strip()
                if fragment:
                    self.text.append(fragment)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=(), rx="[^a-zA-Z ]+", min_word_len=2):
        """Normalise ``self.text`` into a list of words appended to ``self.cleaned_text``.

        Args:
            max_words: Stop once ``cleaned_text`` holds this many words.
            enchant_dict: Enchant dictionary tag used to spell-check words;
                "" disables the dictionary check.
            ignore: Words to drop.
            rx: Regex whose matches are deleted from the lower-cased text.
            min_word_len: Shorter words are dropped.
        """
        ignored = set(ignore)
        checker = enchant.Dict(enchant_dict) if enchant_dict != "" else None
        lowered = ' '.join(self.text).lower()
        for word in re.sub(rx, '', lowered).split():
            if len(self.cleaned_text) >= max_words:
                break
            if word in ignored or len(word) < min_word_len:
                continue
            if checker is None or checker.check(word):
                self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=()):
        """Store the ``k`` most frequent words as ``(word, count)`` pairs.

        Counts come from ``cleaned_text`` when it has content and fall back to
        the raw ``text`` otherwise. The original compared the list with ""
        (never equal), so that fallback could not trigger.
        """
        source = self.cleaned_text if self.cleaned_text else self.text
        ignored = set(ignore)
        words = ' '.join(source).lower().split()
        self.most_common_words = Counter(w for w in words if w not in ignored).most_common(k)

    def save_text(self, path, fname):
        """Pickle ``self.text`` to ``path+fname`` (plain string concatenation)."""
        with open(path+fname, 'wb') as file:
            pickle.dump(self.text, file)

    def load_text(self, path, fname):
        """Replace ``self.text`` with the object pickled at ``path+fname``."""
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written locally by save_text.
        with open(path+fname, 'rb') as file:
            self.text = pickle.load(file)

    def save_links(self, path, fname):
        """Pickle ``self.links`` to ``path+fname`` (plain string concatenation)."""
        with open(path+fname, 'wb') as file:
            pickle.dump(self.links, file)

    def load_links(self, path, fname):
        """Replace ``self.links`` with the object pickled at ``path+fname``."""
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written locally by save_links.
        with open(path+fname, 'rb') as file:
            self.links = pickle.load(file)

In [None]:
# Download a plain-text list of common English words (one per line); slices of
# it are used below as stop-word lists.
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_page_html(min_size=1000)
# Lower-case the page text, then split on newlines to get one entry per word.
english_words = common_english.html.getText().lower().split('\n')
print(len(english_words),"most common English words")

1000 most common English words


In [None]:
# Seed sites whose pages are labelled "pseudoscience" in the data set.
pseudo_sources = [
    "http://www.ageofautism.com/",
    "http://www.naturalnews.com",
    "https://foodbabe.com/starthere/",
    "http://www.chopra.com",
    "https://www.mercola.com/",
    "https://www.history.com/",
    "https://doctoroz.com/",
    "https://www.disclose.tv/",
    "https://nationalreport.net/",
    "https://heartland.org/",
    "https://www.dailymail.co.uk/",
    "https://www.motherjones.com/",
]

# Seed sites whose pages are labelled "science" in the data set.
science_sources = [
    "https://sciencebasedmedicine.org/",
    "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
    "https://www.bbc.com/news/science_and_environment",
    "https://www.nature.com/",
    "https://www.science.org/",
    "https://www.snopes.com/top/",
    "https://quackwatch.org/",
    "https://www.skepdic.com/",
    "http://scibabe.com/",
    "http://pandasthumb.org/",
    "https://skepticalscience.com/",
    "https://www.cdc.gov/",
    "https://apnews.com/",
]

In [None]:
# Pick one science source (index 7) for the demo cells and make sure the
# local cache folder exists.
url = science_sources[7]
path = os.getcwd()+'/data/'
if not os.path.isdir(path):
    os.mkdir(path)

In [None]:
# Demo: scrape one page end-to-end, then cache its text and links under `path`.
test_page = Webpage(url)
test_page.get_page_html()  # download and parse the page
test_page.get_html_text()  # collect text from the default tag list
test_page.get_html_anchors()  # collect hrefs containing the default keyword
# Keep at most 500 cleaned words, ignoring the first 50 entries of english_words.
test_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
# Cache files are named after the URL hash.
test_page.save_text(path, test_page.hash+'.text')
test_page.save_links(path, test_page.hash+'.links')

In [None]:
# Demo of the cache-first workflow: load text/links from disk when the cache
# files exist, otherwise scrape the page and write them.
new_page = Webpage(url)
fname_text = new_page.hash+'.text'
fname_links = new_page.hash+'.links'
if os.path.isfile(path+fname_text): 
    new_page.load_text(path, fname_text)
    print("Loading Text")
else:
    new_page.get_page_html()
    new_page.get_html_text(tags=["p","h1","h2","h3","span"])
    new_page.save_text(path, fname_text)

if os.path.isfile(path+fname_links): 
    new_page.load_links(path, fname_links)
    print("Loading Links")
else:
    # Reuse the page fetched for the text step, if any, rather than
    # downloading it a second time.
    if not new_page.requested: new_page.get_page_html()
    new_page.get_html_anchors()
    new_page.save_links(path, fname_links)
new_page.clean_html_text(500, ignore=english_words[:50], rx="[^a-zA-Z ]+")
new_page.k_common_words(k=5,ignore=english_words[:50])
' '.join(new_page.cleaned_text)

Loading Text
Loading Links


'skeptics dictionary features definitions arguments essays hundreds strange beliefs amusing deceptions dangerous delusions also features dozens entries logical fallacies cognitive biases perception science philosophy also posted over years reader comments date status entry reader comments natural cancer cures revision argument ignorance reader comments reader comments psychokinesis reader comments sample skeptics dictionary grotto river gave near peasant named claimed virgin identifying herself immaculate conception appeared her think such great number provided opportunity channel short theological treatise significance seems however main message alleged mother god pray do penance conversion world take drink spring nutshell gods beings unnatural powers who never die believed controllers creators various parts nature many thought require worship obedience humans these gods reward punish us depending whether please them stories gods been told most societies know going back least years go

In [None]:
#| export
def get_page_all(url, k, max_words, ignore_text, ignore_common, path = None):
    """Build a processed ``Webpage`` for ``url``, using an on-disk cache when ``path`` is given.

    Args:
        url: Page address.
        k: Number of most-common words to keep.
        max_words: Upper bound on cleaned words kept.
        ignore_text: Words dropped while cleaning the text.
        ignore_common: Words excluded from the most-common count.
        path: Cache directory prefix (concatenated directly with the file
            name), or None to always fetch and never cache.

    Returns:
        The ``Webpage`` with ``text``, ``links``, ``cleaned_text`` and
        ``most_common_words`` populated.
    """
    text_tags = ["p","h1","h2","h3","span"]
    page = Webpage(url)

    if path is None:
        # No cache directory: always scrape.
        page.get_page_html()
        page.get_html_text(tags=text_tags)
        page.get_html_anchors()
    else:
        text_file = page.hash+'.text'
        links_file = page.hash+'.links'

        if not os.path.isfile(path+text_file):
            page.get_page_html()
            page.get_html_text(tags=text_tags)
            page.save_text(path, text_file)
        else:
            page.load_text(path, text_file)

        if not os.path.isfile(path+links_file):
            # Download only if the text step above did not already fetch the page.
            if page.html == "": page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, links_file)
        else:
            page.load_links(path, links_file)

    if page.text is None:
        return page
    page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
    page.k_common_words(k=k, ignore=ignore_common)
    return page

def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_text=[], ignore_common=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    """Scrape ``url`` and the pages it links to, recording results in ``dict``.

    Each accepted page is stored as ``dict[page_url] = [cleaned_text,
    most_common_words]``. A progress line is rewritten in place on stdout for
    every link visited.

    Args:
        url: Primary page to scrape.
        dict: Mapping updated in place with the results.
        k: Number of most-common words kept per page.
        min_words: Linked pages with fewer cleaned words are rejected.
        max_words: Cap on cleaned words kept per page.
        ignore_text: Words dropped while cleaning page text.
        ignore_common: Words excluded from the most-common count.
        ignore_filenames: Links containing any of these substrings are not fetched.
        max_links: Maximum number of links to follow; "" or None means all.
        path: Cache directory passed through to ``get_page_all``.
    """
    primary_page = get_page_all(url, k, max_words, ignore_text, ignore_common, path)
    # Test for emptiness, not identity: `x is not []` is always True, which
    # made this skip branch unreachable and recorded empty pages.
    if not primary_page.cleaned_text:
        print(url,"returned None, Skipping...")
        return

    dict[url] = [primary_page.cleaned_text, primary_page.most_common_words]
    total_links = len(primary_page.links)
    if max_links == "" or max_links is None or max_links > total_links: max_links = total_links

    for count, link in enumerate(primary_page.links[:max_links]):
        if all(x not in link for x in ignore_filenames):
            try:
                page = get_page_all(link, k, max_words, ignore_text, ignore_common, path)
            except Exception:
                # Best-effort crawl: a link that cannot be fetched or parsed is
                # rejected. `Exception` (not a bare except) lets
                # KeyboardInterrupt stop a long crawl.
                page = None
            if page is not None and page.cleaned_text and len(page.cleaned_text) >= min_words:
                entry = [page.cleaned_text, page.most_common_words]
                # Skip pages whose content duplicates one already recorded.
                if entry not in dict.values():
                    dict[link] = entry
        if link in dict:
            res = str(len(dict[link][0]))+" words | "+str(dict[link][1][:3])
        else:
            res = "Rejected"
        # Trailing spaces overwrite leftovers from a longer previous line.
        progress_message = "%s link %4d/%4d | %s = %s %s" % (url, count, len(primary_page.links), link, res, 200*' ')
        sys.stdout.write("\r" + progress_message)
        sys.stdout.flush()

In [None]:
# Crawl settings, passed to get_all_links / get_page_all below.
k = 30           # most-common words kept per page
min_words = 50   # linked pages with fewer cleaned words are discarded
max_words = 450  # cap on cleaned words kept per page
max_links = 100  # at most this many links are followed per source

# Words removed from page text during cleaning.
ignore_text = [
    'the', 'of', 'to', 'and', 'a',
    'in', 'it', 'that', 'for', 'on',
]
# The first 50 entries of english_words are excluded from common-word counts.
ignore_common = english_words[:50]
# Links containing any of these substrings are not fetched.
ignore_filenames = [
    ".mp3", ".jpg", ".png", ".mp4", ".jfif",
    "facebook.com", "twitter.com",
]

In [None]:
#| hide
#d_pse = {}
#get_all_links(pseudo_sources[2], d_pse, k, min_text_len, ignore_words, ignore_filenames)
#d_pse

In [None]:
# Crawl every seed site, keeping pseudoscience and science results (and their
# on-disk caches) separate.
d_pse = {}
d_sci = {}
path = os.getcwd()+'/data/'
path_pse = path+'pseudoscience/'
path_sci = path+'science/'
# Create the parent folder first, then one cache folder per label.
for directory in (path, path_pse, path_sci):
    if not os.path.isdir(directory):
        os.mkdir(directory)

for sources, results, cache_dir in ((pseudo_sources, d_pse, path_pse),
                                    (science_sources, d_sci, path_sci)):
    for source in sources:
        get_all_links(source, results, k, min_words, max_words, ignore_text, ignore_common,
                      ignore_filenames, max_links, cache_dir)


https://apnews.com/ link   20/  21 | https://www.ap.org/careers/ = 156 words | [('our', 11), ('us', 3), ('career', 3)]                                                                                                                                                                                                                                                                                                                          ion.png = Rejected                                                                                                                                                                                                         

In [None]:
# Add up the per-page (word, count) lists into one Counter per label.
count_pse = Counter()
for entry in d_pse.values():
    count_pse += Counter(dict(entry[1]))

count_sci = Counter()
for entry in d_sci.values():
    count_sci += Counter(dict(entry[1]))

print("#### Pseudoscience",k,"Most Common Words ####\n",count_pse.most_common(k),"\n\n")
print("#### Science",k,"Most Common Words ####\n",count_sci.most_common(k),"\n\n")

#### Pseudoscience 30 Most Common Words ####
 [('our', 1139), ('health', 711), ('more', 625), ('food', 620), ('has', 578), ('their', 550), ('about', 502), ('us', 465), ('my', 411), ('will', 409), ('mother', 400), ('her', 399), ('its', 379), ('who', 340), ('policy', 334), ('information', 289), ('news', 287), ('email', 283), ('tweet', 281), ('new', 276), ('share', 267), ('after', 261), ('if', 254), ('been', 253), ('people', 235), ('subscribe', 233), ('heartland', 226), ('twitter', 222), ('no', 216), ('pin', 216)] 


#### Science 30 Most Common Words ####
 [('medicine', 906), ('more', 792), ('our', 783), ('about', 760), ('science', 725), ('health', 618), ('menu', 483), ('us', 429), ('research', 425), ('climate', 418), ('information', 395), ('care', 373), ('news', 357), ('access', 349), ('new', 336), ('its', 332), ('has', 324), ('johns', 324), ('will', 320), ('nature', 310), ('these', 293), ('used', 290), ('their', 286), ('cookies', 274), ('if', 259), ('may', 258), ('storage', 228), ('tech

In [None]:
# Sanity check: list any URL that was scraped under both labels.
shared_links = [link for link in d_pse if link in d_sci]
for link in shared_links:
    print(link)

## Data Preparation

In [None]:
# Merge both crawls into one labelled table: one row per URL with its text
# (truncated to max_words), its common words, and its label. Pseudoscience is
# processed first, so a URL present in both crawls keeps that label.
d_all = {}
for label, scraped in (('pseudoscience', d_pse), ('science', d_sci)):
    for link, entry in scraped.items():
        if link in d_all:
            continue
        text = entry[0][:max_words]
        common_words = ' '.join([pair[0] for pair in entry[1]])
        d_all[link] = [' '.join(text), common_words, label]

df = pd.DataFrame.from_dict(d_all, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
http://www.ageofautism.com/,during spring kvetched twitter had slapped sensitive content label all our tweets included link or photo was before musk stepped see imagine if twitter became open again hope twitter exposes all censorship so many us have faced especially crackdown mean want see overtly controversial despicable content allowed but who defines pornography right know when see from defender twitter ditches misinformation policy plans reveal internal files free speech suppression effective twitter is no longer enforcing misleading information policy company said adding will soon reveal internal files free spee...,twitter content policy information so who misinformation which accounts analysis see enforcing misleading twitters about post deaths age autism our tweets included photo musk many us crackdown reveal internal files,pseudoscience
https://www.ageofautism.com/donate.html,hello your donation autism age is tax now use secure online donations scroll down their easy use form you can always send us paper or electronic check as well email me at any time with ideas suggestions or gentle critiques our is thank you ed cause unknown epidemic sudden deaths health defense transcend fear blueprint mindful leadership public health jr real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate donate top,health donate defense public hello donation autism age tax now secure online donations scroll down their easy form always send us paper electronic check well email me any time ideas,pseudoscience
https://www.ageofautism.com/contact-us.html,autism age box ct ed cause unknown epidemic sudden deaths health defense transcend fear blueprint mindful leadership public health jr real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate contact us top,health defense public donate autism age box ct ed cause unknown epidemic sudden deaths transcend fear blueprint mindful leadership jr real bill gates big global war democracy click cover buy,pseudoscience
https://www.ageofautism.com/exclusives.html,editorials from series by here come you had me at an elaborate fraud series deer special report what do epidemiological studies really tell us note from there are epidemiological studies here vaccines autism these studies represent most often cited papers by scientists public health officials members media when trying refute any evidence an association between vaccinations autism there are serious methodological limitations design flaws conflicts interest or other problems related each these studies these flaws have been pointed out by government officials other researchers medical review ...,autism studies health these series here public epidemiological vaccines officials limitations flaws study defense donate editorials come me elaborate fraud deer special report do really tell us note represent most,pseudoscience
https://www.ageofautism.com/science/,tom urged get their bivalent vaccine booster yesterday twitter kindly let him his followers know about week class starting this week through countermeasures injury compensation program this is program absolves corporations whose products harm during pandemic you can take course both live recorded version national vaccine injury compensation program countermeasures injury compensation program used emergency authorized this course students will learn structure function defects programs us created by congress award compensation adults children potentially actually harmed or killed by vaccines...,compensation injury program vaccine course high disease consequence no countermeasures us four their know week will john march health before been infectious has get class through products both live national,pseudoscience


In [None]:
# Build the text DataLoaders from the scraped table. A fixed seed makes the
# random train/validation split, and therefore the reported metrics,
# reproducible across runs.
dls = TextDataLoaders.from_df(df, bs=16, text_col='text', label_col='label', seed=42)
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos this exchange consists seven posts each with comment by reply by plus one other post by is translation mostly by google translate from original posts comments have kindly corrected translation you they are welcome comment at xxunk thumb keep mind posts these xxunk comments were written two years ago thus can not take more recent comments here into account we hope publish sections seven consecutive we chose day week without remembering its connection ancient days readers these authors are invited contribute further comments our comment system this weeks post starts with an introduction by followed by first his posts at comment by followed by xxunk by by two years ago professors informatics respectively published an article titled using statistical methods model molecular machines systems journal theoretical biology regular scientific intelligent design i d community publication was seen as breakthrough i d close reading paper however is not convincing seven,science
1,xxbos walker speaks during campaign rally john not too long ago walker said he had experienced racism having been stopped harassed by police because he was black man he expressed his fear police will abuse his son because he s black yet during his campaign senate walker republican now runoff against democratic xxunk has repeatedly declared racism does not exist united states this is all part confusing series remarks about racism walker has made recent years year before he would announce his senate bid walker appeared podcast jr black conservative discussed assorted matters relating race referring his son walker noted he was afraid if police stop him because he s black moreover walker xxunk his own troubling xxunk with law enforcement officers been stopped by police been harassed by police before discussing one incident when he was pulled over by cops he said why world are you gon na stop,pseudoscience
2,xxbos this is an additional post by made at site after seven posts his breakthrough intelligent design series were made he has requested we post here after second post series explain his position post what can be considered legitimate science central point discussion have with posts entitled breakthrough design concerns limits what can be considered legitimate science since his comments this matter are too long comments section have made post subject central question is how formulate hypothesis is suggested explanation hypotheses are not formulated an xxunk way you just grab something air hypothesis should be based prior knowledge should also be falsifiable must be possible show is incorrect scientific work then consists hypothesis whether proves be true or false something new has been learned thing here is hypothesis is formulated such way can be tested with result if you can not do you can not learn anything new if test,science


In [None]:
torch.cuda.empty_cache()

In [None]:
# Text classifier built on the AWD_LSTM architecture, reporting accuracy as its metric.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
# NOTE(review): the training log shows one initial epoch followed by four more;
# presumably fastai's default frozen epoch plus the 4 requested here. Confirm.
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.537045,0.454569,0.8,00:18


epoch,train_loss,valid_loss,accuracy,time
0,0.408883,0.262582,0.87619,00:32
1,0.34462,0.29947,0.87619,00:33
2,0.230716,0.226557,0.919048,00:33
3,0.143907,0.223225,0.938095,00:33


In [None]:
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.4 GB | Cached: 2.2 GB


In [None]:
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos by weekends my husband have simple routine whoever xxunk up first gets ready day means taking him bathroom getting him changed giving him his while one us tends other gets bit break ease into day its sweet relationship my husband have father xxunk son son looks up father they need me intervene or xxunk way their weekend routine one recent morning though when got up later than my xxunk heard take his xxunk eggs xxunk cup coffee its breakfast make almost every day gets same sans coffee been awake about minutes by time i d gotten our meals ready last morning assuming my husband had already gotten his morning hovered over my chair just about sit down eat gotten good night sleep night before was less than xxunk morning so xxunk out assume he s just waiting his meal right my husband apologized said no he still needs take,pseudoscience,pseudoscience
1,xxbos heartland institute submitted public comments proposal repeal clean power plan environmental protect agency issued an advanced notice proposed titled repeal carbon dioxide emission guidelines existing xxunk sources electric utility generating units clean power plan heartland institute senior fellow peter research fellow submitted comprehensive xxunk documented more than figures more than xxunk public comment support repeal they note comment addresses following topics about heartland institute clean power plan is based an xxunk interpretation section clean air consequently there is no legal authority must be costs have already vastly exceeded even expected benefits ii fossil fuels are essential prosperity worldwide hundreds years since industrial revolution fossil fuel use is has been associated with higher economic growth xxunk wages health life xxunk population reduced even after decades government xxunk xxunk alternative energy sources such as solar wind play only niche role us energy official us government projections show fossil fuels will be,pseudoscience,pseudoscience
2,xxbos ca nt get enough my writing want find out what else been up around internet well here talks appearances guest appearances detection fall food babe interview with thinking atheist when good science sounds agenda agenda avoiding food myths bad agenda ten questions food agenda things xxunk us reason rally experience dumb food myths debunked by this should vaccines be freedom report fake news survival guide hosted by atheist community business being atheists national convention how live life atheists national convention everything is killing you marketing guide bad science skeptical runs skeptical eye over alternative national science week people need hear from agriculture food babe vs vs princess brings scientific xxunk freedom report vaccines skeptical libertarian xxunk read between headlines xxunk sponsored by communicating about food myths social media age animal agricultural alliance food glorious with logic global warming evolution vaccines more beer xxunk national science week promo with xxunk,science,science
3,xxbos please read these terms conditions carefully before using website terms conditions contain important information about your rights obligations as well as limitations exclusions by using this website fullest extent permitted by law you confirm your xxunk agreement acceptance these terms conditions if you xxunk not accept these terms conditions do not use this website all content this website is subject change at any time without notice attempts ensure information this website is complete accurate current despite our efforts information this website may occasionally be inaccurate incomplete or out fullest extent permitted by law makes no representation as how complete accurate or current any information is this website may make changes its website design functionality content at any time may provide links other sites are not maintained by does not endorse those sites is not responsible content such other sites please read our policy governing such third party content may,pseudoscience,science
4,xxbos by weekends my husband have simple routine whoever xxunk up first gets ready day means taking him bathroom getting him changed giving him his while one us tends other gets bit break ease into day its sweet relationship my husband have father xxunk son son looks up father they need me intervene or xxunk way their weekend routine one recent morning though when got up later than my xxunk heard take his xxunk eggs xxunk cup coffee its breakfast make almost every day gets same sans coffee been awake about minutes by time i d gotten our meals ready last morning assuming my husband had already gotten his morning hovered over my chair just about sit down eat gotten good night sleep night before was less than xxunk morning so xxunk out assume he s just waiting his meal right my husband apologized said no he still needs take,pseudoscience,pseudoscience
5,xxbos stock ready ship verified customer team member hi ever since we launched our small company have wanted make snack bar would be happy eat you see always been type person reward myself at end long day with little treat was ritual i d grab my favorite cup tea treat then i d sit xxunk enjoy was little me moment would help me unwind from day problem sometimes would eat piece chocolate other times was another snack would feel good moment horrible later needed something better why set out create something better wanted snack bar snack bar could xxunk eat at end long day feel good about only had one question what makes great snack bar had rules ever eat snack bar seemingly xxunk all xxunk out your mouth as if you spent an eternity desert or worse bar so sticky you feel like you need wash your hands immediately,pseudoscience,pseudoscience
6,xxbos prince attended game against heat at garden night they were pictured sitting xxunk cheering team just one hour after princes prize launch was xxunk into palace race row reverend was one several speakers at launch where she made speech included comments impact racism climate change her comments came amid controversy sparked by own godmother lady who was accused making racist comments domestic abuse campaigner night however put brave face game during which they sat owner his disgraced former fumed over media speculation about his his fellow executives lifestyle an interview xxunk medias metric is clicks during phone call with an xxunk from call he could be heard saying how he was tired speculation about his his fellow executives love lives earlier this month was revealed one his companies wrote about being according reports his xxunk roommates at headquarters were all dating each other officials are scratching their heads after,pseudoscience,pseudoscience
7,xxbos by associated press est updated est jury has convicted man murder pike county massacre more than six years after eight members another family were slaughtered their sleep iv was found guilty all counts he faced southern pike county including eight counts aggravated murder shootings seven adults teenager from family sat motionless as verdicts were read closing his eyes or looking down jurors delivered verdict after weighing denials other testimony against word witnesses including his brother mother who previously pleaded guilty their roles prosecutors said family at time massacre became obsessed with getting custody over then daughter with after they feared child would be molested by new boyfriend denied any knowledge his involvement killings testified he have let happen if he had known plans fatal shootings at three mobile homes xxunk near terrified residents launched one states most extensive criminal investigations prosecutors say slayings which initially spurred speculation about drug,pseudoscience,pseudoscience
8,xxbos xxunk jr xxunk onto stage at southern church radiating confidence xxunk standing xxunk crowd with his xxunk blue xxunk eyes then he launched into an rant democrats drank he told people assembled far right conference branded as standing health freedom is criminal medical xxunk give child one these vaccines xxunk according video event one his many assertions ignored or went against legal scientific public health consensus then xxunk his book if just attendees amazon night he told crowd would land bestseller list they could stick amazon all profits he said would go his charity health defense while many nonprofits businesses have struggled during pandemic group has xxunk an investigation by associated press finds health defense has xxunk funding followers as used his star power as member one most famous families open doors raise money lend his group credibility filings with charity regulators show revenue more than doubled million since,science,pseudoscience


In [None]:
# URLs used to spot-check the trained classifier, mapped to their expected label.
# NOTE: several of these also appear in pseudo_sources / science_sources, so
# results on them are not a clean held-out test.
test_sources = {
    'https://infowarslife.com/': 'pseudoscience',
    'https://www.dailymail.co.uk/': 'pseudoscience',
    'https://apnews.com/': 'science',
    'https://www.si.edu/explore/science': 'science',
    'https://www.foxnews.com/opinion': 'pseudoscience',
    'https://www.disclose.tv/': 'pseudoscience',
    'https://www.snopes.com/top/': 'science',
    'https://www.theskepticsguide.org/about': 'science',
    'https://www.cdc.gov/': 'science',
    'https://www.motherjones.com/': 'pseudoscience',
    'https://www.huffpost.com/': 'pseudoscience',
    'https://arstechnica.com/': 'science',
    'https://nationalreport.net/': 'pseudoscience',
    'https://newspunch.com/': 'pseudoscience',
}

In [None]:
# Scrape each test URL (no cache), classify its cleaned text, and tabulate the
# expected label, the predicted label, and the predicted class's probability.
d_pred = {}

for source in test_sources:
    page = get_page_all(source, k, max_words, ignore_text, ignore_common)
    length = len(page.cleaned_text)
    if length < min_words:
        # Too little text for a meaningful prediction; report and move on.
        print("ERROR:",source,length,"words")
        continue
    text = ' '.join(page.cleaned_text)
    with learn.no_bar(), learn.no_logging():
        prediction = learn.predict(text)
    # predict returns (label, class index, per-class probabilities). Indexing
    # by the returned class index avoids hard-coding the vocab order.
    p = prediction[2][prediction[1]].item()
    d_pred[source] = [test_sources[source], prediction[0], p]

# Use a separate name so the training frame `df` is not overwritten.
df_pred = pd.DataFrame.from_dict(d_pred, orient='index', columns=['actual', 'prediction', 'probability'])
df_pred

Unnamed: 0,actual,prediction,probability
https://infowarslife.com/,pseudoscience,pseudoscience,0.814715
https://www.dailymail.co.uk/,pseudoscience,pseudoscience,0.987088
https://apnews.com/,science,science,0.870788
https://www.si.edu/explore/science,science,science,0.959251
https://www.foxnews.com/opinion,pseudoscience,pseudoscience,0.996972
https://www.disclose.tv/,pseudoscience,pseudoscience,0.642757
https://www.snopes.com/top/,science,science,0.991956
https://www.theskepticsguide.org/about,science,science,0.991945
https://www.cdc.gov/,science,science,0.995555
https://www.motherjones.com/,pseudoscience,pseudoscience,0.992036


In [None]:
#learn.save('2022.11.30 Model v3')

In [None]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)