# data

> Web scraping and tools for data collection and processing

In [1]:
#| default_exp data

In [2]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

In [3]:
## Utility Function to Check GPU Status
def check_gpu():
    """Print CUDA availability, then one status line per visible CUDA device.

    Each device line shows the device index, its name, and the allocated and
    reserved ("Cached") memory in GB, rounded to one decimal place.
    """
    bytes_per_gb = 1024**3
    print("CUDA Available: ", torch.cuda.is_available())
    # range() over a zero device count is simply empty, so no explicit guard is needed.
    for idx in range(torch.cuda.device_count()):
        name = torch.cuda.get_device_name(idx)
        allocated_gb = round(torch.cuda.memory_allocated(idx)/bytes_per_gb, 1)
        reserved_gb = round(torch.cuda.memory_reserved(idx)/bytes_per_gb, 1)
        print("Device", idx, "|", name,
              "| Allocated:", allocated_gb, "GB",
              "| Cached:", reserved_gb, "GB")

# Ask PyTorch to empty its CUDA memory cache so the status printed by the next cell starts from a clean slate.
torch.cuda.empty_cache()

In [4]:
# Baseline GPU status, printed before any page is scraped or any model is built.
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.0 GB | Cached: 0.0 GB


## Web Scraper

In [5]:
#| export
class Webpage:
    """Fetch one web page and extract its links and a cleaned word list.

    Attributes:
        url: Address of the page.
        hash: SHA3-256 hex digest of ``url`` (used by callers as a cache file stem).
        requested: True once ``get_page_html`` has run.
        page_text: Response body as text; "" before a fetch or when the fetch was rejected.
        html: Parsed ``BeautifulSoup`` document; "" until ``get_page_html`` runs.
        links: hrefs collected by ``get_html_anchors``.
        text: Text fragments collected by ``get_html_text``.
        cleaned_text: Words kept by ``clean_html_text``.
        most_common_words: ``(word, count)`` pairs produced by ``k_common_words``.
    """

    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_page(self, headers, min_size, max_size, timeout=30):
        """Download ``self.url`` and return its text, or None if its size is out of bounds.

        Args:
            headers: HTTP headers sent with the request.
            min_size: Minimum length (characters) of the decoded text.
            max_size: Maximum size (bytes) of the body, checked against the
                Content-Length header and against the bytes actually received.
            timeout: Seconds passed to ``requests.get`` so a stalled server
                cannot hang the crawl forever.

        Returns:
            The decoded body, or None when the page is larger than ``max_size``
            or shorter than ``min_size``.

        Raises:
            requests.RequestException: On network errors or timeouts.
        """
        r = requests.get(self.url, stream=True, headers=headers, timeout=timeout)
        try:
            try:
                content_length = int(r.headers.get('Content-Length', 0))
            except (TypeError, ValueError):
                # A malformed header must not abort the download; the streamed
                # byte count below still enforces max_size.
                content_length = 0
            if content_length > max_size:
                return None

            data = []
            length = 0
            for chunk in r.iter_content(1024):
                data.append(chunk)
                length += len(chunk)
                if length > max_size:
                    return None
            # Hand the streamed bytes back to the response so r.text can apply
            # its usual encoding detection.
            r._content = b''.join(data)
            text = r.text
        finally:
            # stream=True keeps the connection open until it is closed explicitly.
            r.close()
        if len(text) < min_size:
            return None
        return text

    def get_page_html(self, min_size=1000, max_size=2000000, timeout=30):
        """Fetch the page with a randomly chosen User-Agent and parse it.

        Sets ``page_text``, ``html`` and ``requested``. When ``get_page``
        rejects the page (returns None), ``page_text`` becomes "" and ``html``
        an empty document, so the extraction methods simply find nothing
        instead of crashing on None.
        """
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
        ]
        headers = {'User-Agent': random.choice(user_agents)}
        page_text = self.get_page(headers, min_size, max_size, timeout)
        self.page_text = "" if page_text is None else page_text
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        """Return the SHA3-256 hex digest of ``self.url + inp``."""
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        """Append to ``self.links`` every non-empty <a href> that contains ``keyword``."""
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if link and keyword in link:
                self.links.append(link)

    def get_html_text(self, tags=("p",)):
        """Append the stripped, non-empty text of every element in ``tags`` to ``self.text``."""
        for tag in tags:
            for node in self.html.find_all(tag):
                node_text = node.get_text().strip()
                if node_text:
                    self.text.append(node_text)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=(), min_word_len=2):
        """Lower-case ``self.text``, strip punctuation and collect words into ``self.cleaned_text``.

        Args:
            max_words: Stop once ``self.cleaned_text`` holds this many words.
            enchant_dict: Enchant dictionary tag used to keep only known words;
                "" disables the spell check.
            ignore: Words to drop.
            min_word_len: Words must be strictly longer than this to be kept.
        """
        ignored = set(ignore or ())
        checker = enchant.Dict(enchant_dict) if enchant_dict != "" else None
        all_text = ' '.join(self.text).lower()
        # Keep every kind of whitespace (not only ' ') so that words separated
        # by newlines or tabs are not glued together before splitting.
        words = re.sub(r"[^a-zA-Z0-9\s]+", '', all_text).split()
        for word in words:
            if len(self.cleaned_text) >= max_words:
                break
            if word in ignored or len(word) <= min_word_len:
                continue
            if checker is None or checker.check(word):
                self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=("the","to","of","and","a","in","on","is","for","by")):
        """Store the ``k`` most frequent words as ``(word, count)`` pairs in ``self.most_common_words``.

        Counts ``self.cleaned_text`` when it is non-empty and falls back to the
        raw ``self.text`` fragments otherwise; words in ``ignore`` are skipped.
        """
        # cleaned_text is a list, so emptiness (not equality with "") decides the fallback.
        source = self.cleaned_text if self.cleaned_text else self.text
        ignored = set(ignore or ())
        words = ' '.join(source).lower().split()
        counts = Counter(word for word in words if word not in ignored)
        self.most_common_words = counts.most_common(k)

    def save_text(self, path, fname):
        """Pickle ``self.cleaned_text`` to ``path + fname``."""
        with open(path+fname, 'wb') as file:
            pickle.dump(self.cleaned_text, file)

    def load_text(self, path, fname):
        """Load ``self.cleaned_text`` from the pickle at ``path + fname``."""
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by save_text.
        with open(path+fname, 'rb') as file:
            self.cleaned_text = pickle.load(file)

    def save_links(self, path, fname):
        """Pickle ``self.links`` to ``path + fname``."""
        with open(path+fname, 'wb') as file:
            pickle.dump(self.links, file)

    def load_links(self, path, fname):
        """Load ``self.links`` from the pickle at ``path + fname``."""
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by save_links.
        with open(path+fname, 'rb') as file:
            self.links = pickle.load(file)

In [6]:
# Download a plain-text word list (one word per line); slices of it are used
# further down as stop-word lists.
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_page_html(min_size=1000)
english_words = common_english.html.getText().lower().split('\n')
print(len(english_words),"most common English words")

1000 most common English words


In [7]:
# Seed sites whose pages (and linked pages) are labelled 'pseudoscience'.
pseudo_sources = [
    "http://www.ageofautism.com/",
    "http://www.naturalnews.com",
    "https://foodbabe.com/starthere/",
    "http://www.chopra.com",
    "https://www.mercola.com/",
    "https://www.history.com/",
    "https://doctoroz.com/",
    "https://www.disclose.tv/",
    "https://christiananswers.net/",
    "https://heartland.org/",
]

# Seed sites whose pages (and linked pages) are labelled 'science'.
science_sources = [
    "https://sciencebasedmedicine.org/",
    "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
    "https://www.bbc.com/news/science_and_environment",
    "https://www.nature.com/",
    "https://www.science.org/",
    "https://www.snopes.com/top/",
    "https://quackwatch.org/",
    "https://www.skepdic.com/",
    "http://scibabe.com/",
    "http://pandasthumb.org/",
    "https://skepticalscience.com/",
]

In [8]:
# Pick one science source as a single-page smoke test and make sure the cache
# directory exists.
url = science_sources[7]
path = os.getcwd()+'/data/'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(path, exist_ok=True)

In [9]:
# Smoke test of Webpage on a single URL: fetch, extract text and links, clean, cache.
test_page = Webpage(url)
test_page.get_page_html()
# Default arguments: text from <p> tags only, links containing "http".
test_page.get_html_text()
test_page.get_html_anchors()
# Keep at most 500 words; no ignore list is passed here.
test_page.clean_html_text(500)
# Cache files are named after the URL hash so the next cell can find them again.
test_page.save_text(path, test_page.hash+'.text')
test_page.save_links(path, test_page.hash+'.links')

In [10]:
# Same URL as the previous cell, but prefer the on-disk cache over a new request.
new_page = Webpage(url)
fname_text = new_page.hash+'.text'
fname_links = new_page.hash+'.links'

# NOTE: the cache file names depend only on the URL, so text cached by an
# earlier cell is loaded as-is; the tags/ignore options below only apply when
# the page is actually re-scraped.
if os.path.isfile(path+fname_text):
    new_page.load_text(path, fname_text)
    print("Loading Text")
else:
    new_page.get_page_html()
    new_page.get_html_text(tags=["p","h1","h2","h3","span"])
    new_page.clean_html_text(500, ignore=english_words[:50])
    new_page.save_text(path, fname_text)

if os.path.isfile(path+fname_links):
    new_page.load_links(path, fname_links)
    print("Loading Links")
else:
    # Reuse the page fetched for the text branch instead of downloading it twice.
    if not new_page.requested:
        new_page.get_page_html()
    new_page.get_html_anchors()
    new_page.save_links(path, fname_links)

new_page.k_common_words(k=5,ignore=english_words[:50])
new_page.cleaned_text

Loading Text
Loading Links


['the',
 'skeptics',
 'dictionary',
 'features',
 'definitions',
 'arguments',
 'and',
 'essays',
 'hundreds',
 'strange',
 'beliefs',
 'amusing',
 'deceptions',
 'and',
 'dangerous',
 'delusions',
 'also',
 'features',
 'dozens',
 'entries',
 'logical',
 'fallacies',
 'cognitive',
 'biases',
 'perception',
 'science',
 'and',
 'philosophy',
 'also',
 'posted',
 'are',
 'over',
 'years',
 'reader',
 'comments',
 'date',
 'status',
 'entry',
 'reader',
 'comments',
 'natural',
 'cancer',
 'cures',
 'revision',
 'argument',
 'ignorance',
 'reader',
 'comments',
 'reader',
 'comments',
 'psychokinesis',
 'reader',
 'comments',
 'sample',
 'the',
 'skeptics',
 'dictionary',
 '1858',
 'grotto',
 'the',
 'river',
 'gave',
 'near',
 'peasant',
 'named',
 'claimed',
 'that',
 'the',
 'virgin',
 'identifying',
 'herself',
 'the',
 'immaculate',
 'conception',
 'appeared',
 'her',
 'some',
 'think',
 'such',
 'great',
 'number',
 'have',
 'provided',
 'opportunity',
 'channel',
 'short',
 'theol

In [11]:
#| export
def get_page_all(url, k, max_words, ignore_words, path = None, clean_ignore = None):
    """Scrape (or load from cache) one page and compute its most common words.

    Args:
        url: Page to process.
        k: Number of most common words to keep.
        max_words: Maximum number of cleaned words to keep.
        ignore_words: Words excluded from the most-common-word counts.
        path: Cache directory prefix (joined by plain concatenation, so it is
            expected to end with a separator). When None nothing is cached.
        clean_ignore: Words dropped while cleaning the page text. When None,
            the first 50 entries of a module-level ``english_words`` list are
            used if such a list exists (the notebook's historical behaviour),
            otherwise nothing is dropped.

    Returns:
        The populated ``Webpage``.
    """
    if clean_ignore is None:
        # `english_words` is built by a non-exported notebook cell; look it up
        # defensively so the exported module does not raise NameError.
        clean_ignore = globals().get("english_words", [])[:50]

    tags = ["p","h1","h2","h3","span"]
    page = Webpage(url)

    if path is None:
        page.get_page_html()
        page.get_html_text(tags=tags)
        page.get_html_anchors()
        page.clean_html_text(max_words, ignore=clean_ignore)
    else:
        fname_text = page.hash+'.text'
        fname_links = page.hash+'.links'

        if os.path.isfile(path+fname_text):
            page.load_text(path, fname_text)
        else:
            page.get_page_html()
            page.get_html_text(tags=tags)
            page.clean_html_text(max_words, ignore=clean_ignore)
            page.save_text(path, fname_text)

        if os.path.isfile(path+fname_links):
            page.load_links(path, fname_links)
        else:
            # Only download if the text branch above did not already do so.
            if not page.requested:
                page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)

    if page.cleaned_text is not None:
        page.k_common_words(k=k, ignore=ignore_words)
    return page

def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_words=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    """Scrape ``url`` and the pages it links to, storing the results in ``dict``.

    Each stored value is ``[cleaned_text, most_common_words]`` keyed by URL.

    Args:
        url: Root page to scrape.
        dict: Mapping that receives the results (the parameter name shadows the
            built-in and is kept only for backward compatibility).
        k: Number of most common words to keep per page.
        min_words: Linked pages with fewer cleaned words are not stored.
        max_words: Maximum number of cleaned words kept per page.
        ignore_words: Words excluded from the most-common-word counts.
        ignore_filenames: Links containing any of these substrings are not followed.
        max_links: Maximum number of links to follow; "" or None means all of them.
        path: Cache directory prefix forwarded to ``get_page_all``.

    A root page without any usable text is reported and skipped entirely (its
    links are not followed). Failures on individual linked pages are ignored so
    that one bad link does not abort the crawl.
    """
    root = get_page_all(url, k, max_words, ignore_words, path)
    # Emptiness check: `is not []` compared identity with a fresh list and was always True.
    if not root.cleaned_text:
        print(url,"returned None, Skipping...")
        return

    dict[url] = [root.cleaned_text, root.most_common_words]
    print(url,"Contains",len(root.links),"Links")
    if max_links == "" or max_links is None or max_links > len(root.links):
        max_links = len(root.links)

    visited = set()
    for link in root.links[:max_links]:
        # Skip repeated hrefs (same result, avoidable request) and unwanted targets.
        if link in visited or any(x in link for x in ignore_filenames):
            continue
        visited.add(link)
        try:
            linked = get_page_all(link, k, max_words, ignore_words, path)
            if len(linked.cleaned_text) < min_words:
                continue
            dict[link] = [linked.cleaned_text, linked.most_common_words]
        except Exception:
            # Best-effort crawl: skip pages that fail to download or parse.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            continue

In [12]:
# Settings shared by the scraping and prediction cells below.
k = 30 # number of most common words kept per page
# Linked pages with fewer cleaned words than this are not stored by get_all_links.
min_words = 20
# Upper bound on the cleaned words kept per page.
max_words = 1000
# "" tells get_all_links to follow every link found on a source page.
max_links = ""
# Words excluded from the per-page most-common-word counts.
ignore_words = english_words[:100]
# Links containing any of these substrings are not followed.
ignore_filenames = [".mp3",".jpg",".png",".mp4",".jfif","facebook.com","twitter.com"]

In [13]:
#| hide
#d_pse = {}
#get_all_links(pseudo_sources[2], d_pse, k, min_words, max_words, ignore_words, ignore_filenames)
#d_pse

In [14]:
# Crawl every seed site; results are keyed by URL, one dictionary per label.
d_pse = {}
d_sci = {}

# One cache directory per label, both under ./data/.
path = os.getcwd()+'/data/'
path_pse = path+'pseudoscience/'
path_sci = path+'science/'
for cache_dir in (path, path_pse, path_sci):
    # exist_ok=True replaces the repeated isdir()/mkdir() checks.
    os.makedirs(cache_dir, exist_ok=True)

for source in pseudo_sources:
    get_all_links(source, d_pse, k, min_words, max_words, ignore_words, ignore_filenames, max_links, path_pse)
for source in science_sources:
    get_all_links(source, d_sci, k, min_words, max_words, ignore_words, ignore_filenames, max_links, path_sci)


http://www.ageofautism.com/ Contains 705 Links
http://www.naturalnews.com Contains 323 Links
https://foodbabe.com/starthere/ Contains 126 Links
http://www.chopra.com Contains 100 Links
https://www.mercola.com/ Contains 125 Links
https://www.history.com/ Contains 84 Links
https://doctoroz.com/ Contains 28 Links
https://www.disclose.tv/ Contains 139 Links
https://christiananswers.net/ Contains 9 Links
https://heartland.org/ Contains 142 Links
https://sciencebasedmedicine.org/ Contains 258 Links
https://www.hopkinsmedicine.org/gim/research/method/ebm.html Contains 103 Links
https://www.bbc.com/news/science_and_environment Contains 128 Links
https://www.nature.com/ Contains 73 Links
https://www.science.org/ Contains 29 Links
https://www.snopes.com/top/ Contains 35 Links
https://quackwatch.org/ Contains 136 Links
https://www.skepdic.com/ Contains 103 Links
http://scibabe.com/ Contains 118 Links
http://pandasthumb.org/ Contains 7 Links
https://skepticalscience.com/ Contains 131 Links


In [15]:
# Add up the per-page most-common-word counts into one Counter per label.
count_pse = Counter()
for _page_words, page_top_words in d_pse.values():
    count_pse += Counter(dict(page_top_words))

count_sci = Counter()
for _page_words, page_top_words in d_sci.values():
    count_sci += Counter(dict(page_top_words))

print("#### Pseudoscience",k,"Most Common Words ####\n",count_pse.most_common(k),"\n\n")
print("#### Science",k,"Most Common Words ####\n",count_sci.most_common(k),"\n\n")

#### Pseudoscience 30 Most Common Words ####
 [('our', 215), ('information', 185), ('food', 156), ('health', 145), ('twitter', 108), ('2022', 107), ('its', 90), ('donate', 86), ('any', 76), ('autism', 73), ('new', 73), ('read', 68), ('website', 54), ('content', 50), ('policy', 49), ('natural', 49), ('life', 49), ('here', 48), ('email', 47), ('views', 46), ('age', 45), ('free', 45), ('news', 42), ('senate', 42), ('personal', 40), ('vaccine', 38), ('support', 38), ('access', 38), ('heartland', 38), ('proton', 37)] 


#### Science 30 Most Common Words ####
 [('2022', 251), ('science', 182), ('read', 159), ('care', 125), ('written', 125), ('share', 122), ('information', 116), ('our', 110), ('medicine', 92), ('och', 84), ('news', 81), ('research', 78), ('new', 76), ('twitter', 72), ('menu', 70), ('why', 62), ('med', 61), ('published', 59), ('johns', 57), ('patients', 57), ('here', 54), ('guidelines', 53), ('updates', 53), ('health', 51), ('man', 51), ('skeptics', 47), ('tweet', 46), ('its',

In [16]:
# List URLs that were scraped under both labels (prints nothing when there is no overlap).
shared_links = [scraped_url for scraped_url in d_pse if scraped_url in d_sci]
for link in shared_links:
    print(link)

## Data Preparation

In [17]:
# Merge both crawls into one table: one row per URL with its text, its most
# common words and its label.
d_all = {}
for pages, label in ((d_pse, 'pseudoscience'), (d_sci, 'science')):
    for link, (words, top_words) in pages.items():
        row = [
            ' '.join(words[:max_words]),                   # text, capped at max_words
            ' '.join(word for word, _count in top_words),  # most common words, counts dropped
            label,
        ]
        # A URL present in both crawls keeps the label it was given first.
        d_all.setdefault(link, row)

df = pd.DataFrame.from_dict(d_all, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
http://www.ageofautism.com/,during spring kvetched twitter slapped sensitive content label our tweets included link photo before musk stepped see imagine twitter became open again hope twitter exposes censorship many faced especially crackdown mean want see overtly controversial despicable content allowed who defines pornography right know see defender twitter ditches misinformation policy plans reveal internal files free speech suppression effective twitter longer enforcing misleading information policy company adding will soon reveal internal files free speech suppression which could shed light twitters past action...,autism age 2022 our twitter deaths vaccine posted comments its sudden information also epidemic children week died content link policy 0600 perhaps issues take between healthy health current affairs here,pseudoscience
https://www.ageofautism.com/,during spring kvetched twitter slapped sensitive content label our tweets included link photo before musk stepped see imagine twitter became open again hope twitter exposes censorship many faced especially crackdown mean want see overtly controversial despicable content allowed who defines pornography right know see defender twitter ditches misinformation policy plans reveal internal files free speech suppression effective twitter longer enforcing misleading information policy company adding will soon reveal internal files free speech suppression which could shed light twitters past action...,autism age 2022 our twitter deaths vaccine posted comments its sudden information also epidemic children week died content link policy 0600 perhaps issues take between healthy health current affairs here,pseudoscience
https://www.ageofautism.com/donate.html,hello donation autism age tax now secure online donations scroll down their easy form always send paper electronic check well email any time ideas suggestions gentle critiques our cause unknown epidemic sudden deaths 2021 2022 health defense transcend fear blueprint mindful leadership public health real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate donate top,health donate defense public hello donation autism age tax secure online donations scroll easy form always send paper electronic check well email any ideas suggestions gentle critiques our cause unknown,pseudoscience
https://www.ageofautism.com/contact-us.html,autism age box 110546 06611 cause unknown epidemic sudden deaths 2021 2022 health defense transcend fear blueprint mindful leadership public health real bill gates big global war democracy public health health defense donate click cover buy book shop amazon support recent comments past current contributors connect search donate contact top,health defense public donate autism age box 110546 06611 cause unknown epidemic sudden deaths 2021 2022 transcend fear blueprint mindful leadership real bill gates big global war democracy click cover,pseudoscience
https://www.ageofautism.com/exclusives.html,editorials series here come elaborate fraud series deer special report epidemiological studies really tell note epidemiological studies here vaccines autism these studies represent most often cited papers scientists public health officials members media trying refute any evidence association between vaccinations autism serious methodological limitations design flaws conflicts interest problems related these studies these flaws been pointed government officials researchers medical review panels even authors studies themselves taken together limitations these studies make impossible conclude...,autism studies health series here public epidemiological vaccines officials limitations flaws study defense donate editorials elaborate fraud deer special report really tell note represent often cited papers scientists members media,pseudoscience


In [18]:
# Build the text DataLoaders from the scraped table: 'text' is the input and
# 'label' the target, in batches of 8.
# NOTE(review): no seed is passed, so the train/validation split presumably
# differs between runs — confirm and fix a seed if results must be reproducible.
dls = TextDataLoaders.from_df(df, bs=8, text_col='text', label_col='label')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos och system man den evolution may also study more restricted problem ancestry compare model common ancestry two species unique origin model according which species founded single couple carter 2014 carter xxunk 2019 och xxunk den med den man par man och och den art 7 xxrep 4 0 sedan 1 xxrep 4 0 man men den variation till 5 xxrep 5 0 xxunk 6 xxrep 3 0 till 1 xxrep 4 0 sedan man man genetic modeling human history part xxunk common descent unique origin approaches och genetic modeling human history part unique origin algorithm 2016 med med till discovery institute human origin possible 2019 man sin man par 500 xxrep 3 0 sedan med men 100 xxrep 3 0 discovery institute den den genetic modeling human history part xxunk common descent unique origin approaches 2016 man argue unique origin model where humanity xxunk single couple created diversity xxunk,science
1,xxbos 2022 before musk bought twitter social media giant suspended 11 xxrep 3 0 accounts removed 1 xxrep 5 0 pieces content violating its misinformation policy between 2020 2022 alone now musk owns has made 2022 help medical records more than 1 xxrep 3 0 health care workers interacting directly patients researchers concluded medical masks provide little benefit over respirators specifically researchers 2022 world experiencing strongest decline birth rates over 100 years its unprecedented figure yet except laypersons like xxunk independent researchers xxunk true health official making comments 2022 more than 230 million having received least two shots first time since shots came more vaccinated than persons dying infection just those eligible new boosters 2022 effort xxunk xxunk xxunk article xxunk benefits treatment goes great xxunk xxunk xxunk xxunk does xxunk prevent deaths 2022 now he s finally under xxunk role pandemic censorship went national institute xxunk infectious diseases director,pseudoscience
2,xxbos continuing browse our site agree our cookies revised privacy policy terms service agree group nations including xxunk digital vaccine read more future xxunk into world where everyone will xxunk technology lower class controlled read more evidence shows unprocessed red meat health risk however vilification natural raw meat products raises potential during victory gardens produced produce many now rediscovering many benefits gardening offers fresh inventory control plan just about everything earth land water minerals plants xxunk food energy genetically engineered foods grain crops like wheat oats barley common sources exposure toxic chemical linked even small amount just these vitamins risk dying even most severe cases drops like rock fact before musk bought twitter social media giant suspended 11 xxrep 3 0 accounts removed 1 xxrep 5 0 pieces content violating its help medical records more than 1 xxrep 3 0 health care workers interacting directly patients researchers concluded world experiencing,pseudoscience


In [19]:
# Ask PyTorch to empty its CUDA memory cache before the model is created.
torch.cuda.empty_cache()

In [20]:
# AWD_LSTM text classifier on the DataLoaders above, tracking accuracy.
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
# The log below shows one initial epoch followed by the 4 requested epochs.
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.905427,0.595364,0.75,00:04


epoch,train_loss,valid_loss,accuracy,time
0,0.475673,0.553062,0.8,00:05
1,0.4546,0.499734,0.75,00:05
2,0.370057,0.364939,0.95,00:05
3,0.328572,0.322171,0.9,00:05


In [21]:
# GPU status again, this time after fine-tuning, to compare with the baseline printed earlier.
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.4 GB | Cached: 2.0 GB


In [22]:
# Display sample texts; the output table has a `category` and a `category_` column
# (presumably actual vs. predicted label — confirm against the fastai docs).
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos heartland institute worlds leading think tanks national nonprofit research education organization based xxunk its mission since its founding 1984 discover develop promote solutions social economic problems action xxunk well think xxunk measure our success impact real world heartland institute xxunk essential role national increasingly international movement personal liberty limited government xxunk between freedom xxunk leading xxunk thinkers nations xxunk national state elected officials because effective been subject xxunk criticism even libel various liberal advocacy groups elected officials even please see reply our critics page where answer our critics set record xxunk hope will xxunk comments xxunk websites xxunk conversations friends xxunk staff works board directors 2 xxrep 3 0 xxunk 500 xxunk professional xxunk who serve policy xxunk fellows more than officials who pay xxunk serve our legislative forum heartland staff board directors policy experts heartland institute nonprofit organization xxunk xxunk under section internal xxunk code focus issues education,pseudoscience,pseudoscience
1,xxbos enter term search box find its xxunk xxunk far right xxunk increase xxunk number terms automatically xxunk completely turn feature off archives global warming real leading climate change under xxunk climate skepticism public xxunk misinformation xxunk xxunk xxunk global warming website gets skeptical about global warming skepticism our mission simple xxunk climate misinformation presenting science explaining techniques science denial recently happened upon xxunk detailed article explaining scientific consensus made xxunk even though lot material about consensus xxunk about scientific consensus actually now xxunk created page xxunk xxunk entry which will point our readers towards page small xxunk xxunk xxunk over consensus article xxunk skeptical science full text available below sections about consensus based xxunk xxunk week most likely seen xxunk where term scientific consensus has been xxunk xxunk people example often xxunk xxunk popular opinion think result xxunk determined vote just finding xxunk because opinion xxunk even xxunk xxunk,science,science
2,xxbos cover xxunk xxunk powerful xxunk potential cell engineering ability xxunk xxunk regulatory xxunk enables researchers design xxunk cells advance basic science improve xxunk xxunk now xxunk various cell types groups cells xxunk smart xxunk xxunk xxunk xxunk xxunk xxunk functions see special section beginning page xxunk xxunk online cover smart xxunk xxunk patch blood glucose control insulin xxunk xxunk xxunk essential xxunk blood glucose levels inspired mechanism xxunk developed patch enables xxunk both insulin xxunk xxunk treatment type xxunk xxunk model patch xxunk blood glucose levels within xxunk range xxunk risk xxunk online cover xxunk potential xxunk months cover xxunk xxunk youth which new xxunk xxunk xxunk cells emerging after xxunk infectious challenge xxunk xxunk xxunk red xxunk xxunk xxunk cells join xxunk xxunk gray lack xxunk capacity model xxunk supported new mouse models xxunk separate xxunk discussed focus online cover xxunk replicate xxunk often heavy xxunk lot energy,science,science
3,xxbos guide quackery health fraud intelligent decisions which operated network web sites mailing lists maintained center inquiry sites focus health frauds myths fads fallacies misconduct their main goal provide information difficult impossible get xxunk help visitors special areas interest sites cover autism chiropractic dentistry xxunk marketing many xxunk areas internet health pilot site provides links hundreds reliable health sites contains large library legal cases xxunk board actions government xxunk regulatory actions against questionable medical products these accessed through visit our affiliated sites menu above their contents searched once through page also offer health fraud discussion list more than xxunk members consumer health digest free weekly email newsletter summarizes scientific reports legislative developments enforcement actions information relevant consumer protection consumer its primary focus health occasionally includes scams our homepage hits xxunk million march 2 xxrep 3 0 million august xxunk million 2009 million 2019 new edition most comprehensive text available field,science,science
4,xxbos masks required inside our care facilities vaccinating eligible patients learn boosters additional doses testing patient care visitor guidelines more testing locations masks required inside our care facilities vaccinating eligible patients learn more vaccines boosters additional doses testing patient care visitor guidelines find more testing locations popular searches welcome been our care facilities while will notice few changes since last visit please review page learn about extra safety measures place steps ask take more safely care our patients monitor our communities masks required everyone age inside our care facilities even fully vaccinated boosted against mask will provided need following face coverings allowed gaiters masks exhalation valves clear face masks our staff members may wear additional personal protective equipment depending care providing increase xxunk xxunk omicron xxunk which highly xxunk such masking remains xxunk patients visitors johns medicine xxunk xxunk vaccination status johns medicine locations both clinical xxunk areas because patients,science,science
5,xxbos google has xxunk war independent media has xxunk blocking emails getting our readers recommend free xxunk email receiving service free encrypted email send receive service xxunk continue address natural news world currently live xxunk freedom engineered xxunk experiment xxunk xxunk living fake food social media xxunk xxunk xxunk supply fake xxunk money xxunk xxunk xxunk xxunk xxunk more power read more natural news growing portion people who usually xxunk xxunk xxunk xxunk xxunk xxunk after nearly two years worth zero policy xxunk social media posts xxunk china xxunk xxunk now read more natural news while tens millions people world over long xxunk world economic forum its founder just xxunk global xxunk their xxunk confirmed last interview china state television public xxunk china controlled read more natural news administration energy department official who has generated xxunk controversy legal hot water facing xxunk xxunk news official graduate now xxunk xxunk xxunk,pseudoscience,pseudoscience
6,xxbos today xxunk code red need xxunk xxunk back life many feel like xxunk xxunk room xxunk insights already xxunk xxunk leave our xxunk xxunk setting chaos next xxunk xxunk into political xxunk right thing our time need want xxunk people action more than xxunk xxunk because great ideas never xxunk off light xxunk our xxunk which happened while our nation xxunk pandemic xxunk our nations xxunk learned xxunk politics medicine get solutions why running united states help xxunk problems help heal spent xxunk years xxunk lifetime challenging xxunk comes xxunk xxunk xxunk xxunk looks xxunk xxunk seeking safety their loved ones being xxunk into xxunk expected days xxunk countless people saved broken xxunk xxunk personal xxunk xxunk xxunk give people second xxunk sometimes life changes xxunk making plans growing xxunk xxunk xxunk xxunk xxunk father xxunk xxunk xxunk xxunk xxunk xxunk floor loved country much anyone already here because,pseudoscience,pseudoscience
7,xxbos xxunk xxunk over remarks black xxunk xxunk xxunk dies top group through goal difference xxunk announce joint xxunk best album xxunk xxunk volcano xxunk xxunk volcano xxunk allow police xxunk robots xxunk cost effective way xxunk xxunk xxunk cost effective way xxunk xxunk killed celebrating world cup loss report xxunk 2022 revealed top stories wales xxunk xxunk really xxunk friend says raise concerns over cost new care service people eating pet food says community xxunk 600 energy payment xxunk until after xxunk social media joy need know viral xxunk man xxunk fans video viral xxunk man xxunk fans comes next wales after world cup exit keep latest sport xxunk xxunk world cup xxunk return home wait start historic series xxunk fight talks getting xxunk dies after xxunk xxunk xxunk discover xxunk ancient used eat quiz much know about xxunk xxunk capsule breaks distance record space xxunk xxunk loved xxunk,science,pseudoscience


In [23]:
# Hand-labelled sites used to spot-check the classifier: URL -> expected label.
test_sources = {
    'https://infowarslife.com/': 'pseudoscience',
    'https://www.bbc.com/news/': 'science',
    'https://www.dailymail.co.uk/': 'pseudoscience',
    'https://www.si.edu/explore/science': 'science',
    'https://www.foxnews.com/opinion': 'pseudoscience',
    'https://www.disclose.tv/': 'pseudoscience',
    'https://www.snopes.com/top/': 'science',
    'https://www.theskepticsguide.org/about': 'science',
    'https://www.cdc.gov/': 'science',
    'https://www.motherjones.com/': 'pseudoscience',
    'https://www.huffpost.com/': 'pseudoscience',
    'https://arstechnica.com/': 'science',
    'https://nationalreport.net/': 'pseudoscience',
    'https://newspunch.com/': 'pseudoscience',
    'https://www.trunews.com/': 'pseudoscience',
}

In [25]:
# Scrape each hand-labelled test site and record the classifier's verdict as
# [expected label, predicted label, probability of the predicted label].
d_pred = {}

for source in test_sources:
    try:
        page = get_page_all(source, k, max_words, ignore_words)
    except Exception as err:
        # One unreachable or unparsable site should not abort the whole evaluation.
        print("ERROR:",source,"could not be scraped:",err)
        continue

    length = len(page.cleaned_text)
    if  length < min_words:
        print("ERROR:",source,length,"words")
        continue

    text = ' '.join(page.cleaned_text)
    with learn.no_bar(), learn.no_logging():
        label, label_idx, probs = learn.predict(text)
    # Index the probabilities with the predicted class index instead of
    # hard-coding which position belongs to "science".
    d_pred[source] = [test_sources[source], label, probs[label_idx].item()]

# Use a separate name so the training frame `df` is not overwritten.
df_pred = pd.DataFrame.from_dict(d_pred, orient='index', columns=['actual', 'prediction', 'probability'])
df_pred

ERROR: https://www.trunews.com/ 17 words


Unnamed: 0,actual,prediction,probability
https://infowarslife.com/,pseudoscience,pseudoscience,0.934633
https://www.bbc.com/news/,science,science,0.586406
https://www.dailymail.co.uk/,pseudoscience,pseudoscience,0.829928
https://www.si.edu/explore/science,science,pseudoscience,0.530939
https://www.foxnews.com/opinion,pseudoscience,pseudoscience,0.579585
https://www.disclose.tv/,pseudoscience,pseudoscience,0.860695
https://www.snopes.com/top/,science,science,0.781662
https://www.theskepticsguide.org/about,science,science,0.76114
https://www.cdc.gov/,science,pseudoscience,0.724096
https://www.motherjones.com/,pseudoscience,pseudoscience,0.66355


In [None]:
#learn.save('2022.11.29 Model v2')

In [None]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)