# data

> Web scraping and tools for data collection and processing

In [1]:
#| default_exp data

In [1]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *

In [2]:
## Utility Function to Check GPU Status
def check_gpu():
    print("CUDA Available: ", torch.cuda.is_available())
    num_devices = torch.cuda.device_count()
    if num_devices > 0:
        for device in range(0,num_devices):
            print("Device", device, "|", torch.cuda.get_device_name(device), 
            "| Allocated:", round(torch.cuda.memory_allocated(device)/1024**3,1), "GB",
            "| Cached:", round(torch.cuda.memory_reserved(device)/1024**3,1), "GB")

torch.cuda.empty_cache()

In [3]:
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.0 GB | Cached: 0.0 GB


## Web Scraper

In [80]:
#| export
class Webpage:
    def __init__(self, url):
        self.url = url
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []

    def get_html(self, timeout = 5):
        user_agents = [ 
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36', 
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36', 
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148', 
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36' 
        ] 
        user_agent = random.choice(user_agents) 
        headers = {'User-Agent': user_agent} 
        page = requests.get(self.url, timeout=timeout, headers=headers)
        self.html = BeautifulSoup(page.text, "html.parser")

    def get_html_anchors(self, keyword="http"):
        for anchor in self.html.findAll('a'):
            link = anchor.get('href')
            if link == None or link == "":
                continue
            if keyword in link:
                self.links.append(link)
                
    def get_html_text(self, tags=["p"]):
        for tag in tags:
            for p in self.html.findAll(tag):
                p_text = p.getText().strip()
                if p_text == None or p_text == '':
                    continue
                self.text.append(p_text)

    def clean_text(self, enchant_dict="en_US"):
        rx = "[^a-zA-Z0-9 ]+"
        all_text = ' '.join(self.text)
        regex_text = re.sub(rx,'',all_text).strip()
        split = regex_text.split()
        if enchant_dict != "": d = enchant.Dict(enchant_dict)
        for word in split:
            if enchant_dict == "":
                self.cleaned_text.append(word)
            elif d.check(word): 
                self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=["the","to","of","and","a","in","on","is","for","by"]):
        if self.cleaned_text == "":
            text = self.text
        else:
            text = self.cleaned_text
        all_text = ' '.join(text).lower()
        split = all_text.split()
        split_ignore = [word for word in split if word not in ignore]
        counts = Counter(split_ignore)
        k_most_common = counts.most_common(k)
        self.most_common_words = k_most_common

In [81]:
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_html()
english_words = common_english.html.getText().lower()
english_words = english_words.split('\n')
print(len(english_words),"most common English words")
#english_words

1000 most common English words


In [82]:
pseudo_sources = ["http://www.ageofautism.com/",
 "http://www.naturalnews.com", 
 "https://foodbabe.com/starthere/",
 "http://www.chopra.com",
 "https://www.mercola.com/",
 "https://www.history.com/",
 "https://doctoroz.com/",
 "https://www.disclose.tv/",
 "https://christiananswers.net/",
 "https://heartland.org/"]

science_sources = ["https://sciencebasedmedicine.org/",
 "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
 "https://www.bbc.com/news/science_and_environment",
 "https://www.nature.com/",
 "https://www.science.org/",
 "https://www.snopes.com/top/",
 "https://quackwatch.org/",
 "https://www.skepdic.com/",
 "http://scibabe.com/",
 "http://pandasthumb.org/",
 "https://skepticalscience.com/",
 "https://www.cdc.gov/vaccinesafety/concerns/autism.html"]

In [83]:
url = pseudo_sources[0]
test_page = Webpage(url)
test_page.get_html()
test_page.get_html_anchors()
test_page.get_html_text(tags=["p","h1","h2","h3","span"])
test_page.clean_text()
test_page.k_common_words(k=5,ignore=english_words[:30])
test_page.most_common_words
test_page.cleaned_text

['Mary',
 'Holland',
 'Polly',
 'host',
 'a',
 'program',
 'on',
 'TV',
 'also',
 'available',
 'as',
 'audio',
 'on',
 'You',
 'can',
 'find',
 'all',
 'of',
 'the',
 'programming',
 'on',
 'TV',
 'HERE',
 'In',
 'This',
 'Week',
 'with',
 'Mary',
 'Holland',
 'Health',
 'Defense',
 'president',
 'and',
 'Polly',
 'programming',
 'manager',
 'Mary',
 'and',
 'Polly',
 'discuss',
 'the',
 'latest',
 'news',
 'on',
 'vaccines',
 'and',
 'other',
 'issues',
 'Click',
 'HERE',
 'This',
 'week',
 'Mary',
 'Holland',
 'Health',
 'Defense',
 'president',
 'and',
 'Polly',
 'programming',
 'manager',
 'covered',
 'the',
 'latest',
 'headlines',
 'on',
 'Big',
 'and',
 'other',
 'issues',
 'including',
 'news',
 'that',
 'the',
 'Senate',
 'signaled',
 'its',
 'time',
 'to',
 'move',
 'on',
 'from',
 'the',
 'pandemic',
 'with',
 'a',
 'bipartisan',
 '6236',
 'to',
 'end',
 'the',
 'federal',
 'governments',
 'emergency',
 'declaration',
 'on',
 'the',
 'pandemic',
 'Polly',
 'and',
 'Mary',
 

In [84]:
#| export
def get_page_all(url, k, ignore_words):
    page = Webpage(url)
    page.get_html()
    page.get_html_text(tags=["p","h1","h2","h3","span"])
    page.clean_text()
    page.k_common_words(k=k, ignore=ignore_words)
    page.get_html_anchors()
    return page

def get_all_links(url, dict, k, min_text_len=50, ignore_words=[], ignore_filenames=[".mp3",".jpg",".png"]):
    page = get_page_all(url, k, ignore_words)
    text = ' '.join(page.cleaned_text)
    dict[url] = [text, page.most_common_words]
    print(url,"Contains",len(page.links),"Links")

    for link in page.links:
        if all(x not in link for x in ignore_filenames):
            try:
                page = get_page_all(link, k, ignore_words)
                text = ' '.join(page.cleaned_text)
                if len(text) < min_text_len: continue
                dict[link] = [text, page.most_common_words]
            except:
                pass

In [85]:
k = 30 # words
min_text_len = 50
max_text_len = 2000
ignore_words = english_words[:50]
ignore_filenames = [".mp3",".jpg",".png",".mp4","facebook.com","twitter.com"]

In [86]:
#| hide
#d_pse = {}
#get_all_links(pseudo_sources[2], d_pse, k, min_text_len, ignore_words, ignore_filenames)
#d_pse

In [87]:
d_pse = {}
d_sci = {}

for source in pseudo_sources:
    get_all_links(source, d_pse, k, min_text_len, ignore_words, ignore_filenames)
for source in science_sources:
    get_all_links(source, d_sci, k, min_text_len, ignore_words, ignore_filenames)


http://www.ageofautism.com/ Contains 697 Links
http://www.naturalnews.com Contains 324 Links
https://foodbabe.com/starthere/ Contains 122 Links
http://www.chopra.com Contains 100 Links
https://www.mercola.com/ Contains 125 Links
https://www.history.com/ Contains 83 Links
https://doctoroz.com/ Contains 133 Links
https://www.disclose.tv/ Contains 140 Links
https://christiananswers.net/ Contains 9 Links
https://heartland.org/ Contains 142 Links
https://sciencebasedmedicine.org/ Contains 259 Links
https://www.hopkinsmedicine.org/gim/research/method/ebm.html Contains 103 Links
https://www.bbc.com/news/science_and_environment Contains 131 Links
https://www.nature.com/ Contains 73 Links
https://www.science.org/ Contains 29 Links
https://www.snopes.com/top/ Contains 35 Links
https://quackwatch.org/ Contains 136 Links
https://www.skepdic.com/ Contains 103 Links
http://scibabe.com/ Contains 118 Links


In [None]:
count_pse = Counter()
count_sci = Counter()
for link in d_pse:
    count_pse+=Counter(dict(d_pse[link][1]))
for link in d_sci:
    count_sci+=Counter(dict(d_sci[link][1]))

print("#### Pseudoscience",k,"Most Common Words ####\n",count_pse.most_common(k),"\n\n")
print("#### Science",k,"Most Common Words ####\n",count_sci.most_common(k),"\n\n")

#### Pseudoscience 30 Most Common Words ####
 [('autism', 6336), ('health', 5135), ('our', 4567), ('my', 3674), ('about', 3659), ('has', 3635), ('more', 3572), ('vaccine', 3460), ('will', 3387), ('posted', 3351), ('any', 3249), ('2022', 3161), ('their', 2922), ('age', 2921), ('food', 2888), ('its', 2730), ('am', 2683), ('us', 2216), ('who', 2211), ('if', 2135), ('so', 1947), ('news', 1915), ('no', 1731), ('may', 1660), ('been', 1652), ('these', 1628), ('children', 1614), ('information', 1590), ('like', 1566), ('natural', 1492)] 


#### Science 30 Most Common Words ####
 [('0', 28983), ('j', 3412), ('climate', 3092), ('n', 2979), ('2', 2643), ('m', 2593), ('r', 2519), ('more', 2282), ('00000', 2124), ('p', 2104), ('c', 2050), ('s', 1935), ('about', 1888), ('change', 1781), ('scholar', 1682), ('2022', 1659), ('science', 1655), ('d', 1624), ('medicine', 1607), ('global', 1591), ('our', 1450), ('g', 1443), ('3', 1424), ('612', 1410), ('l', 1332), ('data', 1303), ('t', 1299), ('emissions', 

## Data Preparation

In [None]:
d_all = {}
for link in d_pse:
    text = d_pse[link][0]
    if len(text) > max_text_len: text = text[:max_text_len]
    common_words = ' '.join([count[0] for count in d_pse[link][1]])
    if common_words != '' and link not in d_all:
        d_all[link] = [text, common_words, 'pseudoscience']

for link in d_sci:
    text = d_sci[link][0]
    if len(text) > max_text_len: text = text[:max_text_len]
    common_words = ' '.join([count[0] for count in d_sci[link][1]])
    if common_words != '' and link not in d_all:
        d_all[link] = [text, common_words, 'science']

df = pd.DataFrame.from_dict(d_all, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
http://www.ageofautism.com/,host a program on also available as audio on you can find all of the programming on here in this week with health defense president and programming manager and discuss the latest news on vaccines and other issues click here this week health defense president and programming manager covered the latest headlines on big and other issues including news that the senate signaled its time to move on from the pandemic with a bipartisan 6236 to end the federal governments emergency declaration on the pandemic and also discussed that is pushing the claiming that children are at high risk during the ...,autism age vaccine 2022 am posted health our 0600 my about will its their comments us children current her affairs more who me been reading public defense new him continue,pseudoscience
https://www.ageofautism.com/,host a program on also available as audio on you can find all of the programming on here in this week with health defense president and programming manager and discuss the latest news on vaccines and other issues click here this week health defense president and programming manager covered the latest headlines on big and other issues including news that the senate signaled its time to move on from the pandemic with a bipartisan 6236 to end the federal governments emergency declaration on the pandemic and also discussed that is pushing the claiming that children are at high risk during the ...,autism age vaccine 2022 am posted health our 0600 my about will its their comments us children current her affairs more who me been reading public defense new him continue,pseudoscience
https://www.ageofautism.com/donate.html,hello your donation to autism age is tax now use for secure online donations scroll down for their easy to use form you can always send us a paper or electronic check as well email me at any time with ideas suggestions or gentle critiques our is you ed cause unknown the epidemic of sudden deaths in 2021 2022 health defense transcend fear a blueprint for mindful leadership in public health jr f the real bill gates big and the global war on democracy and public health health defense donate click the cover buy the book shop amazon support recent comments past current contributors connect sear...,health donate defense public hello donation autism age tax now secure online donations scroll down their easy form always send us paper electronic check well email me any time ideas,pseudoscience
https://www.ageofautism.com/contact-us.html,autism age box 110546 ct 06611 ed cause unknown the epidemic of sudden deaths in 2021 2022 health defense transcend fear a blueprint for mindful leadership in public health jr f the real bill gates big and the global war on democracy and public health health defense donate click the cover buy the book shop amazon support recent comments past current contributors connect search donate contact us top,health defense public donate autism age box 110546 ct 06611 ed cause unknown epidemic sudden deaths 2021 2022 transcend fear blueprint mindful leadership jr f real bill gates big global,pseudoscience
https://www.ageofautism.com/exclusives.html,editorials from the series by the and here i come you had me at an elaborate fraud series deer special report what do epidemiological studies really tell us a note from there are 16 epidemiological studies here on vaccines and autism these studies represent the most often cited papers by scientists public health officials and members of the media when trying to refute any evidence of an association between vaccinations and autism there are serious methodological limitations design flaws conflicts of interest or other problems related to each of these 16 studies these flaws have been pointe...,autism studies health these series here public epidemiological 16 vaccines officials limitations flaws study defense donate editorials come me elaborate fraud deer special report do really tell us note represent,pseudoscience


In [None]:
dls = TextDataLoaders.from_df(df, bs=8, text_col='text', label_col='label')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos 36 xxwrep 4 0 v 0 g 0 l o n j 0 m 0 z 0 y xxwrep 3 0 k xxwrep 4 0 1 0 0 a 0 o 0 xxunk 0 xxunk 0 xxunk 0 ex 0 0 xxunk 0 xxunk 0 xxunk xxwrep 3 0 xxunk 0 xxunk 0 xxunk xxwrep 3 0 xxunk 0 xxunk 0 xxunk 0 c o 0 chum 0 xxunk 0 xxunk 0 xxunk 0 v 0 0 xxunk 0 xxunk 0 xxunk xxwrep 3 0 xxunk 0 xxunk 0 xxunk 0 u 0 0 xxunk 0 xxunk 0 xxunk 0 3 e s i 0 s 0 xxunk 0 xxunk 0 xxunk 0 m f y 0 0 xxunk 0 xxunk 0 xxunk xxwrep 3 0 xxunk 0 xxunk 0 xxunk xxwrep 3 0 xxunk 0 xxunk 0 xxunk 0 is 6 0 f 0 xxunk 0 xxunk 0,science
1,xxbos c y 08 y y 73 xxwrep 13 y nu s v 12 g ab f 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 20211 xxunk 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 xxunk xxunk 0 0 g p x 0 0 xxunk version xxunk build xxunk xxunk xxunk 0 0 b x n f k b xxwrep 3 40 xxunk u h f z at m s b w a q w 0 xxunk 0 xxunk 0 xxunk 0 0 r xxunk 0 r xxunk 0 r xxunk 0 r xxunk 0 r xxunk 0 r xxunk 0 r xxunk 0 r xxunk 0 r 170 0 r xxunk 0 r xxunk 0 00 595276 50 0 00 595276 0 0 xxunk 0 xxunk xxunk xxrep 3 6,science
2,xxbos in stock ready to ship d verified customer team member hi the of ever since we launched our small company i have wanted to make a snack bar that i would be happy to eat you see always been the type of person to reward myself at the end of a long day with a little treat it was a xxunk i d grab my favorite cup of tea and a treat then i d sit on the xxunk and enjoy it it was a little me moment that would help me xxunk from the day the problem sometimes i would eat a piece of chocolate other times it was another snack that would feel good in the moment and horrible later on i needed something better and why i set out to create something better i wanted a snack bar a snack bar that i could happily eat,pseudoscience


In [None]:
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.606731,0.441208,0.798077,00:23


epoch,train_loss,valid_loss,accuracy,time
0,0.484572,0.402184,0.804487,00:41
1,0.395984,0.361449,0.858974,00:41
2,0.311677,0.279523,0.897436,00:41
3,0.213757,0.2505,0.907051,00:41


In [None]:
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.7 GB | Cached: 1.9 GB


In [None]:
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos xxwrep 7 0 r 365 xxwrep 9 0 r xxunk xxwrep 24 0 r 373 xxwrep 9 0 r xxunk xxwrep 9 0 r 381 xxwrep 8 0 384 xxrep 10 0 xxunk xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5,pseudoscience,pseudoscience
1,xxbos c e 0 xxunk r 0 0 18 0 0 xxunk 0 xxunk xxunk 0 xxunk 0 xxunk 0 xxunk 0 xxunk 0 19 xxwrep 3 0 xxunk 0 xxunk 0 xxunk 0 em h t 0 0 22 0 0 384 0 384 xxunk 0 xxunk 0 xxunk 0 23 xxwrep 3 0 xxunk 0 xxunk 0 xxunk 0 a 0 0 26 0 0 xxunk 0 xxunk 2903 0 xxunk 0 xxunk 0 xxunk 0 27 0 0 m 0 xxunk 0 xxunk 0 xxunk 0 xxunk 0 0 30 0 0 384 0 384 2903 0 1 xxrep 3 7 0 xxunk 0 31 xxwrep 3 0 xxunk 0 xxunk 0 xxunk 0 s p q y o 0 0 34 0 0 387 0 387 2903 0 xxunk 0 xxunk 0 xxunk 0 35 xxwrep 3 0 xxunk 0 xxunk 0 xxunk 0 e 0,science,science
2,xxbos och p system man fr den vi i av evolution one may also study the more restricted problem of ancestry and compare a model with common ancestry of the two species with a unique origin model according to which each species is founded by one single couple and carter 2014 carter 2018 and 2019 och xxunk den hr med vi vi p den man r om r i par en man och en om och om den r i art en fr 7 xxrep 4 0 r sedan d 1 xxrep 4 0 man men fr den variation vi vi d till 5 xxrep 5 0 r hr fr xxunk ha fr 6 xxrep 3 0 till 1 xxrep 4 0 r sedan om man man r i av genetic modeling of human history part 1 comparison of common xxunk and unique origin approaches och genetic modeling of human,science,science
3,xxbos all we know next is that for no apparent reason john pulled that control rod up by about twenty inches at the end of we had the unused heart of a nuclear bomb sitting around waiting to fuck shit up scientists at national laboratory be expected to just leave it alone it was so shiny so new so full of untapped data receive notifications about new blog entries and a free puppy as long as supplies out of puppies smash this subscribe button do it copyright 2022 theme by themes fission come for the science stay for the dirty jokes daily mos the reactor daily mos the xxunk core 10 2021 0 16 2021 2 tweet support on search for i ca nt stop getting on about 2 8 5 xxwrep 3 2 5 3 3 xxwrep 3 2 3 xxwrep 4 2 5 2 2 xxwrep 3 3,science,science
4,xxbos when i recall the scientist who first attempted to explain how xxunk of rock xxunk across the planet not surprised that the reaction was all due respect fuck yourself the ground gave way under the massive amount of weight to reveal a giant underground xxunk xxunk with natural gas so they did the responsible thing and xxunk the fucking pit on fire a mud volcano has been erupting 50 swimming pools worth of mud onto the east java province of for over a decade why receive notifications about new blog entries and a free puppy as long as supplies out of puppies smash this subscribe button do it copyright 2022 theme by themes xxunk come for the science stay for the dirty jokes daily mos xxunk xxunk daily mos the fifty year gas fire daily mos the mud volcano may 21 2021 0 4 2021 2 8 2021 0,science,science
5,xxbos a xxunk xxunk into your work xxunk you of killing people and demands your bodily xxunk all for making some delicious xxunk ice cream its been hard to get people to take some common sense advice during this pandemic but hand washing was an easy one to get people on board with the first guy who said hey maybe wash your hands when doctoring was beaten to death in an insane asylum receive notifications about new blog entries and a free puppy as long as supplies out of puppies smash this subscribe button do it copyright 2022 theme by themes come for the science stay for the dirty jokes daily mos the narrative of xxunk daily mos the invention of march 7 2021 1 15 2021 0 tweet support on search for i ca nt stop getting on about 2 8 5 xxwrep 3 2 5 3 3 xxwrep,science,science
6,xxbos all we know next is that for no apparent reason john pulled that control rod up by about twenty inches in 1951 they started dumping nuclear waste into lake and all was fine for a while really why ca nt you just dump a bunch of radioactive materials into a lake but have to be pretty damn determined to get radiation poisoning from a banana receive notifications about new blog entries and a free puppy as long as supplies out of puppies smash this subscribe button do it copyright 2022 theme by themes radiation come for the science stay for the dirty jokes daily mos the reactor daily mos radioactive lake daily mos the mildly radioactive banana 10 2021 0 march 3 2021 0 2 2021 2 tweet support on search for i ca nt stop getting on about 2 8 5 xxwrep 3 2 5 3 3 xxwrep,science,science
7,xxbos as the story goes a patient named bill asked for something that could help with his sexual weakness which i think was speak for raging xxunk medicine please and thank you the juice racket is based on the biblical diet of king snake oil was wronged snake oil salesmen on the other hand every last one receive notifications about new blog entries and a free puppy as long as supplies out of puppies smash this subscribe button do it copyright 2022 theme by themes pseudoscience come for the science stay for the dirty jokes daily mos the great goat xxunk xxunk daily mos the biblical roots of daily mos the xxunk of snake oil march 21 2021 0 22 2021 1 1 2021 9 tweet support on search for i ca nt stop getting on about 2 8 5 xxwrep 3 2 5 3 3 xxwrep 3 2 3,science,science


In [None]:
test_sources = ['https://infowarslife.com/',
'https://www.bbc.com/news/',
'https://www.dailymail.co.uk/',
'https://www.si.edu/explore/science',
'https://www.foxnews.com/opinion',
'https://www.disclose.tv/',
'https://www.snopes.com/top/',
'https://www.theskepticsguide.org/about',
'https://www.cdc.gov/',
'https://www.motherjones.com/',
'https://www.huffpost.com/']

In [None]:
d_pred = {}

for source in test_sources:
    page = get_page_all(source, k, ignore_words)
    length = len(page.cleaned_text)
    if  length < min_text_len:
        print("ERROR:",source,length,"words")
    else:
        common_words = ' '.join([count[0] for count in page.most_common_words])
        text = page.cleaned_text
        with learn.no_bar(), learn.no_logging():
            prediction = learn.predict(text)
        if prediction[0] == "science":
            p = prediction[2][1].item()
        else:
            p = prediction[2][0].item()
        d_pred[source] = [prediction[0], p]

df = pd.DataFrame.from_dict(d_pred, orient='index', columns=['prediction', 'probability'])
df

Unnamed: 0,prediction,probability
https://infowarslife.com/,pseudoscience,0.747941
https://www.bbc.com/news/,science,0.978869
https://www.dailymail.co.uk/,pseudoscience,0.626198
https://www.si.edu/explore/science,science,0.955084
https://www.foxnews.com/opinion,pseudoscience,0.954521
https://www.disclose.tv/,pseudoscience,0.698332
https://www.snopes.com/top/,science,0.948207
https://www.theskepticsguide.org/about,science,0.991358
https://www.cdc.gov/,science,0.901797
https://www.motherjones.com/,pseudoscience,0.847322


In [39]:
#learn.save('2022.11.28 Model')

Path('models/2022.11.28 Model.pth')

In [40]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)