# data

> Web scraping and tools for data collection and processing

In [1]:
#| default_exp data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sbavery/pseudometer/blob/main/nbs/01_data.ipynb)

In [2]:
#| export
import warnings
warnings.filterwarnings('ignore')
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
from fastai.text.all import *
import hashlib
import pickle

In [3]:
## Utility Function to Check GPU Status
def check_gpu():
    print("CUDA Available: ", torch.cuda.is_available())
    num_devices = torch.cuda.device_count()
    if num_devices > 0:
        for device in range(0,num_devices):
            print("Device", device, "|", torch.cuda.get_device_name(device), 
            "| Allocated:", round(torch.cuda.memory_allocated(device)/1024**3,1), "GB",
            "| Cached:", round(torch.cuda.memory_reserved(device)/1024**3,1), "GB")

torch.cuda.empty_cache()

In [4]:
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.0 GB | Cached: 0.0 GB


## Web Scraper

In [5]:
#| export
class Webpage:
    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []
    
    def get_page(self, headers, min_size, max_size):
        r = requests.get(self.url, stream=True, headers=headers)
        content_length = int(r.headers.get('Content-Length', 0))
        data = []
        length = 0

        if content_length > max_size:
            return None

        for chunk in r.iter_content(1024):
            data.append(chunk)
            length += len(chunk)
            if length > max_size:
                return None
        r._content = b''.join(data)
        if len(r.text) < min_size: return None
        return r.text

    def get_page_html(self, min_size=1000, max_size=2000000):
        user_agents = [ 
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36', 
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36', 
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148', 
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36' 
        ] 
        user_agent = random.choice(user_agents) 
        headers = {'User-Agent': user_agent} 
        self.page_text = self.get_page(headers, min_size, max_size)
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True

    def get_hash_str(self, inp=""):
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()

    def get_html_anchors(self, keyword="http"):
        for anchor in self.html.findAll('a'):
            link = anchor.get('href')
            if link == None or link == "":
                continue
            if keyword in link:
                self.links.append(link)
                
    def get_html_text(self, tags=["p"]):
        for tag in tags:
            for p in self.html.findAll(tag):
                p_text = p.getText().strip()
                if p_text == None or p_text == '':
                    continue
                self.text.append(p_text)

    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], min_word_len=2):
        rx = "[^a-zA-Z0-9 ]+"
        all_text = ' '.join(self.text).lower()
        regex_text = re.sub(rx,'',all_text).strip()
        split = regex_text.split()
        split = [word for word in split if word not in ignore]
        if enchant_dict != "": d = enchant.Dict(enchant_dict)
        for word in split:
            if len(self.cleaned_text) >= max_words: break
            if len(word) > min_word_len:
                if enchant_dict == "":
                    self.cleaned_text.append(word)
                elif d.check(word): 
                    self.cleaned_text.append(word)

    def k_common_words(self, k=10, ignore=["the","to","of","and","a","in","on","is","for","by"]):
        if self.cleaned_text == "":
            text = self.text
        else:
            text = self.cleaned_text
        all_text = ' '.join(text).lower()
        split = all_text.split()
        split_ignore = [word for word in split if word not in ignore]
        counts = Counter(split_ignore)
        k_most_common = counts.most_common(k)
        self.most_common_words = k_most_common

    def save_text(self, path, fname):
        file = open(path+fname, 'wb')
        pickle.dump(self.cleaned_text, file)
        file.close()

    def load_text(self, path, fname):
        file = open(path+fname, 'rb')
        self.cleaned_text = pickle.load(file)
        file.close()

    def save_links(self, path, fname):
        file = open(path+fname, 'wb')
        pickle.dump(self.links, file)
        file.close()

    def load_links(self, path, fname):
        file = open(path+fname, 'rb')
        self.links = pickle.load(file)
        file.close()

In [6]:
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_page_html(min_size=1000)
english_words = common_english.html.getText().lower()
english_words = english_words.split('\n')
print(len(english_words),"most common English words")
#english_words

1000 most common English words


In [7]:
pseudo_sources = ["http://www.ageofautism.com/",
 "http://www.naturalnews.com", 
 "https://foodbabe.com/starthere/",
 "http://www.chopra.com",
 "https://www.mercola.com/",
 "https://www.history.com/",
 "https://doctoroz.com/",
 "https://www.disclose.tv/",
 "https://christiananswers.net/",
 "https://heartland.org/",
 "https://www.dailymail.co.uk/",
 "https://www.motherjones.com/"]

science_sources = ["https://sciencebasedmedicine.org/",
 "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
 "https://www.bbc.com/news/science_and_environment",
 "https://www.nature.com/",
 "https://www.science.org/",
 "https://www.snopes.com/top/",
 "https://quackwatch.org/",
 "https://www.skepdic.com/",
 "http://scibabe.com/",
 "http://pandasthumb.org/",
 "https://skepticalscience.com/",
 "https://www.cdc.gov/"]

In [8]:
url = science_sources[7]
path = os.getcwd()+'/data/'
if os.path.isdir(path) is False: os.mkdir(path)

In [9]:
test_page = Webpage(url)
test_page.get_page_html()
test_page.get_html_text()
test_page.get_html_anchors()
test_page.clean_html_text(500)
test_page.save_text(path, test_page.hash+'.text')
test_page.save_links(path, test_page.hash+'.links')

In [10]:
new_page = Webpage(url)
fname_text = new_page.hash+'.text'
fname_links = new_page.hash+'.links'
if os.path.isfile(path+fname_text): 
    new_page.load_text(path, fname_text)
    print("Loading Text")
else:
    new_page.get_page_html()
    new_page.get_html_text(tags=["p","h1","h2","h3","span"])
    new_page.clean_html_text(500, ignore=english_words[:50])
    new_page.save_text(path, fname_text)

if os.path.isfile(path+fname_links): 
    new_page.load_links(path, fname_links)
    print("Loading Links")
else:
    new_page.get_page_html()
    new_page.get_html_anchors()
    new_page.save_links(path, fname_links)
new_page.k_common_words(k=5,ignore=english_words[:50])
new_page.cleaned_text

Loading Text
Loading Links


['the',
 'skeptics',
 'dictionary',
 'features',
 'definitions',
 'arguments',
 'and',
 'essays',
 'hundreds',
 'strange',
 'beliefs',
 'amusing',
 'deceptions',
 'and',
 'dangerous',
 'delusions',
 'also',
 'features',
 'dozens',
 'entries',
 'logical',
 'fallacies',
 'cognitive',
 'biases',
 'perception',
 'science',
 'and',
 'philosophy',
 'also',
 'posted',
 'are',
 'over',
 'years',
 'reader',
 'comments',
 'date',
 'status',
 'entry',
 'reader',
 'comments',
 'natural',
 'cancer',
 'cures',
 'revision',
 'argument',
 'ignorance',
 'reader',
 'comments',
 'reader',
 'comments',
 'psychokinesis',
 'reader',
 'comments',
 'sample',
 'the',
 'skeptics',
 'dictionary',
 '1858',
 'grotto',
 'the',
 'river',
 'gave',
 'near',
 'peasant',
 'named',
 'claimed',
 'that',
 'the',
 'virgin',
 'identifying',
 'herself',
 'the',
 'immaculate',
 'conception',
 'appeared',
 'her',
 'some',
 'think',
 'such',
 'great',
 'number',
 'have',
 'provided',
 'opportunity',
 'channel',
 'short',
 'theol

In [11]:
#| export
def get_page_all(url, k, max_words, ignore_words, path = None):
    page = Webpage(url)
    fname_text = page.hash+'.text'
    fname_links = page.hash+'.links'
    if path == None:
        page.get_page_html()
        page.get_html_text(tags=["p","h1","h2","h3","span"])
        page.get_html_anchors()
        page.clean_html_text(max_words, ignore=english_words[:50])
    else:
        if os.path.isfile(path+fname_text): 
            page.load_text(path, fname_text)
        else:
            page.get_page_html()
            page.get_html_text(tags=["p","h1","h2","h3","span"])
            page.clean_html_text(max_words, ignore=english_words[:50])
            page.save_text(path, fname_text)

        if os.path.isfile(path+fname_links): 
            page.load_links(path, fname_links)
        else:
            if page.html == "": page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)

    if page.cleaned_text is not None:
        page.k_common_words(k=k, ignore=ignore_words)
    return page

def get_all_links(url, dict, k, min_words=20, max_words=500, ignore_words=[], ignore_filenames=[".mp3",".jpg",".png"], max_links="", path=None):
    page = get_page_all(url, k, max_words, ignore_words, path)
    if page.cleaned_text is not []:
        dict[url] = [page.cleaned_text, page.most_common_words]
        print(url,"Contains",len(page.links),"Links")
        if max_links == "" or max_links > len(page.links): max_links=len(page.links)
        
        for link in page.links[:max_links]:
            if all(x not in link for x in ignore_filenames):
                try:
                    page = get_page_all(link, k, max_words, ignore_words, path)
                    if page.cleaned_text is not []:
                        if len(page.cleaned_text) < min_words: continue
                        dict[link] = [page.cleaned_text, page.most_common_words]
                except:
                    pass
    else:
        print(url,"returned None, Skipping...")

In [12]:
k = 30 # words
min_words = 50
max_words = 450
max_links = 100
ignore_words = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on'] #english_words[:20]
ignore_filenames = [".mp3",".jpg",".png",".mp4",".jfif","facebook.com","twitter.com"]

In [13]:
#| hide
#d_pse = {}
#get_all_links(pseudo_sources[2], d_pse, k, min_text_len, ignore_words, ignore_filenames)
#d_pse

In [14]:
d_pse = {}
d_sci = {}
path = os.getcwd()+'/data/'
if os.path.isdir(path) is False: os.mkdir(path)
path_pse = path+'pseudoscience/'
path_sci = path+'science/'
if os.path.isdir(path_pse) is False: os.mkdir(path_pse)
if os.path.isdir(path_sci) is False: os.mkdir(path_sci)

for source in pseudo_sources:
    get_all_links(source, d_pse, k, min_words, max_words, ignore_words, ignore_filenames, max_links, path_pse)
for source in science_sources:
    get_all_links(source, d_sci, k, min_words, max_words, ignore_words, ignore_filenames, max_links, path_sci)


http://www.ageofautism.com/ Contains 705 Links
http://www.naturalnews.com Contains 323 Links
https://foodbabe.com/starthere/ Contains 124 Links
http://www.chopra.com Contains 100 Links
https://www.mercola.com/ Contains 125 Links
https://www.history.com/ Contains 84 Links
https://doctoroz.com/ Contains 28 Links


In [None]:
count_pse = Counter()
count_sci = Counter()
for link in d_pse:
    count_pse+=Counter(dict(d_pse[link][1]))
for link in d_sci:
    count_sci+=Counter(dict(d_sci[link][1]))

print("#### Pseudoscience",k,"Most Common Words ####\n",count_pse.most_common(k),"\n\n")
print("#### Science",k,"Most Common Words ####\n",count_sci.most_common(k),"\n\n")

#### Pseudoscience 30 Most Common Words ####
 [('health', 2767), ('our', 2056), ('2022', 2031), ('vaccine', 1420), ('food', 1282), ('autism', 1175), ('posted', 1158), ('its', 1091), ('comment', 1090), ('information', 863), ('news', 831), ('comments', 726), ('new', 720), ('children', 714), ('after', 657), ('public', 646), ('email', 621), ('any', 582), ('twitter', 546), ('age', 522), ('also', 509), ('please', 494), ('natural', 492), ('free', 478), ('just', 462), ('vaccines', 456), ('climate', 451), ('years', 450), ('medical', 440), ('share', 437)] 


#### Science 30 Most Common Words ####
 [('health', 1678), ('medicine', 1468), ('science', 1288), ('our', 1251), ('2022', 1165), ('climate', 1137), ('14106144685', 930), ('services', 905), ('menu', 895), ('711', 868), ('human', 818), ('news', 748), ('new', 731), ('department', 719), ('information', 714), ('patients', 684), ('care', 665), ('its', 628), ('research', 625), ('access', 623), ('used', 567), ('share', 565), ('johns', 527), ('change

In [None]:
for link in d_pse:
    if link in d_sci.keys():
        print(link)

https://cookiedatabase.org/tcf/purposes/


## Data Preparation

In [None]:
d_all = {}
for link in d_pse:
    text = d_pse[link][0]
    if len(text) > max_words: text = text[:max_words]
    common_words = ' '.join([count[0] for count in d_pse[link][1]])
    if link not in d_all:
        d_all[link] = [' '.join(text), common_words, 'pseudoscience']

for link in d_sci:
    text = d_sci[link][0]
    if len(text) > max_words: text = text[:max_words]
    common_words = ' '.join([count[0] for count in d_sci[link][1]])
    if link not in d_all:
        d_all[link] = [' '.join(text), common_words, 'science']

df = pd.DataFrame.from_dict(d_all, orient='index', columns=['text', 'common_words', 'label'])
df.head()

Unnamed: 0,text,common_words,label
http://www.ageofautism.com/,during spring kvetched twitter slapped sensitive content label our tweets included link photo before musk stepped see imagine twitter became open again hope twitter exposes censorship many faced especially crackdown mean want see overtly controversial despicable content allowed who defines pornography right know see defender twitter ditches misinformation policy plans reveal internal files free speech suppression effective twitter longer enforcing misleading information policy company adding will soon reveal internal files free speech suppression which could shed light twitters past action...,twitter autism 2022 vaccine age deaths content link policy information its also sudden between posted comments epidemic gender issues take misinformation accounts analysis health current affairs cause unknown 2021 young,pseudoscience
https://www.ageofautism.com/,during spring kvetched twitter slapped sensitive content label our tweets included link photo before musk stepped see imagine twitter became open again hope twitter exposes censorship many faced especially crackdown mean want see overtly controversial despicable content allowed who defines pornography right know see defender twitter ditches misinformation policy plans reveal internal files free speech suppression effective twitter longer enforcing misleading information policy company adding will soon reveal internal files free speech suppression which could shed light twitters past action...,twitter autism 2022 vaccine age deaths content link policy information its also sudden between posted comments epidemic gender issues take misinformation accounts analysis health current affairs cause unknown 2021 young,pseudoscience
https://www.ageofautism.com/exclusives.html,editorials series here come elaborate fraud series deer special report epidemiological studies really tell note epidemiological studies here vaccines autism these studies represent most often cited papers scientists public health officials members media trying refute any evidence association between vaccinations autism serious methodological limitations design flaws conflicts interest problems related these studies these flaws been pointed government officials researchers medical review panels even authors studies themselves taken together limitations these studies make impossible conclude...,autism studies health series here public epidemiological vaccines officials limitations flaws study defense donate editorials elaborate fraud deer special report really tell note represent often cited papers scientists members media,pseudoscience
https://www.ageofautism.com/science/,tom urged get their bivalent vaccine booster yesterday twitter kindly let him followers know about week class starting week through countermeasures injury compensation program program absolves corporations whose products harm during pandemic take course both live recorded version national vaccine injury compensation program countermeasures injury compensation program used emergency authorized course students will learn structure function defects programs created congress award compensation adults children potentially actually harmed killed vaccines will review variety important topics revi...,2020 autism health high compensation information consequence disease vaccine injury program march course available age public children parents posted government increase letter 2019 countermeasures during national new 2022 science comments,pseudoscience
https://www.ageofautism.com/a-welcome-from-dan-olmste.html,welcome age autism daily web newspaper autism epidemic donate please either donate button right sidebar secure accepts credit cards send check autism age box 110546 06611 donations tax deductible our nonprofit 471831987 thank published give voice those who believe autism environmentally induced illness treatable children recover most part major media united states interested point view wont investigate causes possible biomedical treatments autism independently listen most important people parents many whom witnessed autistic regression medical illness after vaccinations those things more b...,autism comments donate health age our those epidemic believe illness medical defense public welcome daily web newspaper please either button right sidebar secure accepts credit cards send check box 110546,pseudoscience


In [None]:
dls = TextDataLoaders.from_df(df, bs=8, text_col='text', label_col='label')
dls.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos och system man den evolution may also study more restricted problem ancestry compare model common ancestry two species unique origin model according which species founded single couple carter 2014 carter 2018 2019 och xxunk den med den man par man och och den art 7 xxrep 4 0 sedan 1 xxrep 4 0 man men den variation till 5 xxrep 5 0 scenario 6 xxrep 3 0 till 1 xxrep 4 0 sedan man man genetic modeling human history part comparison common descent unique origin approaches och genetic modeling human history part unique origin algorithm 2016 med med till discovery institute human origin possible 2019 man sin man par 500 xxrep 3 0 sedan med men 100 xxrep 3 0 discovery institute den den genetic modeling human history part comparison common descent unique origin approaches 2016 man argue unique origin model where humanity arose single couple created diversity seems,science
1,xxbos heartland institute proud make available parry memorial library nations best libraries limited government nearly 2 xxrep 4 0 library located heartland xxunk north heights library online database also book wish list amazon check our list then check collection books may willing donate see book donation guidelines below parry library holds nearly 2 xxrep 4 0 books history economics education environment issues health care policy law philosophy topics collection will special interest students scholars studying economics political science elected officials members their xxunk concerned citizens watch grand opening presentations below read 2016 parry library accepted membership rails reaching across library system rails serves approximately 1300 academic public school special library agencies northern collection offers books following topic areas constantly updated collection available online online searchable database library open public 900 500 through admission fee visitors asked call 3123774 xxrep 3 0 make appointment library time lending library patrons study space,pseudoscience
2,xxbos continuing browse our site agree our cookies revised privacy policy terms service agree group nations including declaring digital vaccine read more future headed into world where everyone will augmented technology lower class controlled read more evidence shows unprocessed red meat health risk however vilification natural raw meat products raises potential during victory gardens produced produce many now rediscovering many benefits gardening offers fresh inventory control plan just about everything earth land water minerals plants animals food energy genetically engineered foods grain crops like wheat oats barley common sources exposure toxic chemical linked even small amount just these vitamins risk dying even most severe cases drops like rock fact before musk bought twitter social media giant suspended 11 xxrep 3 0 accounts removed 1 xxrep 5 0 pieces content violating its help medical records more than 1 xxrep 3 0 health care workers interacting directly patients researchers concluded world experiencing,pseudoscience


In [None]:
torch.cuda.empty_cache()

In [None]:
learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)
learn.fine_tune(4, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.580649,0.391814,0.84669,00:34


epoch,train_loss,valid_loss,accuracy,time
0,0.475741,0.460909,0.839721,01:08
1,0.452111,0.252557,0.909408,01:10
2,0.359433,0.267908,0.885017,01:10
3,0.282798,0.250456,0.909408,01:11


In [None]:
check_gpu()

CUDA Available:  True
Device 0 | NVIDIA GeForce RTX 3050 Ti Laptop GPU | Allocated: 0.4 GB | Cached: 2.2 GB


In [None]:
learn.show_results()

Unnamed: 0,text,category,category_
0,xxbos 365 369 373 xxunk xxunk xxunk xxrep 10 0 xxunk xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep 5 0 xxrep,pseudoscience,pseudoscience
1,xxbos 2022 before musk bought twitter social media giant suspended 11 xxrep 3 0 accounts removed 1 xxrep 5 0 pieces content violating its misinformation policy between 2020 2022 alone now musk owns has made 2022 help medical records more than 1 xxrep 3 0 health care workers interacting directly patients researchers concluded medical masks provide little benefit over respirators specifically researchers 2022 world experiencing strongest decline birth rates over 100 years its unprecedented figure yet except laypersons like educators independent researchers statisticians true health official making comments 2022 more than 230 million having received least two shots first time since shots came more vaccinated than persons dying infection just those eligible new boosters 2022 effort discredit thoroughly sourced article risks benefits treatment goes great lengths literally prove pill does indeed prevent deaths 2022 now he s finally under deposition role pandemic censorship went national institute allergy infectious diseases director,pseudoscience,pseudoscience
2,xxbos guide legal matters 2017 federal court ordered aka belle pay engaging unconscionable conduct relating sale her book app whole pantry described below court found falsely stated been diagnosed brain cancer given only four months live after rejecting standard treatment cured herself natural methods portion proceeds book described her methods would donated charities court also ordered pay costs consumer affairs which initiated case against her 2018 reported paid money consumer affairs seeking court order enable xxunk her xxunk court consumer law xxunk 247 xxunk first respondent road xxunk ltd xxunk xxunk xxunk xxunk second respondent misleading deceptive conduct consumer law consumer law engaged conduct trade commerce which misleading deceptive likely mislead deceive contrary consumer law approximately 2013 made claims connection development promotion sale whole pantry app whole pantry book second respondent engaged conduct trade commerce which misleading deceptive likely mislead deceive contrary consumer law consumer law about 2013 made claims,science,pseudoscience
3,xxbos children today previous generations proof news coverage see every day site shows what s happening schools around increasingly disabled chronically ill education system has accommodate them long associated autism like sensory issues repetitive behaviors lack social skills now problems affecting mainstream students blame predictably placed bad parenting otherwise known trauma home addressing mental health needs important academics modern educators unrecognized disaster here about children who ca nt learn behave like children always been expected childhood has become chilling portent future mankind join our mailing list never miss update thanks message sent medical center host sensory friendly holiday event schools unable meet diverse levels debt new autism school open sped workers parents want meeting xxunk xxunk school marks 11th anniversary sensory room opening parents take diagnose dump treatment autism cape sensory friendly safe place watch parade lights special high school add autism places increasing demand new suspensions disproportionately affect students,pseudoscience,pseudoscience
4,xxbos can not manufacture any products come oil which supporting eight billion now planet conference held attracted global elites more than four hundred private jets attendees recognize climate change occurring like has four billion years seems most lacked basic energy literacy starts knowledge renewable energy only intermittent electricity generated unreliable breezes sunshine wind turbines solar panels can not manufacture anything billion planet xxunk xxunk facts like wind turbines solar panels can not manufacture any oil derivatives basis thousands products foundation societies economies around world fossil fuel products reasons world xxunk billion less than two hundred years much world leaders wish rid world emissions fossil fuels world has yet identify replacement oil derivatives basis more than 6 xxrep 3 0 products fuels our various transportation infrastructures cop attendees should also know crude oil useless unless manufactured into something xxunk like fuels xxunk transportation infrastructures ships jets derivatives make more than 6,pseudoscience,pseudoscience
5,xxbos exchange consists seven posts comment reply plus post translation mostly google translate original 2020 posts comments kindly corrected translation welcome comment xxunk thumb keep mind posts these xxunk comments written two years ago thus can not take more recent comments here into account hope publish sections seven consecutive chose day week without remembering its connection ancient days readers these authors invited contribute further comments our comment system weeks post starts introduction followed first posts comment followed xxunk two years ago professors informatics respectively published article titled using statistical methods model molecular machines systems journal theoretical biology regular scientific intelligent design community publication seen breakthrough close reading paper however convincing seven blog posts blog run staff institute biology environmental science university therefore detailed critique four specific points also paper could published article title using statistical methods model well xxunk article clearly statistical methods been used demonstrate cell however calculations presented,science,science
6,xxbos food babe whenever enter conventional grocery store get heart xxunk might think kidding about body gets heated face starts xxunk end saying about times before leave store think now would emotions under control think why passionate about fixing food system know too because keep emailing commenting xxunk xxunk things seeing ca nt thank enough investigation been asking here new food babe definitely want read before ever deli counter again received emails comments social media expressing been led believe boars head deli meat than brands wanted know really better just got really good marketing compromise elsewhere sure makes sound like add cheap additives their deli meats really emails would love eat boars head deli meats xxunk disease seemed leading company deli meats called company directly ingredient lists favorite deli meats discovered caramel coloring which know good lot their offerings could food babe investigate further start campaign get them take offending,pseudoscience,pseudoscience
7,xxbos google has declared war independent media has begun blocking emails getting our readers recommend free uncensored email receiving service free encrypted email send receive service okay continue address 2022 huff tags big tech brainwashed computing conspiracy deception deep state musk glitch globalist narrative information technology mind control propaganda psychological operations social media tech giants technocrats article may contain statements reflect opinion author natural news portion twitters user base propagandists pushing psychological operations ops according musk new head twitter empire amount pro ops twitter ridiculous musk tweeted adding joke least new verified will pay privilege related 2014 musk warned artificial intelligence has potential completely wipe humanity later asked explain nature these ops musk clarified most them appear rather than professional adding its mostly basic simply put operation usually led state actors such military aims disseminate propaganda purpose psychological warfare manipulation targets thoughts beliefs unclear specifically musk referring pro ops only,pseudoscience,pseudoscience


In [None]:
test_sources = {'https://infowarslife.com/':'pseudoscience',
'https://www.bbc.com/news/':'science',
'https://www.dailymail.co.uk/':'pseudoscience',
'https://www.si.edu/explore/science':'science',
'https://www.foxnews.com/opinion':'pseudoscience',
'https://www.disclose.tv/':'pseudoscience',
'https://www.snopes.com/top/':'science',
'https://www.theskepticsguide.org/about':'science',
'https://www.cdc.gov/':'science',
'https://www.motherjones.com/':'pseudoscience',
'https://www.huffpost.com/':'pseudoscience',
'https://arstechnica.com/':'science',
'https://nationalreport.net/':'pseudoscience',
'https://newspunch.com/':'pseudoscience'}

In [None]:
d_pred = {}

for source in test_sources:
    page = get_page_all(source, k, max_words, ignore_words)
    length = len(page.cleaned_text)
    if  length < min_words:
        print("ERROR:",source,length,"words")
    else:
        common_words = ' '.join([count[0] for count in page.most_common_words])
        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            prediction = learn.predict(text)
        if prediction[0] == "science":
            p = prediction[2][1].item()
        else:
            p = prediction[2][0].item()
        d_pred[source] = [test_sources[source], prediction[0], p]

df = pd.DataFrame.from_dict(d_pred, orient='index', columns=['actual', 'prediction', 'probability'])
df

Unnamed: 0,actual,prediction,probability
https://infowarslife.com/,pseudoscience,pseudoscience,0.701568
https://www.bbc.com/news/,science,science,0.834534
https://www.dailymail.co.uk/,pseudoscience,pseudoscience,0.986453
https://www.si.edu/explore/science,science,science,0.80199
https://www.foxnews.com/opinion,pseudoscience,pseudoscience,0.958947
https://www.disclose.tv/,pseudoscience,pseudoscience,0.981773
https://www.snopes.com/top/,science,science,0.983805
https://www.theskepticsguide.org/about,science,science,0.800767
https://www.cdc.gov/,science,science,0.985176
https://www.motherjones.com/,pseudoscience,science,0.903082


In [None]:
#learn.save('2022.11.29 Model v2')

In [None]:
#learn = load_learner('models/2022.11.28 Model.pth', cpu=False)