In [1]:
import threading 
from queue import Queue
import feedparser
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
import requests
from lxml import html
import pickle
import datetime
import pandas as pd
from operator import itemgetter
from time import sleep

In [2]:
import re
def cleanhtml(raw_html):
    """Return ``raw_html`` with HTML tags and character entities removed.

    Removed spans are:
      * anything between ``<`` and the next ``>`` on the same line;
      * named entities such as ``&amp;`` or ``&Eacute;``;
      * decimal (``&#171;``) and hexadecimal (``&#xAB;`` / ``&#XAB;``)
        character references.

    Matching is case-insensitive so entities containing capital letters and
    upper-case hex digits are stripped as well.

    Parameters
    ----------
    raw_html : str or None
        Markup to clean. ``None`` is treated as an empty string.

    Returns
    -------
    str
        The text with every matched span deleted (not replaced by a space).
    """
    if raw_html is None:
        return ''
    cleanr = re.compile(
        r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        re.IGNORECASE,
    )
    return cleanr.sub('', raw_html)

In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import ui
from selenium.webdriver.support.ui import WebDriverWait, Select

from selenium.webdriver.firefox.options import Options
# Headless Firefox session shared by the scraping thread below.
# The "-headless" argument is used instead of the `options.headless`
# property: that property was deprecated and later removed in Selenium 4,
# where assigning to it no longer has any effect.
options = Options()
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)
# Fixed desktop-sized viewport for every page load.
driver.set_window_size(1920, 1080)

In [4]:
class ListeningRSS(threading.Thread):
    """Producer thread: polls RSS feeds and scrapes newly listed articles.

    Each round parses every feed in ``self.feeds`` with feedparser. Article
    URLs that are not yet in ``self.memory`` are opened with the module-level
    Selenium ``driver``, their metadata is extracted with BeautifulSoup and
    one dict per article is put on the job queue with the keys ``journal``,
    ``url``, ``title``, ``writers``, ``doi``, ``abstract``, ``keywords``,
    ``date`` and ``language``.

    ``self.memory`` maps a journal name to ``{article_url: first_seen}`` and
    is pickled to ``self.memoryname`` after every new URL.

    NOTE: ``run`` never checks ``stoprequest``, so ``join`` blocks for as long
    as the thread is alive. This is kept on purpose: the notebook's main cell
    calls ``join()`` right after ``start()``, and honouring the flag there
    would end polling after the first round.

    Parameters
    ----------
    job : queue.Queue-like
        Object whose ``put`` method receives the scraped article dicts.
    """

    # Maximum number of article URLs remembered per journal.
    MAX_REMEMBERED = 200
    # Pause between two polling rounds, in seconds (6 hours).
    POLL_INTERVAL = 21600
    # Formats tried, in order, for the content of the dc.Date meta tag.
    DATE_FORMATS = ("%d %b %Y", "%d %B %Y", "%Y-%m-%d")

    def __init__(self, job):
        threading.Thread.__init__(self)
        self.stoprequest = threading.Event()
        self.mur2job = job
        self.memory = {}
        self.memoryname = 'data/journal_all_taylor_and_francis_feeds.pickle'
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # ever point memoryname at a file written by this class.
        try:
            with open(self.memoryname, 'rb') as handle:
                self.memory = pickle.load(handle)
        except FileNotFoundError:
            # First run: nothing remembered yet.
            pass
        except Exception as exc:
            # Unreadable memory is not fatal, but should not go unnoticed.
            print("Could not load {}: {}".format(self.memoryname, exc))

        self.feeds = [
            ["https://www.tandfonline.com/feed/rss/tjom20", "Journal of Maps"],
            ['https://www.tandfonline.com/feed/rss/uaar20', 'Arctic, Antarctic, and Alpine Research']
        ]

    def join(self, timeout=None):
        """Set ``stoprequest`` and wait for the thread (see class note)."""
        self.stoprequest.set()
        super(ListeningRSS, self).join(timeout)

    def run(self):
        """Poll all feeds forever, queueing one dict per new article."""
        print(self.memory)
        while True:
            for feedl in self.feeds:
                feedurl = feedl[0]
                feed = feedparser.parse(feedurl)
                # add feed if it is not in the dictionary
                if feedurl not in self.memory:
                    self.memory[feedurl] = {'lasturl': ''}

                for item in feed['items']:
                    # Fall back to the configured feed name when the entry
                    # carries no dc:source element.
                    journal = item.get('dc_source') or feedl[1]
                    url = item.get('id')
                    if not url:
                        continue

                    # The URL is recorded before scraping, so an article whose
                    # page fails to load below is not retried later.
                    if not self._remember(journal, url):
                        continue

                    # visit the article page to get the other information
                    try:
                        driver.get(url)
                        content = driver.page_source
                    except Exception:
                        print(url)
                        continue

                    details = self._scrape(content)
                    if len(details['abstract']) == 0:
                        continue

                    # send data to loader
                    self.mur2job.put({
                        'journal': journal,
                        'url': url,
                        'title': item.get('title', ''),
                        'writers': details['writers'],
                        'doi': item.get('dc_identifier', ''),
                        'abstract': details['abstract'],
                        'keywords': details['keywords'],
                        'date': details['date'],
                        'language': details['language']
                    })

            # check every 6 hour
            print("Wait for next run")
            sleep(self.POLL_INTERVAL)

    def _remember(self, journal, url):
        """Record ``url`` for ``journal``; return False if it was already known.

        A new URL is stamped with the current time, the oldest entry is
        dropped once more than MAX_REMEMBERED are stored, and the whole
        memory is pickled to ``self.memoryname``.
        """
        import os

        seen = self.memory.setdefault(journal, {})
        if url in seen:
            return False

        seen[url] = datetime.datetime.now()
        # keep memory under control, so drop the oldest observation
        if len(seen) > self.MAX_REMEMBERED:
            oldest = min(seen, key=lambda key: (seen[key], key))
            seen.pop(oldest)

        folder = os.path.dirname(self.memoryname)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(self.memoryname, 'wb') as handle:
            pickle.dump(self.memory, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return True

    @classmethod
    def _parse_date(cls, text):
        """Return ``text`` as ``YYYY-MM-DD``, or '' if no known format matches."""
        cleaned = (text or '').strip()
        for fmt in cls.DATE_FORMATS:
            try:
                return datetime.datetime.strptime(cleaned, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return ''

    @staticmethod
    def _first_text(node):
        """Return the text of ``node``'s first child, or '' if it has none."""
        if not node.contents:
            return ''
        first = node.contents[0]
        if isinstance(first, str):
            return str(first)
        return first.get_text()

    def _scrape(self, content):
        """Extract abstract, keywords, date, language and writers from a page.

        ``content`` is the HTML source of an article page. The result is a
        dict of strings; fields absent from the page are ''.
        """
        page_soup = BeautifulSoup(content, features="lxml")

        # abstract: paragraphs are joined with a single space so the last
        # word of one paragraph does not fuse with the first of the next.
        paragraphs = []
        for abstdiv in page_soup.find_all("div", {'class': ['abstractInFull']}):
            for ap in abstdiv.find_all("p"):
                text = " ".join(ap.get_text().splitlines())
                text = text.replace("\xa0", " ").replace("ABSTRACT", '').strip()
                if text:
                    paragraphs.append(text)
        abstract = " ".join(paragraphs)

        # keywords, publication date and language come from <meta> tags
        keywords = ""
        date = ""
        language = ""
        for m in page_soup.find_all("meta"):
            name = m.get("name")
            value = m.get("content", "")
            if name == "dc.Subject":
                keywords = value.replace(";", "#")
            elif name == "dc.Date":
                date = self._parse_date(value)
            elif name == "dc.Language":
                language = value

        # writers: "name--overlay text", optionally followed by "---orcid".
        # Only author anchors that contain an overlay span produce an entry.
        writers = []
        for wa in page_soup.find_all("a", {'class': ['entryAuthor']}):
            name = self._first_text(wa)
            added = False
            for wai in wa.find_all("span", {'class': ['overlay']}):
                writers.append(name + "--" + self._first_text(wai))
                added = True
            if not added:
                # Without an entry for this anchor there is nothing to
                # attach an ORCID to.
                continue
            for wai in wa.find_all("span", {'class': ['orcid-author']}):
                writers[-1] = writers[-1] + "---" + wai.get_text()

        return {
            'abstract': abstract,
            'keywords': keywords,
            'date': date,
            'language': language,
            # Always a string, '' when no author was found.
            'writers': "#".join(writers),
        }
                        


In [5]:
class Feed_mur2(threading.Thread):
    """Consumer thread: appends queued article records to a CSV file.

    Records are dicts taken from ``job``. Each one becomes a row of
    ``self.df`` and the whole frame is rewritten to ``self.filename``
    (separator ``|``) after every record.

    The producer in this notebook sends the keys ``writers``, ``keywords``
    and ``date``, while the CSV columns are ``writer``, ``keyword`` and
    ``publishdate``; both spellings are accepted for those fields.

    Parameters
    ----------
    job : queue.Queue-like
        Object whose ``get(block, timeout)`` returns record dicts and raises
        ``queue.Empty`` when nothing arrives within the timeout.
    """

    # Seconds to block on the queue before re-checking the stop flag.
    POLL_SECONDS = 1.0

    def __init__(self, job):
        self.filename = 'data/journal_all_taylor_and_francis.csv'
        threading.Thread.__init__(self)
        self.stoprequest = threading.Event()
        self.job = job
        self.df = pd.DataFrame({'url': [], 'journal': [],
                                'title': [], 'writer': [],
                                'doi': [], 'abstract': [],
                                'keyword': [], 'publishdate': [],
                                'language': []})
        # try to read old df
        try:
            self.df = pd.read_csv(self.filename, sep="|")
            print(self.df.tail())
        except FileNotFoundError:
            # First run: start from the empty frame above.
            pass
        except Exception as exc:
            # Keep going with an empty frame, but report why.
            print("Could not read {}: {}".format(self.filename, exc))

    def join(self, timeout=None):
        """Ask ``run`` to stop once the queue is idle, then wait for it."""
        self.stoprequest.set()
        super(Feed_mur2, self).join(timeout)

    def run(self):
        """Store every queued record until a stop is requested.

        The stop flag is only checked when the queue is empty, so records
        already queued are still written before the thread exits.
        """
        from queue import Empty

        while True:
            try:
                task = self.job.get(True, self.POLL_SECONDS)
            except Empty:
                if self.stoprequest.is_set():
                    break
                continue
            self._store(task)

    @staticmethod
    def _pick(task, *names):
        """Return ``task``'s value for the first key in ``names`` it contains.

        Lists and tuples are joined with '#'; '' is returned when none of
        the keys is present.
        """
        for name in names:
            if name in task:
                value = task[name]
                if isinstance(value, (list, tuple)):
                    return "#".join(str(part) for part in value)
                return value
        return ''

    def _store(self, task):
        """Append ``task`` as one row of ``self.df`` and rewrite the CSV."""
        import os

        # Same order as the columns created in __init__.
        values = [
            self._pick(task, 'url'), self._pick(task, 'journal'),
            self._pick(task, 'title'), self._pick(task, 'writer', 'writers'),
            self._pick(task, 'doi'), self._pick(task, 'abstract'),
            self._pick(task, 'keyword', 'keywords'),
            self._pick(task, 'publishdate', 'date'),
            self._pick(task, 'language'),
        ]
        # Values are matched to the frame's columns by position.
        columns = list(self.df.columns)[:len(values)]
        row = pd.DataFrame([values[:len(columns)]], columns=columns)

        # DataFrame.append was removed in pandas 2.0; concat is the
        # replacement. An empty frame is replaced outright so its
        # placeholder dtypes do not leak into the result.
        if self.df.empty:
            self.df = row
        else:
            self.df = pd.concat([self.df, row], ignore_index=True)

        # save
        folder = os.path.dirname(self.filename)
        if folder:
            os.makedirs(folder, exist_ok=True)
        self.df.to_csv(self.filename, sep="|", index=False)

In [6]:
# Module-level lock; nothing in the visible cells acquires it.
threadLock = threading.Lock()

# Unbounded FIFO carrying scraped article records from producer to consumer.
jobs = Queue()

# Producer: polls the RSS feeds and scrapes new articles.
listenerRss = ListeningRSS(jobs)
listenerRss.start()

# Consumer: writes every queued record to the CSV file.
feedmur2 = Feed_mur2(jobs)
feedmur2.start()

threads = [listenerRss, feedmur2]

# join() each worker in start order. The RSS listener loops indefinitely,
# so this blocks until the kernel is interrupted.
for t in threads:
    t.join()

print("Exiting Main Thread")

{'https://www.tandfonline.com/feed/rss/tjom20': {'lasturl': ''}, 'Journal of Maps': {'https://www.tandfonline.com/doi/full/10.1080/17445647.2019.1685605?af=R': datetime.datetime(2020, 10, 27, 17, 23, 51, 872890), 'https://www.tandfonline.com/doi/full/10.1080/17445647.2019.1689373?af=R': datetime.datetime(2020, 10, 27, 17, 23, 55, 148792), 'https://www.tandfonline.com/doi/full/10.1080/17445647.2019.1689858?af=R': datetime.datetime(2020, 10, 27, 17, 23, 58, 348391), 'https://www.tandfonline.com/doi/full/10.1080/17445647.2019.1690595?af=R': datetime.datetime(2020, 10, 27, 17, 24, 2, 13132), 'https://www.tandfonline.com/doi/full/10.1080/17445647.2019.1692082?af=R': datetime.datetime(2020, 10, 27, 17, 24, 4, 998394), 'https://www.tandfonline.com/doi/full/10.1080/17445647.2019.1692700?af=R': datetime.datetime(2020, 10, 27, 17, 24, 7, 957268), 'https://www.tandfonline.com/doi/full/10.1080/17445647.2019.1698472?af=R': datetime.datetime(2020, 10, 27, 17, 24, 17, 650079), 'https://www.tandfonlin

Wait for next run
Wait for next run
Wait for next run


KeyboardInterrupt: 