In [None]:
import threading 
from queue import Queue
import feedparser
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
import requests
from lxml import html
import pickle
import datetime
import pandas as pd
from operator import itemgetter
from time import sleep
import random

In [None]:
import re
def cleanhtml(raw_html):
    """Return ``raw_html`` with markup tags and character entities removed.

    Two kinds of substrings are deleted:

    * anything from ``<`` up to the next ``>`` on the same line (non-greedy;
      ``.`` does not cross newlines);
    * ``&...;`` entities that are named with lowercase letters/digits,
      decimal (``&#169;``) or lowercase hexadecimal (``&#x1f;``).

    Entities are dropped, not decoded into the characters they stand for.
    """
    tag_or_entity = '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.sub(tag_or_entity, '', raw_html)

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import ui
from selenium.webdriver.support.ui import WebDriverWait, Select

from selenium.webdriver.firefox.options import Options
# Start a headless Firefox session for the notebook.
# The "-headless" command-line flag is used instead of the
# ``options.headless`` attribute: that attribute was deprecated in
# Selenium 4.8 and later removed, and on releases without it the assignment
# silently does nothing, so a visible browser window would be opened.
options = Options()
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)
# Fixed viewport size for the headless browser window.
driver.set_window_size(1920, 1080)

In [None]:
# Short tag for the publisher handled by this notebook and the version tag of
# the stored data; both are embedded in the on-disk file names built by the
# ListeningRSS and Feed_mur2 threads.
jname = 'iop'
version = "1.1.0"

In [None]:
class ListeningRSS(threading.Thread):
    """Producer thread that polls IOP journal RSS feeds for new articles.

    For every journal row of ``doaj_journals.csv`` whose publisher is
    "IOP Publishing", the thread fetches
    ``https://iopscience.iop.org/journal/rss/<eissn>``, turns each feed item
    it has not seen before into a metadata dict and puts that dict on the
    ``job`` queue. Seen article URLs are remembered per journal in
    ``self.memory`` and pickled to ``self.memoryname`` so that a restart does
    not emit them again.

    NOTE(review): ``join()`` sets ``stoprequest`` but ``run()`` never checks
    it, so the thread cannot currently be stopped and ``join()`` without a
    timeout blocks indefinitely. The launcher cell relies on that blocking.
    """

    # Upper bound on the number of remembered article URLs per journal.
    MAX_URLS_PER_JOURNAL = 200

    def __init__(self, job):
        """Load the persisted memory (if any) and the list of IOP journals.

        Args:
            job: queue-like object; ``put(dict)`` is called for each new article.
        """
        threading.Thread.__init__(self)
        self.stoprequest = threading.Event()
        self.mur2job = job
        self.memory = {}
        self.memoryname = 'data/journal_all_'+jname+'_'+version+'_feeds.pickle'
        try:
            # NOTE: pickle.load can execute arbitrary code; this file is
            # expected to be one previously written by _remember() below.
            with open(self.memoryname, 'rb') as handle:
                self.memory = pickle.load(handle)
        except FileNotFoundError:
            # First run: nothing has been remembered yet.
            pass
        except Exception as exc:
            # Best effort: start with an empty memory, but say why.
            print('Could not load feed memory from', self.memoryname, ':', exc)

        journals = pd.read_csv("doaj_journals.csv", sep="|")
        journals = journals[journals['publisher'] == "IOP Publishing"]
        self.feeds = journals.to_dict('records')

    def join(self, timeout=None):
        """Set the stop flag, then wait for the thread as ``Thread.join`` does."""
        self.stoprequest.set()
        super(ListeningRSS, self).join(timeout)

    @staticmethod
    def _parse_date(item):
        """Return the item's date as ``YYYY-MM-DD``, or ``None`` if unparsable.

        ``prism_coverdisplaydate`` (``%d/%B/%Y``) is tried first, then
        ``updated`` (``%Y-%m-%dT%H:%M:%SZ``).
        """
        candidates = (
            ('prism_coverdisplaydate', "%d/%B/%Y"),
            ('updated', "%Y-%m-%dT%H:%M:%SZ"),
        )
        for field, fmt in candidates:
            try:
                parsed = datetime.datetime.strptime(item[field], fmt)
            except (KeyError, TypeError, ValueError):
                # Field missing, not a string, or in another format.
                continue
            return parsed.strftime('%Y-%m-%d')
        print('Error processing date!')
        return None

    @staticmethod
    def _build_record(item, feedl, journal, url):
        """Build the metadata dict that is sent to the loader queue.

        Optional feed fields (title, DOI, description, author) default to an
        empty string so one incomplete item cannot kill the thread.
        """
        authors = item.get('author') or ''
        return {
            'journal_title': journal,
            'url': url,
            'title': item.get('title', ''),
            # Authors are stored '#'-separated: " and " and "," both become "#".
            'writer': authors.replace(" and ", ",").replace(",", "#"),
            'doi': item.get('prism_doi', ''),
            'abstract': item.get('description', ''),
            'keyword': "",
            'publishdate': ListeningRSS._parse_date(item),
            'language': "en",
            'journal_eissn': feedl['eissn'],
            'journal_pissn': feedl['pissn'],
            'category': feedl['categories']
        }

    def _remember(self, journal, url):
        """Record ``url`` as seen for ``journal``.

        Returns:
            ``True`` if the URL was new (the memory is then pruned to
            ``MAX_URLS_PER_JOURNAL`` entries and pickled), ``False`` if it
            had already been seen.
        """
        seen = self.memory.setdefault(journal, {})
        if url in seen:
            return False
        seen[url] = datetime.datetime.now()
        # Keep memory under control: drop the entry with the oldest timestamp.
        if len(seen) > self.MAX_URLS_PER_JOURNAL:
            seen.pop(min(seen, key=seen.get))
        with open(self.memoryname, 'wb') as handle:
            pickle.dump(self.memory, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return True

    def run(self):
        """Poll every journal feed forever, queueing each unseen article."""
        print(self.memory)
        while True:
            for feedl in self.feeds:
                eissn = feedl['eissn']
                # A missing CSV value arrives as NaN (a float), which cannot
                # be concatenated into the feed URL.
                if not isinstance(eissn, str) or not eissn:
                    print('Skipping journal without a usable eISSN:', eissn)
                    continue

                feedurl = 'https://iopscience.iop.org/journal/rss/'+eissn
                print(feedurl)
                feed = feedparser.parse(feedurl)
                # Per-feed placeholder entry; nothing in this class reads it.
                self.memory.setdefault(feedurl, {'lasturl': ''})

                for item in feed['items']:
                    journal = item.get('prism_publicationname')
                    url = item.get('link')
                    if not journal or not url:
                        # Both are needed as memory keys.
                        print('Skipping feed item without journal name or link')
                        continue

                    # Only articles that were not seen before are forwarded.
                    if not self._remember(journal, url):
                        continue

                    datadict = self._build_record(item, feedl, journal, url)
                    print(datadict)

                    # send data to loader
                    self.mur2job.put(datadict)

                    # random sleep between forwarded articles
                    sleep(random.uniform(150, 450))

            # random sleep
            sleep(random.uniform(150, 450))

            # check every 6-12 hours
            print("Wait for next run")
            sleep(random.uniform(21600, 43200))
                        


In [None]:
class Feed_mur2(threading.Thread):
    """Consumer thread that stores queued article records in a DataFrame.

    Each dict taken from the ``job`` queue is appended to ``self.df``,
    duplicate rows are dropped, and the frame is pickled to ``self.filename``
    after every record.

    NOTE(review): ``join()`` sets ``stoprequest`` but ``run()`` never checks
    it, so the thread cannot currently be stopped and ``join()`` without a
    timeout blocks indefinitely.
    """

    # Column layout of the stored frame; every queued record must have these keys.
    COLUMNS = ('url', 'journal_title', 'title', 'writer', 'doi', 'abstract',
               'keyword', 'publishdate', 'language', 'category',
               'journal_eissn', 'journal_pissn')

    def __init__(self, job):
        """Create an empty frame, then replace it with the saved one if present.

        Args:
            job: queue-like object whose ``get()`` yields record dicts.
        """
        self.filename = 'data/journal_all_'+jname+'_'+version+'.pandas'
        threading.Thread.__init__(self)
        self.stoprequest = threading.Event()
        self.job = job
        self.df = pd.DataFrame({column: [] for column in self.COLUMNS})
        # try to read old df
        try:
            self.df = pd.read_pickle(self.filename)
            print(self.df.tail())
            print(len(self.df))
        except FileNotFoundError:
            # First run: no saved frame yet.
            pass
        except Exception as exc:
            # Best effort: continue with the current frame, but say why.
            print('Could not load', self.filename, ':', exc)

    def join(self, timeout=None):
        """Set the stop flag, then wait for the thread as ``Thread.join`` does."""
        self.stoprequest.set()
        super(Feed_mur2, self).join(timeout)

    def _store(self, task):
        """Append one record to the frame, de-duplicate and save to disk.

        Values are looked up by column name, so they land in the right column
        whatever the column order of a previously saved frame is.

        Raises:
            KeyError: if ``task`` lacks one of ``COLUMNS``.
        """
        row = pd.DataFrame([{column: task[column] for column in self.COLUMNS}])
        if self.df.empty:
            # Avoid concatenating with an empty frame (dtype noise/warnings).
            self.df = row
        else:
            # DataFrame.append was removed in pandas 2.0; concat replaces it.
            self.df = pd.concat([self.df, row], ignore_index=True)

        # drop duplicates
        self.df = self.df.drop_duplicates()
        print(len(self.df))

        # save
        self.df.to_pickle(self.filename)

    def run(self):
        """Consume records from the queue forever, storing each one."""
        while True:
            # Block until a record is available; polling with a tiny timeout
            # would only burn CPU since nothing else happens in this loop.
            task = self.job.get()
            try:
                self._store(task)
            except Exception as exc:
                # One bad record must not kill the consumer, otherwise the
                # producer keeps filling the queue with nobody reading it.
                print('Could not store record:', exc)

In [None]:
# Launcher: wire the RSS listener (producer) and the DataFrame writer
# (consumer) together through one shared queue and start both threads.

# NOTE(review): threadLock is not used by any of the cells visible here.
threadLock = threading.Lock()
threads = []

# queue for the jobs (maxsize=0 means the queue is unbounded)
jobs = Queue(maxsize=0)
 
# Producer: constructed and started before the consumer is constructed.
listenerRss = ListeningRSS(jobs)
listenerRss.start()
threads.append(listenerRss)
 
# Consumer: takes the records the listener puts on the queue.
feedmur2 = Feed_mur2(jobs)
feedmur2.start()
threads.append(feedmur2)
 
# Wait for all threads to complete.
# NOTE(review): both run() methods loop `while True`, so the first join()
# blocks indefinitely and the print below is not reached in normal operation.
for t in threads:
    t.join()
 
 
print("Exiting Main Thread")