In [None]:
import threading 
from queue import Queue
import feedparser
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
import requests
from lxml import html
import pickle
import datetime
import pandas as pd
from operator import itemgetter
from time import sleep
import random

In [None]:
import re
def cleanhtml(raw_html):
    """Remove markup tags and character entities from a string.

    Everything of the form ``<...>`` (non-greedy, within one line) and every
    character reference (``&name;``, ``&#123;``, ``&#x1f;``) is deleted.
    Entities are dropped, not decoded: ``&amp;`` becomes an empty string.

    Args:
        raw_html: text that may contain HTML/XML markup.

    Returns:
        The text with tags and entities removed.
    """
    # IGNORECASE so that mixed-case names (&Auml;, &Omega;) and upper-case
    # hex references (&#X41;) are stripped as well as the lower-case ones.
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
                        re.IGNORECASE)
    return cleanr.sub('', raw_html)

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import ui
from selenium.webdriver.support.ui import WebDriverWait, Select

from selenium.webdriver.firefox.options import Options
# Headless Firefox session shared by the scraping code below.
options = Options()
# Pass the command-line flag directly instead of assigning `options.headless`:
# that property is deprecated in newer Selenium 4 releases and later removed,
# while add_argument() is available in both old and new versions.
options.add_argument("-headless")
driver = webdriver.Firefox(options=options)
# Fixed desktop-sized viewport for the pages that get loaded.
driver.set_window_size(1920, 1080)

In [None]:
# Journal/publisher key. It is inserted into the data file names built by the
# worker classes ('data/journal_all_<jname>_feeds.pickle' and
# 'data/journal_all_<jname>.pandas').
jname = "iop"

In [None]:
class ListeningRSS(threading.Thread):
    """Producer thread: polls journal RSS feeds and queues new articles.

    The feed list is read from ``journal_all_<jname>_rss.csv`` (``|``
    separated; the columns ``url``, ``eissn``, ``pissn`` and ``category`` are
    used). For each feed item whose link has not been seen before, the article
    page is opened with the module-level Selenium ``driver``, the abstract and
    the ``<meta>`` information are scraped, and one dict per article is put on
    the job queue.

    ``self.memory`` maps a journal name to ``{article_url: time_recorded}``.
    It is pickled to ``self.memoryname`` every time a URL is recorded, so
    already handled articles are skipped after a restart.
    """

    # Upper bound of remembered article URLs per journal.
    MAX_URLS_PER_JOURNAL = 200

    def __init__(self, job):
        """Load the persisted URL memory and the list of feeds.

        Args:
            job: queue-like object; every scraped article dict is handed over
                with ``job.put(...)``.

        Raises:
            Whatever ``pandas.read_csv`` raises when the feed list is missing
            or unreadable.
        """
        threading.Thread.__init__(self)
        self.stoprequest = threading.Event()
        self.mur2job = job
        self.memory = {}
        self.memoryname = 'data/journal_all_'+jname+'_feeds.pickle'
        # NOTE(security): unpickling can execute arbitrary code. This file
        # must only ever be the one written by _remember() below.
        try:
            with open(self.memoryname, 'rb') as handle:
                self.memory = pickle.load(handle)
        except FileNotFoundError:
            # First run: nothing has been remembered yet.
            pass
        except Exception as exc:
            # Unreadable memory file: start with an empty memory, but say so.
            print("Could not load feed memory:", self.memoryname, exc)

        # The feed list uses the same journal key as the memory file.
        feeds = pd.read_csv('journal_all_'+jname+'_rss.csv', sep="|")
        self.feeds = feeds.to_dict('records')

    def join(self, timeout=None):
        """Set ``stoprequest`` and then wait for the thread like ``Thread.join``.

        ``run`` does not poll ``stoprequest``, so without a timeout this call
        blocks for as long as the polling loop is alive.
        """
        self.stoprequest.set()
        super(ListeningRSS, self).join(timeout)

    def run(self):
        """Poll all feeds, then wait 6-12 hours, forever.

        NOTE: ``stoprequest`` is not polled here. The notebook calls
        ``join()`` directly after ``start()``, so reacting to the flag would
        end the polling loop at once.
        """
        print(self.memory)
        while True:
            for feedl in self.feeds:
                self._poll_feed(feedl)

            # check every 6-12 hour
            print("Wait for next run")
            sleep(random.uniform(21600, 43200))

    def _poll_feed(self, feedl):
        """Parse one feed (a row of the feed list) and process its items."""
        feedurl = feedl['url']
        feed = feedparser.parse(feedurl)
        # Per-feed marker entry. It is stored in the same dict as the
        # per-journal entries and is kept so the pickled layout is unchanged.
        if feedurl not in self.memory:
            self.memory[feedurl] = {'lasturl': ''}
        for item in feed['items']:
            self._process_item(feedl, item)

    def _process_item(self, feedl, item):
        """Scrape one feed item and queue it, unless its link is already known."""
        journal = item['prism_publicationname']
        seen = self.memory.setdefault(journal, {})

        # get what is possible from the RSS
        url = item['link']
        if url in seen:
            return
        title = item['title']
        doi = item['prism_doi']

        # visit the article page to get the other information
        try:
            driver.get(url)
        except Exception:
            # The URL is deliberately not recorded: a failed page load is
            # retried on the next polling round instead of being lost.
            print("Error in url:", url)
            return
        self._remember(journal, url)

        page_soup = BeautifulSoup(driver.page_source, features="lxml")
        abstract = self._extract_abstract(page_soup)
        if len(abstract) == 0:
            print("No abstract", url)
            return

        self._act_like_reader()
        keywords, date, language, writers = self._extract_meta(page_soup, url)

        # send data to loader
        self.mur2job.put({
            'journal_title': journal,
            'url': url,
            'title': title,
            'writer': writers,
            'doi': doi,
            'abstract': abstract,
            'keyword': keywords,
            'publishdate': date,
            'language': language,
            'journal_eissn': feedl['eissn'],
            'journal_pissn': feedl['pissn'],
            'category': feedl['category']
        })

        # pause between two article pages
        sleep(random.uniform(150, 450))

    def _remember(self, journal, url):
        """Record ``url`` for ``journal``, drop the oldest entry if over the limit, and persist."""
        seen = self.memory[journal]
        seen[url] = datetime.datetime.now()
        # keep memory under control, so drop the oldest observation
        if len(seen) > self.MAX_URLS_PER_JOURNAL:
            seen.pop(min(seen, key=seen.get))
        with open(self.memoryname, 'wb') as handle:
            pickle.dump(self.memory, handle, protocol=pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def _extract_abstract(page_soup):
        """Concatenate the paragraph texts of all ``div.wd-jnl-art-abstract`` elements."""
        parts = []
        for abstdiv in page_soup.find_all("div", {'class': ['wd-jnl-art-abstract']}):
            for ap in abstdiv.find_all("p"):
                # join the lines with blanks and replace non-breaking spaces
                text = " ".join(ap.get_text().splitlines()).replace("\xa0", " ")
                parts.append(text.replace("Abstract", ''))
        return "".join(parts)

    @staticmethod
    def _act_like_reader():
        """Move the mouse, wait 20-120 s and press END so the visit looks less like a bot."""
        # Locator form of find_element: the find_element_by_* helpers were
        # removed in Selenium 4.3, this call works before and after.
        element = driver.find_element("tag name", "body")
        action = webdriver.ActionChains(driver)
        action.move_to_element(element)
        try:
            action.move_by_offset(random.uniform(0, 120), random.uniform(200, 500)).perform()
        except Exception:
            # best effort only, e.g. the target is outside the viewport
            pass
        sleep(random.uniform(20, 120))
        try:
            action.move_by_offset(random.uniform(0, 120), random.uniform(-100, 500)).perform()
        except Exception:
            pass
        element.send_keys(Keys.END)

    @staticmethod
    def _extract_meta(page_soup, url):
        """Read keywords, online date, language and authors from the ``<meta>`` tags.

        Args:
            page_soup: parsed article page.
            url: article URL, only used in the date error message.

        Returns:
            Tuple ``(keywords, date, language, writers)``. ``keywords`` and
            ``writers`` are ``#``-joined strings, an author is followed by
            ``--<institution>`` when an institution tag comes directly after
            the author tag, and ``date`` is ``YYYY-MM-DD`` or ``""``.
        """
        keywords = ""
        date = ""
        language = ""
        writers = []
        # True only while the previous named meta tag was an author, so an
        # institution is attached to the author it directly follows.
        after_author = False
        for m in page_soup.find_all("meta"):
            if not m.has_attr("name"):
                continue
            name = m['name']
            # a meta tag without content is treated as empty, not as an error
            content = m.get('content', '')
            if name == "dc.Subject":
                keywords = content.replace(";", "#")
            elif name == "citation_online_date":
                for fmt in ("%Y/%m/%d", "%Y-%m-%d"):
                    try:
                        date = datetime.datetime.strptime(content, fmt).strftime('%Y-%m-%d')
                        break
                    except (ValueError, TypeError):
                        continue
                else:
                    print("Datetime error:", content, "url:", url)
            elif name == "citation_language":
                language = content
            elif name == 'citation_author':
                writers.append(content)
            elif name == 'citation_author_institution' and after_author:
                writers[-1] = writers[-1]+"--"+content
            after_author = name == 'citation_author'
        return keywords, date, language, "#".join(writers)
                        


In [None]:
class Feed_mur2(threading.Thread):
    """Consumer thread: stores queued article dicts in a pickled DataFrame.

    Every task taken from the queue becomes one row of ``self.df``; after
    each row the whole frame is written to ``self.filename``.
    """

    # Keys read from every task; also the column layout of a new frame.
    COLUMNS = ['url', 'journal_title', 'title', 'writer', 'doi', 'abstract',
               'keyword', 'publishdate', 'language', 'category',
               'journal_eissn', 'journal_pissn']

    # Seconds one queue.get() waits before it is retried. A waiting get()
    # returns as soon as a task arrives, so this only limits idle wake-ups.
    POLL_TIMEOUT = 1.0

    def __init__(self, job):
        """Create the worker and load the previously stored frame, if any.

        Args:
            job: queue-like object whose ``get(block, timeout)`` returns
                article dicts containing the keys listed in ``COLUMNS``.
        """
        self.filename = 'data/journal_all_'+jname+'.pandas'
        threading.Thread.__init__(self)
        self.stoprequest = threading.Event()
        self.job = job
        self.df = pd.DataFrame({col: [] for col in self.COLUMNS})
        # try to read old df
        # NOTE(security): read_pickle unpickles the file; it must only ever
        # be the one written by _store() below.
        try:
            self.df = pd.read_pickle(self.filename)
            print(self.df.tail())
        except FileNotFoundError:
            # First run: keep the empty frame.
            pass
        except Exception as exc:
            # Unreadable file: continue with the empty frame, but say so.
            print("Could not load stored articles:", self.filename, exc)

    def join(self, timeout=None):
        """Set ``stoprequest`` and then wait for the thread like ``Thread.join``.

        ``run`` does not poll ``stoprequest``, so without a timeout this call
        blocks for as long as the worker loop is alive.
        """
        self.stoprequest.set()
        super(Feed_mur2, self).join(timeout)

    def run(self):
        """Take tasks from the queue forever and store each one."""
        from queue import Empty

        while True:
            try:
                task = self.job.get(True, self.POLL_TIMEOUT)
            except Empty:
                # nothing queued during this wait, try again
                continue
            self._store(task)

    def _store(self, task):
        """Append ``task`` as a new row and write the frame to disk."""
        # Build the row by key so values cannot end up in the wrong column
        # when a loaded frame has a different column order or set.
        row = pd.DataFrame([{col: task[col] for col in self.COLUMNS}])
        # DataFrame.append was removed in pandas 2.0; concat works in old
        # and new versions.
        self.df = pd.concat([self.df, row], ignore_index=True)
        # save
        self.df.to_pickle(self.filename)

In [None]:
# Lock object (not referenced again in this cell) and the unbounded queue
# that carries scraped articles from the RSS listener to the storage worker.
threadLock = threading.Lock()
jobs = Queue(maxsize=0)

# Create and start the workers one after the other: the RSS listener is
# already running when the storage worker gets constructed.
threads = []
for worker_cls in (ListeningRSS, Feed_mur2):
    worker = worker_cls(jobs)
    worker.start()
    threads.append(worker)
listenerRss, feedmur2 = threads

# Wait for all threads to complete
for t in threads:
    t.join()


print("Exiting Main Thread")