<a href="https://colab.research.google.com/github/ravi-prakash1907/A-tracking-of-COVID-19/blob/master/Data_Collection_by_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using Web Scraping

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

**From arXiv.org**

In [73]:
# Exploratory scrape: collect the subject tags of every result on one arXiv
# search-results page. Each result is an <li>; its subject tags live in a
# <div class="tags is-inline-block"> whose <span> children carry the full
# subject name in their `data-tooltip` attribute.
url = 'https://arxiv.org/search/?searchtype=all&query=cyber-security+AND+blockchain&abstracts=show&size=200&order='
req = requests.get(url, timeout=30)  # never hang the notebook on a stalled connection
req.raise_for_status()               # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(req.content, 'html.parser')
info = []

for tag in soup.find_all("li"):
  for div in tag.find_all("div"):
    # .get() returns None for a <div> without a class attribute, whereas
    # div['class'] would raise KeyError on such a tag.
    if div.get('class') == ['tags', 'is-inline-block']:
      info.append([s['data-tooltip']
                   for s in div.find_all("span")
                   if s.has_attr('data-tooltip')])

# number of tag groups found (one per matching <div>)
c = len(info)

info

[['Cryptography and Security'],
 ['Cryptography and Security', 'Distributed, Parallel, and Cluster Computing'],
 ['Computers and Society'],
 ['Cryptography and Security', 'Distributed, Parallel, and Cluster Computing'],
 ['Cryptography and Security', 'Distributed, Parallel, and Cluster Computing'],
 ['Networking and Internet Architecture'],
 ['Cryptography and Security', 'Systems and Control'],
 ['Cryptography and Security', 'Human-Computer Interaction'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Cryptography and Security'],
 ['Networking and Internet Architecture'],
 ['Cryptography and Security'],
 ['Cryptography and Security', 'Distributed, Parallel, and Cluster Computing'],
 ['Cryptography and Security'],
 

In [76]:
## scrape paper metadata from an arXiv search-results page
class infoScraper:
    """Fetch one arXiv search-results page and extract per-paper fields.

    The search is described by the instance attributes ``keywords``,
    ``searchType`` and ``size``; ``setTargetURL`` turns them into
    ``RequestURL`` and ``scrapePage`` stores the parsed page in ``htmlDump``.
    The ``get*`` helpers return lists of cleaned strings (or, for tags, lists
    of tag names) in page order.
    """

    def __init__(self):
        # Plain public attributes: callers may change them and then call
        # setTargetURL() to rebuild the request URL.
        self.baseURL = "https://arxiv.org/search/"  # to search using query
        self.keywords = 'cyber-security+AND+blockchain'   # query=
        self.searchType = 'abstract'                # searchtype=
        self.size = '200'                           # size=

        # eg: https://arxiv.org/search/?query=blockchain+cyber+attack&searchtype=abstract&abstracts=show&order=-announced_date_first&size=50
        self.RequestURL = None  # URL built by setTargetURL()
        self.htmlDump = None    # parsed page (BeautifulSoup) or None when nothing was scraped

        # Accepted `fetch` values for scrapeInfo(), mapped to the exact CSS
        # class list of the element that carries the information.
        self.attributes = {
            'titles': ['title', 'is-5', 'mathjax'],
            'authors': ['authors'],
            'abstracts': ['abstract', 'mathjax'],
            'dates': ['is-size-7'],
            'tags': ['tags', 'is-inline-block'],
            'resCount': ['title', 'is-clearfix']
        }

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    ## defining the query on the basis of the parameters
    def setTargetURL(self):
        """Build ``RequestURL`` from the current search attributes.

        Any previously scraped page is discarded because it belongs to the
        old URL. Always returns True.
        """
        self.RequestURL = (
            self.baseURL
            + '?' + 'query=' + self.keywords
            + '&' + 'searchtype=' + self.searchType
            + '&' + 'abstracts=' + 'show'
            + '&' + 'order=' + '-announced_date_first'
            + '&' + 'size=' + self.size
        )
        self.htmlDump = None  # removing previous scraped soup due to new url

        return True

    ## get response
    def scrapePage(self):
        """Download and parse the page at ``RequestURL``.

        The URL is built first if it has not been set yet.

        Returns:
            True when the page was fetched and parsed (``htmlDump`` then holds
            the parsed document); False when the HTTP request failed, in which
            case ``htmlDump`` is reset to None.
        """
        if self.RequestURL is None:
            self.setTargetURL()

        try:
            resp = requests.get(self.RequestURL, timeout=30)
            # An HTTP error page would otherwise be parsed as if it were results.
            resp.raise_for_status()
        except requests.RequestException:
            self.htmlDump = None
            return False

        self.htmlDump = BeautifulSoup(resp.content, 'html.parser')
        return True

    def getHTMLResp(self, disp=False):
        """Make sure a page has been scraped and optionally print it.

        Args:
            disp: when True, print the prettified HTML of the scraped page.

        Returns:
            True when a parsed page is available, False when scraping failed.
        """
        # scrapePage() stores the parsed page itself and only returns a status
        # flag, so its return value must not be written into htmlDump.
        if not self.htmlDump and not self.scrapePage():
            return False

        if disp:
            print(self.htmlDump.prettify())

        return True

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # fetched string cleaner
    def charCleaning(self, text):
        """Collapse every run of whitespace (including newlines) into a single
        space and trim both ends.

        Other characters are left untouched; in particular a literal ``|`` is
        kept (the previous character class ``[\\n|\\s]`` replaced it by a space).
        """
        return re.sub(r'\s+', ' ', text).strip()

    # the main bulk fetcher
    def scrapeInfo(self, fetch):
        """Extract one kind of information from the scraped page.

        Args:
            fetch: one of the keys of ``self.attributes``.

        Returns:
            False (after printing a message) when ``fetch`` is not a valid key.
            Otherwise a list: for ``'tags'`` one list of tag names per matching
            element, for ``'resCount'`` at most one cleaned heading string, and
            for every other key one cleaned string per matching paragraph.

        Raises:
            RuntimeError: when no page is available and scraping it fails.
        """
        ## validating the fetch param / class
        try:
            classes = list(self.attributes[fetch])
        except (KeyError, TypeError):
            print("Error occured!! Invalid argument encountered for fetch!\n")
            return False

        # Scrape lazily so the getters also work before an explicit scrapePage().
        if not self.htmlDump and not self.scrapePage():
            raise RuntimeError("No scraped page available: the request to %r failed" % (self.RequestURL,))

        ## info extraction
        info = []
        soup = self.htmlDump

        # Elements are matched on their exact class list. .get('class') yields
        # None for elements without a class attribute instead of raising KeyError.
        if fetch == 'resCount':  # result-count heading: first matching <h1> only
            for heading in soup.find_all("h1"):
                if heading.get('class') == classes:
                    info.append(self.charCleaning(heading.text))
                    break

        elif fetch == 'tags':  # for paper's tags
            for item in soup.find_all("li"):
                for div in item.find_all("div"):
                    if div.get('class') == classes:
                        tooltips = (span.get('data-tooltip') for span in div.find_all("span"))
                        # spans without a tooltip carry no tag name; skip them
                        info.append([tip for tip in tooltips if tip is not None])

        else:  # for results' info
            for item in soup.find_all("li"):
                for par in item.find_all("p"):
                    if par.get('class') == classes:
                        info.append(self.charCleaning(par.text))

        ## returning data
        return info

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    ## gathering
    def getTitles(self):
        """Return the cleaned title of every listed paper."""
        return self.scrapeInfo('titles')

    def getAuthors(self):
        """Return the cleaned author line of every listed paper."""
        return self.scrapeInfo('authors')

    def getAbstracts(self):
        """Return the cleaned abstract text of every listed paper."""
        return self.scrapeInfo('abstracts')

    def getPubDates(self):
        """Return the cleaned date line of every listed paper."""
        return self.scrapeInfo('dates')

    def getTags(self):
        """Return one list of subject-tag names per listed paper."""
        return self.scrapeInfo('tags')

    def getResultCounts(self):
        """Return a list holding the cleaned result-count heading, if present."""
        return self.scrapeInfo('resCount')
    


In [77]:
## build a pandas DataFrame out of the scraped results
class buildDataSet(infoScraper):
  """Collect the scraped paper fields into a pandas DataFrame.

  ``papersDF`` has the columns cover_date, title, authors, abstract and tags
  and is filled from the scraped page by ``populateDF``.
  """

  def __init__(self):
    super().__init__()
    self.papersDF = self.initPapersDF()

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  def initPapersDF(self):
    """Return a new, empty DataFrame with the paper columns."""
    return pd.DataFrame(columns=['cover_date', 'title', 'authors', 'abstract', 'tags'])

  def addRow(self, df, scopus_id, cover_date, title, authors, abstract, tags):
    """Return a copy of ``df`` with one paper appended as the last row.

    ``scopus_id`` is accepted for call compatibility but is not stored.
    """
    # Wrapping the record in a list makes a one-row frame and keeps a list
    # value (e.g. tags) inside a single cell.
    thisRow = pd.DataFrame([{'cover_date': cover_date,
                             'title': title,
                             'authors': authors,
                             'abstract': abstract,
                             'tags': tags}])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([df, thisRow], ignore_index=True)

  def populateDF(self, inform=False):
    """Scrape the results page and append its papers to ``papersDF``.

    Args:
      inform: when True, print a short success / failure message.

    Returns:
      True on success, False when scraping or assembling the table failed
      (``papersDF`` is left unchanged in that case).
    """
    try:
      # scraping url
      if self.scrapePage() is False:
        raise RuntimeError("the results page could not be scraped")

      # fetching data; lists of unequal length make pandas raise ValueError,
      # which is reported as a failure below
      scraped = pd.DataFrame({
        'cover_date': self.getPubDates(),
        'title': self.getTitles(),
        'authors': self.getAuthors(),
        'abstract': self.getAbstracts(),
        'tags': self.getTags(),
      })

      if self.papersDF.empty:
        # nothing to merge with; also avoids concatenating an empty frame
        self.papersDF = scraped
      else:
        # ignore_index keeps row labels unique across repeated populate calls
        self.papersDF = pd.concat([self.papersDF, scraped], ignore_index=True)
    except Exception as err:
      # Best-effort by design: report the failure through the return value.
      # KeyboardInterrupt / SystemExit are deliberately not swallowed.
      if inform:
        print("Some error occured!", err)
      return False

    if inform:
      print("Updated sucessfully!")
    return True

  def getDF(self):
    """Return ``papersDF``, populating it first when it has no rows."""
    if self.papersDF.shape[0] == 0:
      self.populateDF()

    return self.papersDF

In [78]:
## to be updated for custom url utility
# Build the dataset with the default search settings defined in infoScraper.
s = buildDataSet()
# getDF() populates the table (which scrapes the results page over the
# network) when it is still empty, then returns the scraper's own papersDF.
gotDF = s.getDF()

# preview the first rows
gotDF.head()

Unnamed: 0,cover_date,title,authors,abstract,tags
0,"Submitted 21 January, 2022; originally announc...",Blockchain-based Collaborated Federated Learni...,"Authors: Amir Afaq, Zeeshan Ahmed, Noman Haide...","Abstract: …Instead, the devices can connect to...","[Cryptography and Security, Distributed, Paral..."
1,"Submitted 19 January, 2022; originally announc...",Towards Situational Aware Cyber-Physical Syste...,"Authors: Sabah Suhail, Saif Ur Rehman Malik, R...",Abstract: The complexity of cyberattacks in Cy...,[Cryptography and Security]
2,"Submitted 16 January, 2022; originally announc...",Improving Privacy and Security in Unmanned Aer...,"Authors: Hardik Sachdeva, Shivam Gupta, Anushk...",Abstract: …serving as mobile hotspots. Althoug...,"[Cryptography and Security, Distributed, Paral..."
3,"Submitted 12 January, 2022; originally announc...",Everything You wanted to Know about Smart Agri...,"Authors: Alakananda Mitra, Sukrutha L. T. Vang...",Abstract: …shows that the world is lagging beh...,[Computers and Society]
4,"Submitted 30 November, 2021; originally announ...",A Blockchain-Enabled Incentivised Framework fo...,"Authors: Kathy Nguyen, Shantanu Pal, Zahra Jad...",Abstract: In recent years Industrial Control S...,"[Cryptography and Security, Distributed, Paral..."


---  

### More Cleaning

In [49]:
# cleaning pub year
def cleanDates(dates):
  """Extract the publication year from a scraped date line.

  The text is split on whitespace and the punctuation ``- ; . ,``; the
  largest purely numeric token is returned, so in
  'Submitted 21 January, 2022; ...' the year wins over the day of month.

  Args:
    dates: the date text. Non-string values are converted with str() first,
      so a year that was already cleaned passes through unchanged and the
      function can safely be applied twice.

  Returns:
    The largest number found as an int, or None when the text has no number.
  """
  tokens = re.split(r'[-\s;.,]+', str(dates))
  # isdecimal() only accepts characters int() can parse; isdigit() would also
  # accept e.g. superscript digits and make int() raise.
  numbers = [int(tok) for tok in tokens if tok.isdecimal()]

  return max(numbers) if numbers else None

In [50]:
# cleaning abstract
def cleanAbs(abs):
  """Return the abstract text without the page's expand/collapse residue.

  When the scraped text contains ' ▽ More ', only the part after its last
  occurrence is kept; anything from ' △ Less' on is dropped. A leading
  'Abstract:' label, which remains when the text has no ' ▽ More ' marker,
  is removed as well so every value starts with the abstract itself.
  """
  text = abs.rpartition(' ▽ More ')[2]
  text = text.partition(' △ Less')[0].strip()

  label = 'Abstract:'
  if text.startswith(label):
    text = text[len(label):].lstrip()

  return text

In [51]:
# cleaning authors
def cleanAuthor(auth):
  """Turn a scraped author line into a list of author names.

  Everything up to and including the last 'Authors: ' label is discarded
  (the whole text is kept when the label is absent) and the remainder is
  split on ', '.
  """
  _, _, names = auth.rpartition('Authors: ')
  return names.split(', ')

In [52]:
## updating table
# Each column is overwritten in place with its cleaned version, so gotDF
# itself changes here.
for _column, _cleaner in (('cover_date', cleanDates),
                          ('authors', cleanAuthor),
                          ('abstract', cleanAbs)):
  gotDF[_column] = gotDF[_column].apply(_cleaner)

gotDF.head(3)

Unnamed: 0,cover_date,title,authors,abstract
0,2022,Instantaneous and limiting behavior of an n-no...,"[Xiufeng Xu, Liang Hong]",We investigate the instantaneous and limiting ...
1,2022,Blockchain-based Collaborated Federated Learni...,"[Amir Afaq, Zeeshan Ahmed, Noman Haider, Muham...",Federated Learning (FL) provides privacy prese...
2,2022,Improving Privacy and Security in Unmanned Aer...,"[Hardik Sachdeva, Shivam Gupta, Anushka Misra,...","Unmanned Aerial Vehicles (UAVs), also known as..."


### Exporting

In [53]:
# Write the table to scrapedPapers.csv in the working directory; index=False
# leaves the row index out of the file.
gotDF.to_csv('scrapedPapers.csv', index=False)

---