## Web Crawler 
This is a simple web crawler that scrapes the text of the top Google search results for a query, so that we can pick any topic, quickly build a dataset from it, and use that dataset to train our model.

In [11]:
import re
from urllib.parse import unquote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
#pip install pyyaml ua-parser user-agents fake-useragent

class crawler(object):
    """Small Google-search crawler that turns a query into a DataFrame of page text.

    ``fetch_urls`` scrapes result links from a Google results page,
    ``my_scraper`` downloads one page and extracts the text of its ``<p>`` tags,
    and ``create_dataframe`` combines both into one row per result URL.
    """

    # Seconds to wait on any single HTTP request, so that one unresponsive
    # site cannot stall the whole crawl.
    request_timeout = 10

    def my_scraper(self, tmp_url_in):
        """Download ``tmp_url_in`` and return the cleaned text of its paragraphs.

        The text of every ``<p>`` tag is joined with spaces and each run of
        non-word characters is collapsed to a single space.

        Returns:
            str: the cleaned text, or ``''`` when the page could not be
            downloaded or parsed (the failure is printed, not raised, so a
            single bad URL does not abort a crawl).
        """
        try:
            content = requests.get(tmp_url_in, timeout=self.request_timeout)
            soup = BeautifulSoup(content.text, 'html.parser')
            paragraphs = ' '.join(tag.text for tag in soup.find_all('p'))
        except Exception as err:
            # `Exception` rather than a bare except: a KeyboardInterrupt must
            # still be able to stop a long-running crawl.
            print('could not scrape', tmp_url_in, '-', err)
            return ''

        # \W+ also matches the non-breaking space (\xa0), so no separate
        # substitution is needed for it.
        return re.sub(r'\W+', ' ', paragraphs)

    def fetch_urls(self, query, cnt):
        """Return the target URLs found on a Google results page for ``query``.

        Args:
            query: search terms; URL-encoded by ``requests``.
            cnt: value sent as Google's ``num`` parameter (results requested).

        Returns:
            list[str]: decoded target URLs, in page order. Results lacking a
            link, title or description block, and links that are not of the
            ``/url?q=<target>&sa...`` form, are skipped.
        """
        ua = UserAgent()

        # NOTE(review): the random User-Agent is sent as a *query parameter*,
        # exactly as the original positional argument did, not as an HTTP
        # header. Sending it as a header would probably make Google return
        # different markup, in which case the CSS classes used below may stop
        # matching - confirm before changing.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": query, "num": cnt, "User-Agent": ua.random},
            timeout=self.request_timeout,
        )
        soup = BeautifulSoup(response.text, "html.parser")

        links = []
        for result in soup.find_all('div', attrs={'class': 'ZINbbc'}):
            link = result.find('a', href=True)
            title = result.find('div', attrs={'class': 'vvjwJb'})
            description = result.find('div', attrs={'class': 's3v9rd'})

            # A result only counts when all three elements are present.
            if link is None or title is None or description is None:
                print('something is missing!')
                continue

            if title.get_text() != '' and description.get_text() != '':
                links.append(link['href'])

        clean_links = []
        for href in links:
            # Result links look like /url?q=<percent-encoded target>&sa=...;
            # anything that does not fit that pattern is dropped.
            clean = re.search(r'/url\?q=(.*?)&sa', href)
            if clean is None:
                continue
            # Decode the percent-encoding so the URL can be requested as-is.
            clean_links.append(unquote(clean.group(1)))

        return clean_links

    def create_dataframe(self, my_query, the_cnt_in):
        """Crawl the results for ``my_query`` into a DataFrame.

        Returns:
            pandas.DataFrame: columns ``body_basic`` (cleaned page text, ``''``
            when scraping failed) and ``url``, one row per result URL with a
            0..n-1 index. The columns exist even when no URL was found.
        """
        # Collect all rows first and build the frame once; DataFrame.append
        # was removed in pandas 2.0 and copied the frame on every call.
        records = [
            {'body_basic': self.my_scraper(url), 'url': url}
            for url in self.fetch_urls(my_query, the_cnt_in)
        ]
        return pd.DataFrame(records, columns=['body_basic', 'url'])

In [30]:
# Crawl up to `num_docs` result URLs per query (via crawler.create_dataframe) and
# keep one DataFrame per query with the url, its cleaned text and a word count.
the_query = ['Home Team Science and Technology Agency', 'Question and Answering System']
num_docs =  100
min_words = 50  # we only want articles where clean text is more than this many words
my_func = crawler()
dataframe_list = []
for item in the_query:
    print('searching for ', item)
    dataframe = my_func.create_dataframe(item, num_docs)
    if dataframe.empty:
        # Nothing was crawled for this query; still append the empty frame so
        # that dataframe_list[i] keeps lining up with the_query[i].
        dataframe_list.append(dataframe)
        continue
    # .copy() so that adding the 'length' column does not write into a slice
    # of the crawled frame (avoids pandas' SettingWithCopyWarning).
    dataframe = dataframe[dataframe.body_basic != ''].copy()
    # split() without an argument ignores leading/trailing whitespace, so
    # empty strings are not counted as words.
    dataframe['length'] = dataframe.body_basic.str.split().str.len()
    dataframe = dataframe[dataframe.length > min_words]
    dataframe_list.append(dataframe)

searching for  Question and Answering System
something is missing!
something is missing!
something is missing!
something is missing!
something is missing!


In [26]:
# dataframe_list[0] holds the articles for the first entry of the_query
# ('Home Team Science and Technology Agency'); preview its first rows.
htx = dataframe_list[0]
htx.head()

Unnamed: 0,body_basic,url,length
1,The Home Team Science and Technology Agency ab...,https://en.wikipedia.org/wiki/Home_Team_Scienc...,181
3,1 Stars Avenue 12 01 Singapore 138507 1 Stars ...,https://www.sgdi.gov.sg/ministries/mha/statuto...,59
4,Copyright Ministry of Health Singapore All Ri...,https://www.healthhub.sg/directory/16/67277/ho...,166
5,The Home Team keeps Singapore safe and secure ...,https://www.mha.gov.sg/who-we-are,756
6,463 Restricted Client This resource is not ava...,https://sso.agc.gov.sg/Act/HTSTAA2019,72


In [34]:
# dataframe_list[1] holds the articles for the second entry of the_query
# ('Question and Answering System'); preview its first rows.
qa = dataframe_list[1]
qa.head()

Unnamed: 0,body_basic,url,length
0,Question answering QA is a computer science di...,https://en.wikipedia.org/wiki/Question_answering,1596
1,Recently I came across this library that enabl...,https://www.section.io/engineering-education/q...,961
2,Please contact us via our support center for ...,https://www.sciencedirect.com/science/article/...,93
3,Sign in Alvira Swalin May 23 2018 9 min read F...,https://towardsdatascience.com/building-a-ques...,1679
4,but your activity and behavior on this site m...,https://iopscience.iop.org/article/10.1088/175...,72


In [29]:
# Export only the article text of the HTX frame, without index column or header row.
htx.to_csv('htx_articles.csv', columns=['body_basic'], index=False, header=False)

In [35]:
# Export only the article text of the QA frame, without index column or header row.
qa.to_csv('qa_articles.csv', columns=['body_basic'], index=False, header=False)