# **Install Package**

In [1]:
!pip install selenium
!apt-get update



You should consider upgrading via the 'd:\program file\python.exe -m pip install --upgrade pip' command.
'apt-get' is not recognized as an internal or external command,
operable program or batch file.


# **Import Module**

In [2]:
'''
file :  StackExchange_Webscraper.ipynb
last modified date: 8/6/2020
'''
import os
import requests
import bs4 as bs
from selenium import webdriver
import pandas as pd
import json
import time

# **Webscraper Class**

In [3]:
class StackWebscraper:
    """Scrape question titles, tags and leading posts from a Stack Exchange site.

    A Selenium-driven Chrome instance loads the pages, BeautifulSoup parses the
    rendered HTML, and the collected rows are stored in ``topicDataframe`` and
    written to a CSV file.

    The element lookups rely on specific markup (the ``mainbar`` id and the
    ``question-hyperlink``, ``post-tag js-gps-track`` and ``post-text`` class
    names). Pages on which a lookup finds nothing are skipped.
    """

    # Column names of the result table.
    _COLUMNS = ['Topic Title', 'Tags', 'Leading Comment']

    # Seconds to wait after loading a tag listing page / a question page.
    # The pauses throttle the scraper to avoid "too many requests" errors.
    LISTING_DELAY = 2
    QUESTION_DELAY = 3

    # Class-level defaults are kept so the attributes stay readable on the
    # class itself; __init__ gives every instance its own objects so that
    # instances do not share mutable state.
    driver = None
    topicDict = {}
    topicDataframe = pd.DataFrame(columns=_COLUMNS)

    def __init__(self, webdriverPath, headless=False):
        """Start a Chrome session.

        Args:
            webdriverPath: Path to the chromedriver executable.
            headless: When True, Chrome is started with ``--headless``.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')     # Ignore security certificates
        options.add_argument('--incognito')                     # Use Chrome in Incognito mode
        if headless:
            options.add_argument('--headless')                  # Run in background

        self.driver = self._create_driver(webdriverPath, options)
        self.topicDict = {}
        self.topicDataframe = pd.DataFrame(columns=self._COLUMNS)

    @staticmethod
    def _create_driver(webdriverPath, options):
        """Build the Chrome driver on both Selenium 4 and Selenium 3."""
        try:
            # Selenium 4 takes the driver location through a Service object.
            from selenium.webdriver.chrome.service import Service
            return webdriver.Chrome(service=Service(webdriverPath), options=options)
        except (ImportError, TypeError):
            # Selenium 3 has no ``service`` keyword; use the legacy argument.
            return webdriver.Chrome(executable_path=webdriverPath, options=options)

    def close(self):
        """Quit the browser and release the driver. Safe to call repeatedly."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @staticmethod
    def _tag_page_url(url, tag, page_number, page_size):
        """Return the URL of one "newest questions" listing page for ``tag``."""
        from urllib.parse import quote

        # Escape the tag so names such as 'c#' or 'c++' stay inside the path
        # instead of being read as a URL fragment or a space.
        return (url + '/questions/tagged/' + quote(tag, safe='')
                + '?tab=newest&pagesize=' + str(page_size)
                + '&page=' + str(page_number))

    def _fetch_soup(self, url, delay):
        """Load ``url`` in the browser, wait ``delay`` seconds, parse the body."""
        self.driver.get(url)
        time.sleep(delay)     # sleep in order to avoid too many requests error
        page = self.driver.execute_script('return document.body.innerHTML')
        return bs.BeautifulSoup(page or '', 'html.parser')

    def get_title(self, soup):
        """Return the question title text, or None if the page has no title link."""
        title = soup.find('a', class_='question-hyperlink')
        if title:
            return title.get_text()
        return None

    def get_tags(self, soup):
        """Return the list of tag names on a question page, or None if there are none."""
        # A multi-word class_ string is matched against the exact value of the
        # element's class attribute.
        post_tags = soup.find_all('a', class_='post-tag js-gps-track')
        if not post_tags:
            return None
        return [tag.get_text() for tag in post_tags]

    def get_leading_comment(self, soup):
        """Return the text of the question body, or None if no body is found.

        Only ``<p>`` elements are used, which leaves out code sections. The
        paragraph texts are concatenated without a separator.
        """
        leading_comment = soup.find('div', class_='post-text')
        if not leading_comment:
            return None
        return ''.join(elem.get_text() for elem in leading_comment.find_all('p'))

    def get_links_with_tags(self, url, tags_to_scrape, num_pages=12, page_size=50):
        """Collect the relative links of questions carrying the given tags.

        Args:
            url: Base URL of the site, without a trailing slash.
            tags_to_scrape: Iterable of tag names.
            num_pages: Number of listing pages to read per tag.
            page_size: Number of questions requested per listing page.

        Returns:
            List of unique ``href`` values in the order they were first seen.
        """
        links = []
        seen = set()     # O(1) duplicate check; ``links`` keeps the order

        for tag in tags_to_scrape:
            for page_number in range(1, num_pages + 1):
                curr_url = self._tag_page_url(url, tag, page_number, page_size)
                largesoup = self._fetch_soup(curr_url, self.LISTING_DELAY)

                mainbar = largesoup.find('div', id='mainbar')
                if mainbar is None:
                    # No question list on this page (for example an error or
                    # throttling page): skip it instead of crashing.
                    continue

                for elem in mainbar.find_all('a', class_='question-hyperlink'):
                    href = elem.get('href')
                    # Anchors without an href cannot be visited later.
                    if href and href not in seen:
                        seen.add(href)
                        links.append(href)
        return links

    def run_with_tags(self, url, tags_to_scrape, output_path='StackOverflow.csv',
                      num_pages=12):
        """Scrape every question found for the given tags and save the result.

        Questions missing a title, tags or a leading comment are skipped. The
        rows are stored in ``self.topicDataframe`` and written to
        ``output_path`` as CSV.

        Args:
            url: Base URL of the site, without a trailing slash.
            tags_to_scrape: Iterable of tag names.
            output_path: Destination of the CSV file.
            num_pages: Number of listing pages to read per tag.
        """
        titles = []
        tags = []
        comments = []

        links = self.get_links_with_tags(url, tags_to_scrape, num_pages=num_pages)

        for link in links:
            soup = self._fetch_soup(url + link, self.QUESTION_DELAY)

            title = self.get_title(soup)
            list_of_tags = self.get_tags(soup)
            leading_comment_text = self.get_leading_comment(soup)

            # keep only posts for which all three parts were found
            if not (title and list_of_tags and leading_comment_text):
                continue

            titles.append(title)
            tags.append(list_of_tags)
            comments.append(leading_comment_text)

        self.topicDataframe = pd.DataFrame({
            'Topic Title': titles,
            'Tags': tags,
            'Leading Comment': comments,
        })
        self.topicDataframe.to_csv(output_path)



In [4]:
if __name__=='__main__':
    # Local path to webdriver
    webdriverPath = "chromedriver.exe"

    # stackoverflow forum base URL
    baseURL = 'https://stackoverflow.com'

    # Tags to scrape. For a quick trial run, use a single tag, e.g. ['nlp'].
    tags_to_scrape = ['nlp', 'nltk', 'bert', 'word-embedding','text-classification', 'sentiment-analysis', 'tf-idf', 
                      'scikit-learn', 'text-mining', 'selenium', 'selenium-webdriver', 'web-scraping', 'splinter',
                      'beautifulsoup', 'scrapy']

    # Create Stackoverflow webscraping object (this launches Chrome)
    stackWebscraper = StackWebscraper(webdriverPath)

    try:
        # Run webscraping and save data
        stackWebscraper.run_with_tags(baseURL,tags_to_scrape)
    finally:
        # Always shut the browser down, even when scraping fails; otherwise
        # the Chrome and chromedriver processes keep running.
        stackWebscraper.driver.quit()
