In [7]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# import all dependencies at the top
from time import time
from time import sleep
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from bs4 import BeautifulSoup
import requests

import csv
import io

################################################################################
# define a function to process the page
def process_page(soup, questions):
  """Append one dictionary per question summary found in *soup* to *questions*.

  Args:
    soup: Parsed page. Only its CSS-selector ``select`` method is used here;
      each selected element must offer ``select`` and ``select_one``.
    questions: List that is extended in place. Every appended dictionary has
      the keys ``title`` (str), ``excerpt`` (str, whitespace-stripped) and
      ``tags`` (list of str).

  Returns:
    None. The result is delivered through *questions*.

  A summary element without a title link is skipped, and a missing excerpt is
  stored as an empty string, so a single irregular entry does not abort the
  whole page with an ``AttributeError``.
  """
  # find all elements with class *summary*
  for summary in soup.select('.summary'):
    title_link = summary.select_one('h3 > a')
    if title_link is None:
      # select_one returns None when nothing matches; without a title there
      # is nothing meaningful to record for this element
      continue

    excerpt_node = summary.select_one('.excerpt')
    excerpt = excerpt_node.get_text().strip() if excerpt_node is not None else ''

    questions.append({
      'title': title_link.get_text(),
      'excerpt': excerpt,
      'tags': [tag.get_text() for tag in summary.select('.tags a')],
    })


################################################################################
# Sign in through Colab, then build the PyDrive client from the application
# default credentials. Doing this once per notebook is enough.
auth.authenticate_user()
application_credentials = GoogleCredentials.get_application_default()

gauth = GoogleAuth()
gauth.credentials = application_credentials
drive = GoogleDrive(gauth)

################################################################################
# prepare for the monitoring logic
start_time = time() # note the system time when the program starts
request_count = 0 # track the number of requests made
elapsed_time = 0.0 # defined up front so the final summary works even if no request is made

# create variables to store the data
questions = []

# variables to handle the request loop
has_next_page = True
MAX_REQUESTS = 3 # do not request more than this many pages
REQUEST_TIMEOUT = 10 # seconds to wait for the server before the request raises instead of hanging
page_number = 1
url = 'https://stackoverflow.com/questions'
headers = {'user-agent': 'questionscraper - school project (myeamail@gmail.com)'}

while has_next_page and request_count < MAX_REQUESTS:
  # keep the output clear
  clear_output(wait = True)

  # build the query for the page we are about to fetch; building it once
  # before the loop froze 'page' at 1, so every request re-fetched page 1
  query = {'tab':'newest', 'page': page_number}

  # request the current page
  response = requests.get(url, params=query, headers=headers, timeout=REQUEST_TIMEOUT)

  # make sure we got a valid response
  if response.ok:
    # get the full data from the response
    data = response.text
    soup = BeautifulSoup(data, 'html.parser')
    process_page(soup, questions)

    # check for the next page
    # look for the presence of a link marked rel="next"
    next_button = soup.select('a[rel="next"]')
    has_next_page = len(next_button) > 0

  else:
    # display a warning if there are any problems
    warn('Request #: {}, Failed with status code: {}'.format(request_count, response.status_code))

  request_count += 1

  # go to sleep for a bit
  # we use a random number between 1 and 3 so
  # we can wait as long as 3 seconds before making the next request
  sleep(randint(1,3))

  # output some logs for monitoring
  elapsed_time = time() - start_time
  print('Requests: {}, Frequency: {} requests/s, {} questions processed.'.format(request_count, request_count/elapsed_time, len(questions)))

  # prepare for next iteration
  page_number += 1

print('Sraping complete')
# guard the division: elapsed_time is still 0.0 when the loop body never ran
frequency = request_count/elapsed_time if elapsed_time else 0.0
print('Requests: {}, Frequency: {} requests/s, {} questions processed.'.format(request_count, frequency, len(questions)))

################################################################################
# Serialise the collected questions as CSV in memory, then upload the text
# through the PyDrive client.

# column headings; these are the names of the properties in each question dictionary
fieldnames = ['title', 'excerpt', 'tags']

# in-memory text stream that receives the CSV rows
output = io.StringIO()
writer = csv.DictWriter(output, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(questions) # one row per question dictionary

# metadata for the new Drive file; the 'parents' id presumably names the
# destination folder (verify against the Drive account in use)
file_metadata = {
  'title': 'questions.csv',
  'parents': [{'id': '1XI3y4xvTJss3eKDItnBOGRzWixd5uvHI'}],
}

# Create & upload a text file.
uploaded = drive.CreateFile(file_metadata)
uploaded.SetContentString(output.getvalue())
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

Requests: 3, Frequency: 0.6028100063040868 requests/s, 45 questions processed.
Sraping complete
Requests: 3, Frequency: 0.6028100063040868 requests/s, 45 questions processed.
Uploaded file with ID 1H-cv2fxNwVknGHaMV_0wWXwdb6g7ArT-
