In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import re
import time
import pandas as pd

In [2]:
URL = 'https://stackoverflow.com/questions'
FILTER = 'Votes'
PAGE_LIMIT = 10000

In [3]:
def build_url(base_url=URL, query_filter=FILTER, page=1):
    return f'{base_url}?tab={query_filter}&page={page}'

In [4]:
def scrape_page(page=1):
    """
    Function to scrape a single page in stack overflow
    """
    try:
        
        response = requests.get(build_url(page=page))
        page_questions = []
        ques_link = []
        soup = BeautifulSoup(response.text, 'html.parser')
        question_summaries = soup.find_all('div', class_='s-post-summary')

        for summary in question_summaries:
            for link in soup.find_all('a', attrs={'href': re.compile("^/questions/11227809/")}):
                ques_link = link.get('href')
            vote_count = summary.find(class_='s-post-summary--stats-item-number').text
            answer_count = summary.find(class_='has-answers').find(class_='s-post-summary--stats-item-number').text
            ques_title = summary.find(class_='s-link').text
            ques_summ = summary.find(class_='s-post-summary--content-excerpt').text
            keywords = summary.find(class_='s-post-summary--meta-tags').text.split(' ')
            date = summary.find(class_='s-user-card--time').text

            page_questions.append({
                'ques_link':ques_link,
                'vote_count':vote_count,
                'answer_count':answer_count,
                'ques_title':ques_title,
                'ques_summ':ques_summ,
                'keywords':keywords,
                'date':date
            })
    except AttributeError:
        print("There's no item with that code")
        
    return page_questions
        

In [5]:
def scrape():
    """
    Function to scrape to PAGE_LIMIT
    """
    questions = []
    for i in range(1, PAGE_LIMIT+1):
        page_questions = scrape_page(i)
#         time.sleep(1.2)
        questions.extend(page_questions)
    return questions

In [6]:
def extract_data():
    data = scrape()
    with open('extracted_data.csv', 'w', encoding='utf-8') as data_file:
        fieldnames = ['ques_link' , 'vote_count', 'answer_count', 'ques_title', 'ques_summ', 'keywords', 'date']
        data_writer = csv.DictWriter(data_file, fieldnames=fieldnames)
        data_writer.writeheader()
        for d in data:
            data_writer.writerow(d)
        print('Successfully Extracted')

In [7]:
# extract_data()

In [8]:
data = pd.read_csv('extracted_data.csv')

In [9]:
data.shape

(495842, 7)

In [10]:
data.head()

Unnamed: 0,ques_link,vote_count,answer_count,ques_title,ques_summ,keywords,date
0,/questions/11227809/why-is-processing-a-sorted...,26257,28,Why is processing a sorted array faster than p...,\r\r\n Here is a piece of C++ c...,"['\njava', 'c++', 'performance', 'cpu-architec...","asked Jun 27, 2012 at 13:51"
1,/questions/11227809/why-is-processing-a-sorted...,24017,98,How do I undo the most recent local commits in...,\r\r\n I accidentally committed...,"['\ngit', 'version-control', 'git-commit', 'un...","\r\n\r\r\n 89 revs, 60 user..."
2,/questions/11227809/why-is-processing-a-sorted...,18983,41,How do I delete a Git branch locally and remot...,\r\r\n I want to delete a branc...,"['\ngit', 'version-control', 'git-branch', 'gi...","asked Jan 5, 2010 at 1:12"
3,/questions/11227809/why-is-processing-a-sorted...,13066,38,What is the difference between 'git pull' and ...,\r\r\n What are the differences...,"['\ngit', 'version-control', 'git-pull', 'git-...","asked Nov 15, 2008 at 9:51"
4,/questions/11227809/why-is-processing-a-sorted...,11888,47,"What does the ""yield"" keyword do?",\r\r\n What is the use of the y...,"['\npython', 'iterator', 'generator', 'yield',...","asked Oct 23, 2008 at 22:21"


In [11]:
data.tail()

Unnamed: 0,ques_link,vote_count,answer_count,ques_title,ques_summ,keywords,date
495837,[],14,4,Update or reload UITableView after completion ...,\r\r\n I have a uitableview tha...,"['\nios', 'json', 'swift', 'uitableview\n']","asked Jun 8, 2016 at 4:59"
495838,[],14,2,Efficiently Computing Significant Terms in SQL,\r\r\n I was introduced to Elas...,"['\nsql', 'elasticsearch', 'query-optimization...","asked Jun 2, 2016 at 17:03"
495839,[],14,2,How to encrypt docker images or source code in...,\r\r\n Say I have a docker imag...,"['\nencryption', 'docker\n']","asked Jun 2, 2016 at 7:09"
495840,[],14,1,Why do browsers allow CSRF?,\r\r\n I am pretty new to web s...,"['\nsecurity', 'web', 'csrf', 'csrf-protection...","asked Jun 1, 2016 at 4:59"
495841,[],14,2,Keyframes with Inline Styles ReactJS,\r\r\n I'm trying to set the ke...,"['\nhtml', 'css', 'reactjs\n']","asked May 25, 2016 at 22:54"
