In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import re
import time
import pandas as pd
import urllib.request

In [2]:
URL = 'https://stackoverflow.com/questions'
FILTER = 'Votes'
PAGE_LIMIT = 100

In [3]:
def build_url(base_url=URL, query_filter=FILTER, page=1):
    return f'{base_url}?tab={query_filter}&page={page}'

In [4]:
# build_url()

In [9]:
def scrape_page(page=1):
    """
    Function to scrape a single page
    """
    try:
        
        response = requests.get(build_url(page=page))
        page_questions = []
        ques_link = []
        soup = BeautifulSoup(response.text, 'html.parser')
        question_summaries = soup.find_all('div', class_='s-post-summary')

        for summary in question_summaries:
            ques_link = summary.find(class_='s-post-summary--content-title').find_all('a', class_='s-link')
            vote_count = summary.find(class_='s-post-summary--stats-item-number').text
            answer_count = summary.find(class_='has-answers').find(class_='s-post-summary--stats-item-number').text
            ques_title = summary.find(class_='s-link').text
            ques_summ = summary.find(class_='s-post-summary--content-excerpt').text
            keywords = summary.find(class_='s-post-summary--meta-tags').text.split(' ')
            date = summary.find(class_='s-user-card--time').text

            page_questions.append({
                'ques_link':ques_link,
                'vote_count':vote_count,
                'answer_count':answer_count,
                'ques_title':ques_title,
                'ques_summ':ques_summ,
                'keywords':keywords,
                'date':date
            })
    except AttributeError:
        print("There's no item with that code")
        
    return page_questions
        

In [10]:
def scrape():
    """
    Function to scrape to PAGE_LIMIT
    """
    questions = []
    for i in range(1, PAGE_LIMIT+1):
        page_questions = scrape_page(i)
#         time.sleep(1.2)
        questions.extend(page_questions)
    return questions

In [11]:
def extract_data():
    data = scrape()
    with open('dataset/extracted_data.csv', 'w', encoding='utf-8') as data_file:
        fieldnames = ['ques_link' , 'vote_count', 'answer_count', 'ques_title', 'ques_summ', 'keywords', 'date']
        data_writer = csv.DictWriter(data_file, fieldnames=fieldnames)
        data_writer.writeheader()
        for d in data:
            data_writer.writerow(d)
        print('Successfully Extracted')

In [12]:
extract_data()

There's no item with that code
Successfully Extracted


In [13]:
data = pd.read_csv('dataset/extracted_data.csv')

In [14]:
data.shape

(4975, 7)

In [15]:
data.head()

Unnamed: 0,ques_link,vote_count,answer_count,ques_title,ques_summ,keywords,date
0,"[<a class=""s-link"" href=""/questions/11227809/w...",26265,28,Why is processing a sorted array faster than p...,\r\r\n Here is a piece of C++ c...,"['\njava', 'c++', 'performance', 'cpu-architec...","asked Jun 27, 2012 at 13:51"
1,"[<a class=""s-link"" href=""/questions/927358/how...",24035,98,How do I undo the most recent local commits in...,\r\r\n I accidentally committed...,"['\ngit', 'version-control', 'git-commit', 'un...","\r\n\r\r\n 89 revs, 60 user..."
2,"[<a class=""s-link"" href=""/questions/2003505/ho...",18997,41,How do I delete a Git branch locally and remot...,\r\r\n I want to delete a branc...,"['\ngit', 'version-control', 'git-branch', 'gi...","asked Jan 5, 2010 at 1:12"
3,"[<a class=""s-link"" href=""/questions/292357/wha...",13069,38,What is the difference between 'git pull' and ...,\r\r\n What are the differences...,"['\ngit', 'version-control', 'git-pull', 'git-...","asked Nov 15, 2008 at 9:51"
4,"[<a class=""s-link"" href=""/questions/231767/wha...",11895,47,"What does the ""yield"" keyword do?",\r\r\n What is the use of the y...,"['\npython', 'iterator', 'generator', 'yield',...","asked Oct 23, 2008 at 22:21"


In [16]:
data.tail()

Unnamed: 0,ques_link,vote_count,answer_count,ques_title,ques_summ,keywords,date
4970,"[<a class=""s-link"" href=""/questions/3639342/wh...",488,7,"What's the difference between ""git reset"" and ...",\r\r\n I've always thought of g...,"['\ngit', 'git-checkout', 'git-reset\n']","asked Sep 3, 2010 at 20:21"
4971,"[<a class=""s-link"" href=""/questions/5306240/io...",487,38,iOS - Dismiss keyboard when touching outside o...,\r\r\n I'm wondering how to mak...,"['\nios', 'cocoa-touch', 'uitextfield', 'uikit...","asked Mar 15, 2011 at 0:31"
4972,"[<a class=""s-link"" href=""/questions/454854/no-...",487,30,No module named MySQLdb,\r\r\n I am using Python versio...,"['\npython', 'django', 'python-2.x\n']","asked Jan 18, 2009 at 9:13"
4973,"[<a class=""s-link"" href=""/questions/4033723/ho...",487,10,How do I access command line arguments?,\r\r\n I use python to create m...,"['\npython', 'command-line', 'command-line-arg...","asked Oct 27, 2010 at 13:24"
4974,"[<a class=""s-link"" href=""/questions/12787781/t...",487,14,Type definition in object literal in TypeScript,\r\r\n In TypeScript classes it...,['\ntypescript\n'],"asked Oct 8, 2012 at 18:58"
