In [1]:
import itertools
from urllib.parse import urljoin, urlsplit, urlunsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def div(soup):
    """
    gets the contents of every post-body div on one page
    
    Parameters
    ----------
    soup : BeautifulSoup object
        BeautifulSoup object for a page of results

    Returns
    -------
    list
        one entry per ``<div class="s-prose js-post-body">`` found, each
        being that element's ``.contents``, or None when the element does
        not expose a ``.contents`` attribute
    """
    divs=[]
    for post in soup.find_all('div',{'class':'s-prose js-post-body'}):
        # getattr with a default only absorbs a missing attribute; unlike a
        # bare ``except`` it lets KeyboardInterrupt/SystemExit propagate
        divs.append(getattr(post,'contents',None))
    return divs

def href(soup):
    """
    gets all href links from one page
    
    Parameters
    ----------
    soup : BeautifulSoup object
        BeautifulSoup object for a page of results

    Returns
    -------
    list
        the ``href`` value of every ``<a class="s-link" href=...>`` found,
        in document order; None is stored for an element whose ``href``
        cannot be looked up
    """
    hrefs=[]
    for anchor in soup.find_all('a',{'class':'s-link'},href=True):
        try:
            hrefs.append(anchor['href'])
        except (KeyError, TypeError):
            # only lookup failures are absorbed; a bare ``except`` would
            # also hide KeyboardInterrupt and genuine programming errors
            hrefs.append(None)
    return hrefs

In [3]:
def clean(hrefs):
    """
    drop the empty href lists and flatten the remaining ones
    
    Parameters
    ----------
    hrefs : array
        a collection of href lists (one list per page)

    Returns
    -------
    array
        a single flat collection holding every href of every non-empty
        list, in the original order
    """
    # one pass: skip empty groups, then emit each group's members in order
    return [link for group in hrefs if group!=[] for link in group]

In [4]:
def links(hrefs):
    """
    turn hrefs into absolute thread urls sorted by votes
    
    Relative hrefs are resolved against 'https://stackoverflow.com';
    absolute ones are kept. Every url gets the ``answertab=votes`` query
    parameter and the ``tab-top`` fragment.
    
    Parameters
    ----------
    hrefs : array
        a collection of hrefs (relative or absolute); None or empty
        entries are skipped

    Returns
    -------
    array
        a collection of absolute urls ending in
        ``?answertab=votes#tab-top`` (or ``&answertab=votes#tab-top`` when
        the href already carried a query string)
    """
    new_href=[]
    domain='https://stackoverflow.com'
    for h in hrefs:
        if not h:
            # nothing to resolve for a missing href
            continue
        # urljoin leaves absolute urls untouched and resolves relative
        # ones, so a slug that merely contains "https" is still treated
        # as relative
        parts=urlsplit(urljoin(domain,h))
        # the parameter has to live in the query string; gluing it onto
        # the path would only corrupt the slug
        if parts.query:
            query=parts.query+'&answertab=votes'
        else:
            query='answertab=votes'
        new_href.append(
            urlunsplit((parts.scheme,parts.netloc,parts.path,query,'tab-top'))
        )
    return new_href

In [5]:
def single(url,timeout=30):
    """
    request a single url and parse the response as html
    
    Parameters
    ----------
    url : string
        a url to query with requests.get
    timeout : float or tuple, optional
        seconds to wait for the server, forwarded to requests.get
        (default 30)

    Returns
    -------
    soup
        a BeautifulSoup object for the entire corresponding webpage

    Raises
    ------
    requests.RequestException
        on a connection failure, a timeout, or a 4xx/5xx response
    """
    # requests applies no timeout unless one is given, so a stalled
    # connection would otherwise block the scrape indefinitely
    req=requests.get(url=url,timeout=timeout)
    # surface error responses instead of parsing the error page as if it
    # were a page of results
    req.raise_for_status()
    return BeautifulSoup(req.text,"html.parser")

In [6]:
def multi(url):
    """
    scrape the question and its first answers from one thread url
    
    Parameters
    ----------
    url : string
        a thread url, fetched through single()

    Returns
    -------
    string
        text of the first <p> tag of the first post body (the question)
    list
        text of the first <p> tag of each of the following post bodies,
        at most five (the answers); post bodies without a <p> tag are
        skipped

    Raises
    ------
    IndexError
        if the page contains no post body at all
    AttributeError
        if the first post body has no <p> tag
    """
    posts=single(url).find_all('div',{'class':'s-prose js-post-body'})
    if not posts:
        raise IndexError("no post bodies found at {}".format(url))
    # only the first six post bodies are ever used, so later ones are not
    # inspected: a post without <p> further down must not cost the thread
    first_paragraphs=[post.find("p") for post in posts[:6]]
    if first_paragraphs[0] is None:
        raise AttributeError("question post has no <p> tag at {}".format(url))
    question=first_paragraphs[0].get_text()
    answer=[p.get_text() for p in first_paragraphs[1:] if p is not None]
    return question,answer

In [7]:
import time
import itertools
def questions_answers(url,start_page,end_page,timeout=30):
    """
    get collections of questions and answers
    
    Parameters
    ----------
    url : string
        listing url template with one ``{}`` placeholder that receives
        the page number
    start_page : int
        first page number to scrape
    end_page : int
        page number to stop at; this page itself is NOT scraped
        (the pages visited are ``range(start_page, end_page)``)
    timeout : float or tuple, optional
        seconds to wait for each response, forwarded to requests.get
        (default 30)

    Returns
    -------
    array
        one entry per successfully fetched thread link: the raw post-body
        contents returned by div(), kept for further text post-processing
    array
        a collection of questions found as the first <p> tag
    array
        a collection of answer lists, aligned with the questions
    """
    domain='https://stackoverflow.com'

    # download every listing page, pausing between requests
    soups=[]
    for page in range(start_page,end_page):
        req=requests.get(url=url.format(page),timeout=timeout)
        soups.append(BeautifulSoup(req.text,"html.parser"))
        time.sleep(1)

    # obtain all hrefs, one list per listing page
    hrefs=[href(soup) for soup in soups]

    # obtain all divs: every link is fetched AND parsed inside the loop,
    # so a failed request can never cause an earlier response to be
    # parsed a second time
    divs=[]
    for ref in hrefs:
        for link in ref:
            if not link:
                continue
            time.sleep(1)
            try:
                # urljoin keeps absolute links intact and resolves
                # relative ones against the site root
                req=requests.get(url=urljoin(domain,link),timeout=timeout)
            except Exception:
                # best effort: skip links that cannot be fetched, but let
                # KeyboardInterrupt stop a long scrape
                continue
            divs.append(div(BeautifulSoup(req.text,"html.parser")))

    # distinguish questions from answers
    # NOTE: multi() downloads each thread again, this time with the
    # vote-sorted url produced by links()
    questions=[]
    answers=[]
    for link in links(clean(hrefs)):
        time.sleep(1)
        try:
            q,a=multi(link)
        except Exception:
            # threads that fail to download or parse are skipped
            continue
        questions.append(q)
        answers.append(a)

    return divs,questions,answers

In [9]:
# Scrape listing pages 1-100 in ten batches of ten pages; each batch
# contributes one entry to each of the three result lists.
url = 'https://stackoverflow.com/questions/tagged/python?tab=votes&page={}&pagesize=50'
all_raw, all_q, all_a = [], [], []
for first_page in range(1, 101, 10):
    raw, q, a = questions_answers(url, first_page, first_page + 10)
    all_raw.append(raw)
    all_q.append(q)
    all_a.append(a)

In [10]:
import csv

# csv files are opened with newline='' as the csv module requires, so the
# writer's own line terminators are not translated a second time (which
# shows up as blank rows on Windows)

# raw.csv: every element of a batch is a list returned by div(); each of
# its post-body content lists becomes one csv row
with open("raw.csv", "w", encoding = 'utf-8', newline = '') as f:
    writer = csv.writer(f)
    for raw in all_raw:
        for line in raw:
            writer.writerows(line)

# questions.txt: one question text per write, each followed by a newline
with open("questions.txt", "w", encoding = 'utf-8') as f:
    for q in all_q:
        for line in q:
            f.write(line)
            f.write("\n")

# answers.csv: one row per thread, one column per collected answer
with open("answers.csv", "w", encoding = 'utf-8', newline = '') as f:
    writer = csv.writer(f)
    for a in all_a:
        writer.writerows(a)

print("Success!")

Success!


In [11]:
import json

# Pair every question with each of its answers (qa_pairs.csv, one pair per
# row) and also keep the one-question-to-many-answers form (qa_pairs.json).
one2many = []
# newline='' is required by the csv module so that row terminators are
# written exactly as the writer produces them
with open("qa_pairs.csv", "w", encoding = 'utf-8', newline = '') as f:
    writer = csv.writer(f)
    writer.writerow(["question", "answer"])
    # all_q and all_a are filled in lockstep, batch by batch and thread by
    # thread, so they can be walked in parallel
    for batch_q, batch_a in zip(all_q, all_a):
        for curr_q, curr_answers in zip(batch_q, batch_a):
            one2many.append({'question': curr_q, 'answers': curr_answers})
            for curr_a in curr_answers:
                writer.writerow([curr_q, curr_a])

with open("qa_pairs.json", "w") as outfile:
    json.dump(one2many, outfile)
    
print("Success!")

Success!
