# Project McNulty - Scraping the Links for All Python-Related Questions

Paul Lim, Katharina Huang, Thaddeus Norman

05/17/2017

## Libraries

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup
import time

from fake_useragent import UserAgent

## Getting Question Links

### Start with an example url

In [2]:
url_ex = 'http://stackoverflow.com/questions/tagged/python?page=14865&sort=newest&pagesize=50'
link_ex = requests.get(url_ex)
page_ex = link_ex.text
soup_ex = BeautifulSoup(page_ex, 'lxml')

### Get a sample link and the question text

In [3]:
questions_class_ex = soup_ex.find('div', {'class': 'question-summary'})
questions_div_ex = questions_class_ex.find('div', {'class': 'summary'})
questions_link_ex = questions_div_ex.find('a', href=True).get('href')
questions_text_ex = questions_div_ex.find('a').getText()
questions_text_ex

'Python - How to use Conch to create a Virtual SSH server'

### Try looping through the entire page and get all links and questions

In [38]:
all_question_summaries_ex = soup_ex.find_all('div', {'class': 'question-summary'})
link_list_ex = []
question_list_ex = []
view_count_list_ex = []

for question in all_question_summaries_ex:
    div_ex = question.find('div', {'class': 'summary'})
    link_ex = div_ex.find('a', href=True).get('href')
    text_ex = div_ex.find('a').getText()
    
    stats_ex = question.find('div', {'class': 'statscontainer'})
    view_ex = stats_ex.find_all('div')[-1].getText().strip('\r\n ').strip(' views')
    
    link_list_ex.append(link_ex)
    question_list_ex.append(text_ex)
    view_count_list_ex.append(view_ex)

# print(link_list_ex)
# print(question_list_ex)
print(view_count_list_ex)

['2k', '27k', '377', '191k', '3k', '44k', '43k', '12k', '901', '192k', '7k', '15k', '2k', '1k', '146k', '2k', '1k', '700', '980', '2k', '3k', '5k', '15k', '447k', '139k', '8k', '68k', '57k', '2k', '146k', '4k', '210k', '6k', '12k', '3k', '288', '30k', '645', '829', '3k', '64k', '32k', '3k', '2k', '1.6m', '3k', '5k', '2k', '188k', '591']


### Create a function that will loop through multiple pages

In [2]:
def scrape_links(url_template, start_page, end_page, delay=5):
    '''
    url_template should look like this: 
        http://stackoverflow.com/questions/tagged/python?page={page}&sort=newest&pagesize=50
    start_page should be LESS THAN end_page
    '''
    df = pd.DataFrame(columns = ['links','questions','views'])

    link_list = []
    question_list = []
    view_list = []
    ua = UserAgent()

        
    for i in range(start_page, end_page):
        url = url_template.format(page=start_page)
        user_agent = {'User-agent': ua.random}
        print(user_agent)
        
        try:
            link = requests.get(url, headers = user_agent)
            start_page += 1
        except:
            print(url)
            print('Check to make sure the URL is correct!')
            
        page = link.text
        soup = BeautifulSoup(page, 'lxml')
        
        all_question_summaries = soup.find_all('div', {'class': 'question-summary'})
        
        if all_question_summaries:
            for question in all_question_summaries:
                div = question.find('div', {'class': 'summary'})
                link_str = div.find('a', href=True).get('href')
                
                text = div.find('a').getText()

                stats = question.find('div', {'class': 'statscontainer'})
                view = stats.find_all('div')[-1].getText().strip('\r\n ').strip(' views')
                view = view.replace('k','E+03').replace('m','E+06')

                link_list.append('http://stackoverflow.com' + link_str)
                question_list.append(text)
                view_list.append(view)
        else:
            print(url)
            print('No question-summary class found!')
            
        df = pd.DataFrame({'links': link_list, 'questions': question_list, 'views': view_list})
#         df['links'] = link_list
#         df['questions'] = question_list
#         df['views'] = view_list
        df = df.drop_duplicates()
        
        df.to_csv('links.csv')
            
#         with open('links.csv', 'a') as f:
#             df.to_csv(f, header=False)
            
        time.sleep(delay + 2*np.random.rand())
        
    return df

## Getting Data From Each Individual Question Page

### Start with an example url

In [41]:
url_ex2 = 'http://stackoverflow.com/questions/469/how-can-i-find-the-full-path-to-a-font-from-its-display-name-on-a-mac'
link_ex2 = requests.get(url_ex2)
page_ex2 = link_ex2.text
soup_ex2 = BeautifulSoup(page_ex2, 'lxml')

### Get the user information

In [51]:
users_ex = soup_ex2.find('div', {'class': 'user-details'})

badges_ex = users_ex.find_all('span', {'class': 'badgecount'})
for badge in badges_ex:
    print(badge.getText())
    
id_ex = users_ex.find('a', href=True)

9
27
60


<a href="/users/1997093/task">TAsk</a>

### Try looping through one page

In [55]:
all_users_ex = soup_ex2.find_all('div', {'class': 'user-details'})
badge_list_ex = []
id_list_ex = []

for user in all_users_ex:
    badges_ex2 = user.find_all('span', {'class': 'badgecount'})
    temp_badge_list_ex = []
    
    for badge in badges_ex2:
        temp_badge_list_ex.append(int(badge.getText()))
    badge_list_ex.append(temp_badge_list_ex)
    
    id_list_ex.append(users_ex.find('a', href=True))

# print(badge_list_ex)
# print(id_list_ex)

df_ex = pd.DataFrame(badge_list_ex, columns=['gold','silver','bronze'])
df_ex['id'] = id_list_ex

df_ex.head()

Unnamed: 0,gold,silver,bronze,id
0,9,27,60.0,"<a href=""/users/1997093/task"">TAsk</a>"
1,10,36,46.0,"<a href=""/users/1997093/task"">TAsk</a>"
2,1,27,43.0,"<a href=""/users/1997093/task"">TAsk</a>"
3,4,34,57.0,"<a href=""/users/1997093/task"">TAsk</a>"
4,9,44,88.0,"<a href=""/users/1997093/task"">TAsk</a>"


### Create a function that will loop through multiple pages

In [1]:
def scrape_users(link_list, delay=5):
    badge_list = []
    id_list = []
    ua = UserAgent()
        
    for link in link_list:
        url = link
        user_agent = {'User-agent': ua.random}
        print(user_agent)
        
        try:
            link = requests.get(url, headers = user_agent)

        except:
            print(url)
            print('Check to make sure the URL is correct!')
            
        page = link.text
        soup = BeautifulSoup(page, 'lxml')
        
        all_users = soup.find_all('div', {'class': 'user-details'})
        
        if all_users:
            for user in all_users:
                badges = user.find_all('span', {'class': 'badgecount'})
                temp_badge_list = []

                for badge in badges:
                    temp_badge_list.append(int(badge.getText()))
                badge_list.append(temp_badge_list)

                id_list.append(user.find('a', href=True))
            
            time.sleep(delay + 2*np.random.rand())
        else:
            print(url)
            print('No users found!')

            
    df = pd.DataFrame(badge_list, columns=['gold','silver','bronze'])
    df['id'] = id_list
    df = df.drop_duplicates(keep='last')
    df = df.dropna(how='all')
    
    df.to_csv('users.csv')

    return df

## Test Cells

### Testing the individual page scraping

In [100]:
url_t = ['http://stackoverflow.com/questions/8724352/getting-the-nth-element-using-beautifulsoup']

df_t = scrape_users(url_t)
df_t

{'User-agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36'}


Unnamed: 0,gold,silver,bronze,id
0,1.0,3.0,11.0,"<a href=""/users/2584/aadvaark"">aadvaark</a>"
1,4.0,56.0,91.0,"<a href=""/users/183066/jcollado"">jcollado</a>"
5,9.0,96.0,132.0,"<a href=""/users/667301/mike-pennington"">Mike P..."


### Testing the link scraping

In [94]:
ex2 = 'http://stackoverflow.com/questions/tagged/python?page={page}&sort=newest&pagesize=50'

df_ex2 = scrape_links(ex2, 14868, 14869)
df_ex2

{'User-agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36'}


Unnamed: 0,links,questions,views
0,http://stackoverflow.com/questions/279782/how-...,How can one get the set of all classes with re...,2000.0
1,http://stackoverflow.com/questions/279707/maki...,making a python GUI [duplicate],3000.0
2,http://stackoverflow.com/questions/279561/what...,What is the Python equivalent of static variab...,182000.0
3,http://stackoverflow.com/questions/279434/pyth...,Python: How do I generate a keypress?,3000.0
4,http://stackoverflow.com/questions/279237/impo...,Import a module from a relative path,664000.0
5,http://stackoverflow.com/questions/279129/can-...,Can anyone recommend a decent FOSS PDF generat...,2000.0
6,http://stackoverflow.com/questions/279119/how-...,How do I search for unpublished Plone content ...,963.0
7,http://stackoverflow.com/questions/279094/how-...,How do I script an OLE component using Python?,9000.0
8,http://stackoverflow.com/questions/277965/dict...,"Dictionary or If statements, Jython",3000.0
9,http://stackoverflow.com/questions/277922/pyth...,Python Argument Binders,13000.0
