# Towards Data Science Blog Posts: Get Blog Posts
### *Use requests library and BeautifulSoup to scrape posts from TowardsDataScience.com*

In [1]:
# Import libraries

import pandas as pd
import requests 
import numpy as np
from time import sleep 

from bs4 import BeautifulSoup

In [2]:
#save the archive URL for a single day (3 March 2021) as a variable
url = 'https://towardsdatascience.com/archive/2021/3/3'

#request the page; the bare `res` on the last line shows the response status as the cell output
res = requests.get(url)
res

<Response [200]>

In [2]:
# this code adapted from https://hackernoon.com/how-to-scrape-a-medium-publication-a-python-tutorial-for-beginners-o8u3t69
# Walk the daily archive pages for January-April 2021, scrape every post
# previewed there (plus the post page itself) and write one tab-separated
# file per month to ../data/.
import calendar

YEAR = 2021
#seconds to wait on any single request, so a stalled connection cannot hang the run
REQUEST_TIMEOUT = 30

#class strings that identify the pieces of each post preview on an archive page
PREVIEW_CLASS = 'streamItem streamItem--postPreview js-streamItem'
AUTHOR_BOX_CLASS = 'postMetaInline u-floatLeft u-sm-maxWidthFullWidth'
CLAPS_CLASS = ('button button--chromeless u-baseColor--buttonNormal'
               ' js-multirecommendCountButton u-disablePointerEvents')
RESPONSES_CLASS = 'button button--chromeless u-baseColor--buttonNormal'
STORY_LINK_CLASS = 'button button--smaller button--chromeless u-baseColor--buttonNormal'

#set up column headers for df
columns = ['date', 'title', 'subtitle', 'claps', 'responses',
           'author_url', 'story_url', 'reading_time (mins)',
           'number_sections', 'section_titles', 'number_paragraphs', 'paragraphs']

#rows for every month scraped by this cell
stories_data = []

#iterate through each month
for month in range(1, 5):
    #rows for this month only, so each CSV holds a single month of posts
    month_stories = []

    #number of days in the month (the calendar module accounts for leap years)
    n_days = calendar.monthrange(YEAR, month)[1]

    #zero-padded copy for URLs and file names; the loop variable itself stays an int
    month_str = f'{month:02d}'

    #iterate through the appropriate number of days in each month
    for day in range(1, n_days + 1):
        day_str = f'{day:02d}'

        #build the date and url for each day
        date = f'{month_str}/{day_str}/{YEAR}'
        url = f'https://towardsdatascience.com/archive/{YEAR}/{month_str}/{day_str}'

        #scrape the content on that day's archive page and save as a BeautifulSoup object
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.text, 'html.parser')

        stories = soup.find_all('div', class_=PREVIEW_CLASS)

        #sleep so as not to overwhelm the server
        sleep(np.random.randint(1, 15))

        #for each post
        for story in stories:
            #grab the author URL; a preview without an author box/link is skipped
            #instead of aborting the whole scrape
            author_box = story.find('div', class_=AUTHOR_BOX_CLASS)
            author_link = author_box.find('a') if author_box is not None else None
            if author_link is None:
                continue
            author_url = author_link['href']

            #get the reading time; posts without one are skipped
            #(missing span -> TypeError, missing title attribute -> KeyError,
            # empty title -> IndexError)
            try:
                reading_time = author_box.find('span', class_='readingTime')['title'].split()[0]
            except (TypeError, KeyError, IndexError):
                continue

            #get the title and subtitle
            title_tag = story.find('h3')
            title = title_tag.text if title_tag is not None else '-'
            subtitle_tag = story.find('h4')
            subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

            #save claps, if present
            claps_button = story.find('button', class_=CLAPS_CLASS)
            claps = claps_button.text if claps_button is not None else 0

            #save responses, if present; only the leading count is kept, so the
            #fallback is the bare count '0' to match
            responses_link = story.find('a', class_=RESPONSES_CLASS)
            response_words = responses_link.text.split() if responses_link is not None else []
            responses = response_words[0] if response_words else '0'

            #save the post's URL; without it there is nothing to scrape
            story_link = story.find('a', class_=STORY_LINK_CLASS)
            if story_link is None:
                continue
            story_url = story_link['href']

            #pause before requesting the post itself
            sleep(np.random.randint(1, 15))

            #scrape the content for each post and save as a BeautifulSoup object
            story_page = requests.get(story_url, timeout=REQUEST_TIMEOUT)
            story_soup = BeautifulSoup(story_page.text, 'html.parser')

            #grab all section and paragraph info
            story_paragraphs = []
            section_titles = []
            for section in story_soup.find_all('section'):
                story_paragraphs.extend(paragraph.text for paragraph in section.find_all('p'))
                section_titles.extend(sub.text for sub in section.find_all('h1'))

            #append to the list, in the same order as `columns`
            month_stories.append([
                date,
                title,
                subtitle,
                claps,
                responses,
                author_url,
                story_url,
                reading_time,
                len(section_titles),
                section_titles,
                len(story_paragraphs),
                story_paragraphs,
            ])

    #save this month's posts as a dataframe and write them out
    df = pd.DataFrame(month_stories, columns=columns)
    df.to_csv(f'../data/{YEAR}_{month_str}.csv', sep='\t', index=False)

    #keep the running collection of everything scraped so far
    stories_data.extend(month_stories)

In [4]:
#get some 2020 stories
# Walk the daily archive pages for October-December 2020, scrape every post
# previewed there (plus the post page itself) and write one tab-separated
# file per month to ../data/.
import calendar

YEAR = 2020
#seconds to wait on any single request, so a stalled connection cannot hang the run
REQUEST_TIMEOUT = 30

#class strings that identify the pieces of each post preview on an archive page
PREVIEW_CLASS = 'streamItem streamItem--postPreview js-streamItem'
AUTHOR_BOX_CLASS = 'postMetaInline u-floatLeft u-sm-maxWidthFullWidth'
CLAPS_CLASS = ('button button--chromeless u-baseColor--buttonNormal'
               ' js-multirecommendCountButton u-disablePointerEvents')
RESPONSES_CLASS = 'button button--chromeless u-baseColor--buttonNormal'
STORY_LINK_CLASS = 'button button--smaller button--chromeless u-baseColor--buttonNormal'

#column headers for df
columns = ['date', 'title', 'subtitle', 'claps', 'responses',
           'author_url', 'story_url', 'reading_time (mins)',
           'number_sections', 'section_titles', 'number_paragraphs', 'paragraphs']

#rows for every month scraped by this cell
stories_data = []

for month in range(10, 13):
    #rows for this month only, so each CSV holds a single month of posts
    month_stories = []

    #number of days in the month (the calendar module accounts for leap years)
    n_days = calendar.monthrange(YEAR, month)[1]

    #zero-padded copy for URLs and file names; the loop variable itself stays an int
    month_str = f'{month:02d}'

    for day in range(1, n_days + 1):
        day_str = f'{day:02d}'

        #build the date and url for each day
        date = f'{month_str}/{day_str}/{YEAR}'
        url = f'https://towardsdatascience.com/archive/{YEAR}/{month_str}/{day_str}'

        #scrape that day's archive page
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.text, 'html.parser')

        stories = soup.find_all('div', class_=PREVIEW_CLASS)

        #sleep so as not to overwhelm the server
        sleep(np.random.randint(1, 15))

        for story in stories:
            #author URL; a preview without an author box/link is skipped
            #instead of aborting the whole scrape
            author_box = story.find('div', class_=AUTHOR_BOX_CLASS)
            author_link = author_box.find('a') if author_box is not None else None
            if author_link is None:
                continue
            author_url = author_link['href']

            #reading time; posts without one are skipped
            #(missing span -> TypeError, missing title attribute -> KeyError,
            # empty title -> IndexError)
            try:
                reading_time = author_box.find('span', class_='readingTime')['title'].split()[0]
            except (TypeError, KeyError, IndexError):
                continue

            #title and subtitle, '-' when absent
            title_tag = story.find('h3')
            title = title_tag.text if title_tag is not None else '-'
            subtitle_tag = story.find('h4')
            subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

            #claps, if present
            claps_button = story.find('button', class_=CLAPS_CLASS)
            claps = claps_button.text if claps_button is not None else 0

            #responses, if present; only the leading count is kept, so the
            #fallback is the bare count '0' to match
            responses_link = story.find('a', class_=RESPONSES_CLASS)
            response_words = responses_link.text.split() if responses_link is not None else []
            responses = response_words[0] if response_words else '0'

            #the post's URL; without it there is nothing to scrape
            story_link = story.find('a', class_=STORY_LINK_CLASS)
            if story_link is None:
                continue
            story_url = story_link['href']

            #pause before requesting the post itself
            sleep(np.random.randint(1, 15))

            #scrape the post page
            story_page = requests.get(story_url, timeout=REQUEST_TIMEOUT)
            story_soup = BeautifulSoup(story_page.text, 'html.parser')

            #collect paragraph text and h1 section titles from every section
            story_paragraphs = []
            section_titles = []
            for section in story_soup.find_all('section'):
                story_paragraphs.extend(paragraph.text for paragraph in section.find_all('p'))
                section_titles.extend(sub.text for sub in section.find_all('h1'))

            #one row per post, in the same order as `columns`
            month_stories.append([
                date,
                title,
                subtitle,
                claps,
                responses,
                author_url,
                story_url,
                reading_time,
                len(section_titles),
                section_titles,
                len(story_paragraphs),
                story_paragraphs,
            ])

    #write this month's posts out
    df = pd.DataFrame(month_stories, columns=columns)
    df.to_csv(f'../data/{YEAR}_{month_str}.csv', sep='\t', index=False)
    #rebind df to an empty frame once the month has been written
    df = pd.DataFrame()

    #keep the running collection of everything scraped so far
    stories_data.extend(month_stories)