In [1]:
#importing necessary libraries
import os
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

In [2]:
def base_url_builder(tag):
    """Return the Medium archive base URL for ``tag``.

    The result ends with a trailing slash so a ``year/month/day`` suffix can
    be appended directly.
    """
    pieces = ("https://medium.com/tag/", tag, "/archive/")
    return "".join(pieces)

In [3]:
def get_start_date(year, month, day):
    """Validate a start date and return it as a ``datetime`` object.

    Args:
        year, month, day: integer date components passed to ``datetime``.

    Returns:
        ``datetime(year, month, day)``.

    Raises:
        Exception: if the components have the wrong type or do not form a
            valid date. The underlying error is attached as ``__cause__``.
    """
    try:
        return datetime(year, month, day)
    # Only trap the errors datetime() raises for bad input; a bare ``except``
    # would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError) as err:
        raise Exception("Start date is in the wrong format or is invalid.") from err


In [4]:
def get_end_date(year, month, day):
    """Validate an end date and return it as a ``datetime`` object.

    Args:
        year, month, day: integer date components passed to ``datetime``.

    Returns:
        ``datetime(year, month, day)``.

    Raises:
        Exception: if the components have the wrong type or do not form a
            valid date. The underlying error is attached as ``__cause__``.
    """
    try:
        return datetime(year, month, day)
    # Only trap the errors datetime() raises for bad input; a bare ``except``
    # would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError) as err:
        raise Exception("End date is in the wrong format or is invalid.") from err

In [5]:
def open_chrome():
    """Start a Chrome WebDriver, set a 30-second implicit wait, and return it."""
    browser = webdriver.Chrome()
    # Implicit wait applied to the driver before it is handed to the caller.
    browser.implicitly_wait(30)
    return browser

In [6]:
def url_masher(base_url, year, month, day):
    """Build the archive URL for one specific day.

    Args:
        base_url: archive base URL, expected to end with "/".
        year: year as a string (ints are accepted and converted).
        month: month number as a string or int; zero-padded to two digits.
        day: day number as a string or int; zero-padded to two digits.

    Returns:
        ``base_url`` followed by ``YYYY/MM/DD``.
    """
    # Pad to two digits rather than always prefixing "0": an unconditional
    # prefix turns months 10-12 into "010".."012".
    month = str(month).zfill(2)
    day = str(day).zfill(2)
    return base_url + str(year) + "/" + month + "/" + day

In [7]:
def find_post_cards(soup):
    """Return the story-preview card ``div`` elements found in ``soup``.

    Per the original notes, each card holds the author name, title, claps,
    published date, and similar details for one article.
    """
    card_class = "streamItem streamItem--postPreview js-streamItem"
    return soup.find_all("div", class_=card_class)

In [8]:
def get_titles_from_cards(cards):
    """Return one cleaned title string per card, or "NaN" when none is found.

    Each card is probed with seven (tag, class) selectors; the first selector
    that yields an element, in the order listed below, supplies the title.
    """
    # Candidate title locations, in priority order.
    selectors = (
        ("h3", "graf graf--h3 graf-after--figure graf--title"),
        ("h3", "graf graf--h3 graf-after--figure graf--trailing graf--title"),
        ("h4", "graf graf--h4 graf--leading"),
        ("h3", "graf graf--h3 graf--leading graf--title"),
        ("p", "graf graf--p graf--leading"),
        ("h3", "graf graf--h3 graf--startsWithDoubleQuote graf--leading graf--title"),
        ("h3", "graf graf--h3 graf--startsWithDoubleQuote graf-after--figure graf--trailing graf--title"),
    )

    def strip_medium_symbols(raw):
        # Swap non-breaking spaces for plain spaces, drop hair spaces and
        # variation selectors, then remove every remaining non-ASCII run
        # (emojis and similar).
        cleaned = raw.replace("\xa0", " ").replace("\u200a", "").replace("\ufe0f", "")
        return re.sub(r'[^\x00-\x7F]+', '', cleaned)

    collected = []
    for card in cards:
        candidates = [card.find(tag_name, class_=css) for tag_name, css in selectors]
        first_hit = next((node for node in candidates if node is not None), None)
        if first_hit is None:
            collected.append("NaN")
        else:
            collected.append(strip_medium_symbols(first_hit.text))
    return collected

In [9]:
def get_auth_and_pubs_from_cards(cards):
    """Extract the author and publication name from every card.

    Args:
        cards: iterable of card elements exposing ``find(name, class_=...)``.

    Returns:
        A tuple ``(authors, pubs)`` of two lists with one entry per card.
        A missing author or publication link is recorded as the string "NaN".
    """
    author_class = "ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"
    pub_class = "ds-link ds-link--styleSubtle link--darken link--accent u-accentColor--textNormal"

    def _clean_link_text(link):
        # Shared cleaning for both columns; "NaN" marks a missing link.
        if link is None:
            return "NaN"
        # Raw strings: "\s" in a normal literal is an invalid escape sequence
        # (SyntaxWarning on recent Python versions).
        text = re.sub(r'\s+[^A-Za-z]', '', link.text)
        # Replace each remaining non-ASCII run with a single space.
        return re.sub(r'[^\x00-\x7F]+', ' ', text)

    authors = []
    pubs = []
    for card in cards:
        authors.append(_clean_link_text(card.find("a", class_=author_class)))
        pubs.append(_clean_link_text(card.find("a", class_=pub_class)))
    return authors, pubs

In [10]:
def get_dates_and_tags(tag, year, month, day, cards):
    """Repeat the page's date parts and tag once for every card.

    Returns four lists ``(years, months, days, tags)``, each with one entry
    per card, so they can sit alongside the other per-card columns.
    """
    n_cards = sum(1 for _ in cards)
    years = [year] * n_cards
    months = [month] * n_cards
    days = [day] * n_cards
    repeated_tags = [tag] * n_cards
    return years, months, days, repeated_tags

In [11]:
def get_readTime_from_cards(cards):
    """Return the reading time of each card as a string.

    The value is the ``title`` attribute of the card's ``readingTime`` span
    with the " min read" suffix removed; cards without that span yield "0".
    """
    minutes_per_card = []
    for card in cards:
        span = card.find("span", class_="readingTime")
        if span is None:
            minutes_per_card.append("0")
            continue
        minutes_per_card.append(span['title'].replace(" min read", ""))
    return minutes_per_card

In [12]:
def get_applause_from_cards(cards):
    """Return the clap-count text of each card, or "0" when no clap button exists."""
    button_class = "button button--chromeless u-baseColor--buttonNormal js-multirecommendCountButton u-disablePointerEvents"
    buttons = (card.find("button", class_=button_class) for card in cards)
    return ["0" if button is None else button.text for button in buttons]

In [13]:
def get_urls_from_cards(cards):
    """Return the article URL (``href``) of each card.

    The URL is read from the first ``a`` element found with ``class_=""``.
    Raises ``Exception`` as soon as a card has no such link.
    """
    links = []
    for card in cards:
        anchor = card.find("a", class_="")
        if anchor is None:
            raise Exception("couldnt find a url")
        links.append(anchor['href'])
    return links

In [14]:
def get_auth_urls_from_cards(cards):
    """Return the author-profile URL (``href``) of each card, or "NaN" if absent."""
    author_class = "ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken"
    profile_links = []
    for card in cards:
        anchor = card.find("a", class_=author_class)
        profile_links.append("NaN" if anchor is None else anchor['href'])
    return profile_links

In [15]:
def scrape_tag(tag, yearstart, monthstart, yearstop, monthstop):
    """Scrape Medium's daily archive pages for ``tag`` into a CSV file.

    One archive page is fetched per day, from the 1st of the start month
    through the 1st of the stop month (inclusive; both bounds are pinned to
    day 1). Each page's cards are written to
    ``<cwd>/TAG_SCRAPES/medium_<tag>.csv``, which is created or truncated at
    the start of the run. The header row is written with the first page only.

    Args:
        tag: Medium tag; used in the archive URL and the output file name.
        yearstart, monthstart: first month to scrape (int or numeric string).
        yearstop, monthstop: month whose first day ends the scrape.

    Raises:
        Exception: if either date is invalid, the start date is after the end
            date, the output file cannot be created, a card has no article
            URL, or the scraped columns differ in length from the number of
            cards on a page.
    """
    # Validate the dates before touching the output file, so a bad argument
    # cannot wipe an existing scrape.
    current_date = get_start_date(int(yearstart), int(monthstart), 1)
    end_date = get_end_date(int(yearstop), int(monthstop), 1)
    if current_date > end_date:
        raise Exception("Start date exceeds end date.")

    out_dir = os.path.join(os.getcwd(), "TAG_SCRAPES")
    path = os.path.join(out_dir, "medium_" + tag + ".csv")

    # Make sure the output folder exists, then create/truncate the CSV.
    try:
        os.makedirs(out_dir, exist_ok=True)
        with open(path, "w"):
            pass
    except OSError as err:
        raise Exception("Could not open file.") from err

    base_url = base_url_builder(tag)
    chrome_driver = open_chrome()

    first_page = True
    counter = 0

    # The try/finally ensures the browser is shut down even when a page
    # fails to scrape.
    try:
        while current_date <= end_date:
            # Build the archive URL for the current date.
            url = url_masher(base_url,
                             str(current_date.year),
                             str(current_date.month),
                             str(current_date.day))

            # Load the page in Chrome and parse the rendered HTML.
            chrome_driver.get(url)
            soup = BeautifulSoup(chrome_driver.page_source, features='lxml')

            # Each card holds the details of one article.
            cards = find_post_cards(soup)

            titles = get_titles_from_cards(cards)
            authors, pubs = get_auth_and_pubs_from_cards(cards)
            years, months, days, card_tags = get_dates_and_tags(tag,
                                                                current_date.year,
                                                                current_date.month,
                                                                current_date.day,
                                                                cards)
            reading_times = get_readTime_from_cards(cards)
            applause = get_applause_from_cards(cards)
            urls = get_urls_from_cards(cards)
            auth_urls = get_auth_urls_from_cards(cards)

            page_data = {"Title": titles, "Author": authors, "Publication": pubs,
                         "Year": years, "Month": months, "Day": days, "Tag": card_tags,
                         "Reading_Time": reading_times, "Claps": applause,
                         "url": urls, "Author_url": auth_urls}

            # Every column must have exactly one value per card.
            for column in page_data.values():
                if len(column) != len(cards):
                    raise Exception("Data length does not match number of stories on page.")

            df = pd.DataFrame.from_dict(page_data)

            # Append this page; only the first page carries the header row.
            # Passing the path lets pandas manage newlines and encoding.
            df.to_csv(path, mode="a", header=first_page, index=False)
            first_page = False

            # Advance to the next day.
            current_date = current_date + timedelta(days=1)

            # Progress: running card total and the NEXT date to be scraped.
            counter = counter + len(cards)
            print(counter, "    ", current_date)
            # Pause between page requests.
            time.sleep(2)
    finally:
        chrome_driver.quit()

In [16]:
# Medium tags to scrape; scrape_tag is called once for each entry.
tags = ["data-science"]

# Date range to scrape, given as start/stop year and month.
# NOTE: scrape_tag pins both bounds to day 1, so the run covers the 1st of the
# start month through the 1st of the stop month (inclusive).
yearstart=2021
monthstart=1
yearstop=2021
monthstop=9

In [17]:
# Scrape the article details for every tag over the configured date range;
# each tag's results are saved to its own CSV file.
for tag in tags:
    scrape_tag(tag, yearstart, monthstart, yearstop, monthstop)
    print("Done with tag: ", tag)
print("done")

113      2021-01-02 00:00:00
228      2021-01-03 00:00:00
346      2021-01-04 00:00:00
514      2021-01-05 00:00:00
648      2021-01-06 00:00:00
806      2021-01-07 00:00:00
930      2021-01-08 00:00:00
1070      2021-01-09 00:00:00
1180      2021-01-10 00:00:00
1320      2021-01-11 00:00:00
1495      2021-01-12 00:00:00
1658      2021-01-13 00:00:00
1814      2021-01-14 00:00:00
1944      2021-01-15 00:00:00
2084      2021-01-16 00:00:00
2176      2021-01-17 00:00:00
2293      2021-01-18 00:00:00
2457      2021-01-19 00:00:00
2620      2021-01-20 00:00:00
2801      2021-01-21 00:00:00
2931      2021-01-22 00:00:00
3044      2021-01-23 00:00:00
3133      2021-01-24 00:00:00
3246      2021-01-25 00:00:00
3382      2021-01-26 00:00:00
3560      2021-01-27 00:00:00
3714      2021-01-28 00:00:00
3846      2021-01-29 00:00:00
3976      2021-01-30 00:00:00
4072      2021-01-31 00:00:00
4186      2021-02-01 00:00:00
4338      2021-02-02 00:00:00
4480      2021-02-03 00:00:00
4589      2021-02