# Data collection
This notebook collects data from different data sources and endpoints and saves the raw data in a SQLite database.

In the following, I search for articles published within a given time frame in the TowardsDataScience archive (https://towardsdatascience.com/archive/year/month/day).

References: 
https://hackernoon.com/how-to-scrape-a-medium-publication-a-python-tutorial-for-beginners-o8u3t69

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Notebook-level list for scraped story rows. Note that extract_data() builds
# and returns its own local list of the same name and does not write to this one.
stories_data = list()

In [2]:
from datetime import date, timedelta

# Build `days`: one `datetime.date` per calendar day from sdate to edate,
# both endpoints included.
sdate = date(2020, 1, 1)    # start date (inclusive)
edate = date(2020, 12, 31)  # end date (inclusive)

delta = edate - sdate       # span between the two dates as a timedelta

# `+ 1` so that edate itself is part of the list. A comprehension is used so
# that no loop variable (`i`, `day`) is left behind in the notebook namespace.
days = [sdate + timedelta(days=offset) for offset in range(delta.days + 1)]

In [5]:
# Build the archive URL and a MM/DD/YYYY label for the selected entries of `days`.
# NOTE(review): the range covers index 0 only, so just the first day is used.
# NOTE(review): `date` rebinds the name imported from `datetime` to a string;
# extract_data() reads this global, so the name is kept as-is here.
for k in range(1):
    year = f'{days[k].year}'
    month = f'{days[k].month:02d}'   # zero-padded to two digits
    day = f'{days[k].day:02d}'       # zero-padded to two digits

    date = f'{month}/{day}/{year}'
    url = f'https://towardsdatascience.com/archive/{year}/{month}/{day}'
    print("Url: " + url)

Url: https://towardsdatascience.com/archive/2020/01/01


In [7]:
# Helpers and entry point for scraping one day of the Medium archive.
def _archive_date_from_url(url):
    """Return the day encoded in an archive URL as 'MM/DD/YYYY', or None.

    The URL is expected to contain '.../archive/<year>/<month>/<day>'. Any
    other shape (for example a year- or month-level archive page) yields None.
    """
    segments = url.split('?', 1)[0].rstrip('/').split('/')
    if 'archive' not in segments:
        return None
    first = segments.index('archive') + 1
    ymd = segments[first:first + 3]
    if len(ymd) != 3 or not all(part.isdigit() for part in ymd):
        return None
    year, month, day = ymd
    return f'{month.zfill(2)}/{day.zfill(2)}/{year}'


def _scrape_story_body(story_url, timeout):
    """Fetch one story page and return (section_titles, story_paragraphs).

    Paragraphs are the text of every <p> and section titles the text of every
    <h1> found inside the page's <section> elements.
    """
    story_page = requests.get(story_url, timeout=timeout)
    story_soup = BeautifulSoup(story_page.text, 'html.parser')

    section_titles = []
    story_paragraphs = []
    for section in story_soup.find_all('section'):
        story_paragraphs.extend(paragraph.text for paragraph in section.find_all('p'))
        section_titles.extend(heading.text for heading in section.find_all('h1'))
    return section_titles, story_paragraphs


def extract_data(url, story_date=None, timeout=30):
    """Scrape the stories listed on one Medium archive page into a DataFrame.

    For every story preview on the page this collects title, subtitle, claps,
    responses, author URL, story URL and reading time, then downloads the
    story itself to count and collect its section titles and paragraphs.

    Parameters
    ----------
    url : str
        Archive page to scrape, e.g.
        'https://towardsdatascience.com/archive/2020/01/01'.
    story_date : str, optional
        Value stored in the 'date' column. When omitted it is derived from
        `url` as 'MM/DD/YYYY' (None if the URL carries no year/month/day),
        so the function no longer depends on a notebook-global `date`.
    timeout : float, optional
        Seconds passed to every `requests.get` call so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per scraped story with the columns 'date', 'title',
        'subtitle', 'claps', 'responses', 'author_url', 'story_url',
        'reading_time (mins)', 'number_sections', 'section_titles',
        'number_paragraphs' and 'paragraphs'. Empty if nothing matched.

    Notes
    -----
    Previews without an author link, reading time or story link are skipped.
    Network errors raised by `requests` (including timeouts) propagate.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    if story_date is None:
        story_date = _archive_date_from_url(url)

    # CSS class strings the archive markup is expected to use for each element.
    clap_class = ('button button--chromeless u-baseColor--buttonNormal '
                  'js-multirecommendCountButton u-disablePointerEvents')
    response_class = 'button button--chromeless u-baseColor--buttonNormal'
    link_class = 'button button--smaller button--chromeless u-baseColor--buttonNormal'

    records = []
    for story in soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem'):
        author_box = story.find('div', class_='postMetaInline u-floatLeft u-sm-maxWidthFullWidth')
        if author_box is None:
            continue

        author_link = author_box.find('a')
        reading_tag = author_box.find('span', class_='readingTime')
        story_link = story.find('a', class_=link_class)
        if author_link is None or reading_tag is None or story_link is None:
            continue

        author_url = author_link.get('href')
        story_url = story_link.get('href')
        # The reading time sits in the tag's title attribute; keep its first token.
        reading_parts = (reading_tag.get('title') or '').split()
        if not (author_url and story_url and reading_parts):
            continue
        reading_time = reading_parts[0]

        title_tag = story.find('h3')
        title = title_tag.text if title_tag is not None else '-'
        subtitle_tag = story.find('h4')
        subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

        # A missing clap button is recorded as the integer 0 (found values stay
        # as the raw button text), matching what this function always returned.
        claps_button = story.find('button', class_=clap_class)
        claps = claps_button.text if claps_button is not None else 0

        # Keep only the leading count of the response link text.
        responses_link = story.find('a', class_=response_class)
        response_parts = responses_link.text.split() if responses_link is not None else []
        responses = response_parts[0] if response_parts else '0'

        section_titles, story_paragraphs = _scrape_story_body(story_url, timeout)

        records.append([
            story_date,
            title,
            subtitle,
            claps,
            responses,
            author_url,
            story_url,
            reading_time,
            len(section_titles),
            section_titles,
            len(story_paragraphs),
            story_paragraphs,
        ])

    columns = ['date', 'title', 'subtitle', 'claps', 'responses',
               'author_url', 'story_url', 'reading_time (mins)',
               'number_sections', 'section_titles',
               'number_paragraphs', 'paragraphs']

    return pd.DataFrame(records, columns=columns)

# Scrape the archive page built in the URL cell; the bare `df` on the last
# line renders the resulting DataFrame as the cell output.
df = extract_data(url=url)
df

Unnamed: 0,date,title,subtitle,claps,responses,author_url,story_url,reading_time (mins),number_sections,section_titles,number_paragraphs,paragraphs
0,01/01/2020,Making Python Programs Blazingly Fast,Let’s look at the performance of our Python pr...,3.3K,3,https://towardsdatascience.com/@martin.heinz,https://towardsdatascience.com/making-python-p...,5,4,"[Making Python Programs Blazingly Fast, Timing...",27,"[Martin Heinz, Jan 1, 2020·5 min read, Python ..."
1,01/01/2020,Implementing a fully convolutional network (FC...,"A tutorial on building, training and…",403,3,https://towardsdatascience.com/@himanshurawlani,https://towardsdatascience.com/implementing-a-...,11,8,[Understanding and implementing a fully convol...,36,"[Himanshu Rawlani, Jan 1, 2020·11 min read, Co..."
2,01/01/2020,6 New Features in Python 3.8 for Python Newbies,Python Beginner,1.8K,4,https://towardsdatascience.com/@edenau,https://towardsdatascience.com/6-new-features-...,4,5,[6 New Features in Python 3.8 for Python Newbi...,26,"[Eden Au, Jan 1, 2020·4 min read, Languages ch..."
3,01/01/2020,How to be fancy with Python,Python tricks that will make your life easier,1.7K,12,https://towardsdatascience.com/@dipam44,https://towardsdatascience.com/how-to-be-fancy...,5,1,[How to be fancy with Python],30,"[Dipam Vasani, Jan 1, 2020·5 min read, Python ..."
4,01/01/2020,Perfectly Privacy-Preserving AI,Data Privacy,331,2,https://towardsdatascience.com/@patriciathaine,https://towardsdatascience.com/perfectly-priva...,10,9,"[Perfectly Privacy-Preserving AI, The Four Pil...",45,"[Patricia Thaine, Jan 1, 2020·10 min read, Dat..."
5,01/01/2020,From scratch to search: playing with your data...,One Pipeline to rule…,231,1,https://towardsdatascience.com/@stanislavprihoda,https://towardsdatascience.com/from-scratch-to...,9,11,[From scratch to search: playing with your dat...,47,"[Stanislav Prihoda 🔥, Jan 1, 2020·9 min read, ..."
6,01/01/2020,GAN Pix2Pix Generative Model,Image-to-image translation with Pix2Pix model,87,1,https://towardsdatascience.com/@baakchsu.sprx77,https://towardsdatascience.com/gan-pix2pix-gen...,6,9,"[GAN Pix2Pix Generative Model, Pix2Pix GAN: In...",22,"[Anirudh S, Jan 1, 2020·6 min read, We hear a ..."
7,01/01/2020,An Introduction to Decision Trees with Python ...,A complete guide to getting an intuitive under...,105,1,https://towardsdatascience.com/@mikkelduif,https://towardsdatascience.com/an-introduction...,8,5,[An Introduction to Decision Trees with Python...,19,"[Mikkel Duif, Jan 1, 2020·8 min read, Decision..."
8,01/01/2020,Decision Trees for Dummies,An Intuitive Approach,84,1,https://towardsdatascience.com/@pratishgupta91,https://towardsdatascience.com/decision-trees-...,5,1,[Decision Trees for Dummies],21,"[Pratish, Jan 1, 2020·5 min read, Let’s look a..."
9,01/01/2020,Kaggle User Survey 2019,"A dashboard made using R, Flexdashboard, and H...",73,0,https://towardsdatascience.com/@theairbend3r,https://towardsdatascience.com/kaggle-user-sur...,6,4,"[Kaggle User Survey Dashboard— 2019, Home, Wom...",23,"[Akshaj Verma, Jan 1, 2020·6 min read, A dashb..."


In [1]:
import sqlite3
import helper_functions

# Bind the object returned by the project helper to a name so later cells can
# reuse and eventually close it (the recorded output shows a sqlite3.Connection).
# NOTE(review): presumably this opens or creates "medium.db" in the working
# directory; confirm against helper_functions.create_sqlite_connection.
conn = helper_functions.create_sqlite_connection("medium.db")
conn

<sqlite3.Connection at 0x113671a3110>