# Data collection
This notebook collects data from several data sources and endpoints and saves the raw data in a SQLite database.

In the following, I search the TowardsDataScience archive (https://towardsdatascience.com/archive/year/month/day) for articles published within a given time frame.

References: 
https://hackernoon.com/how-to-scrape-a-medium-publication-a-python-tutorial-for-beginners-o8u3t69

In [15]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

def extract_data(url, date_published, timeout=30):
    """ Scrape one Medium archive page plus every story it links to.

    The archive page at ``url`` is parsed for story previews. For each
    preview the metadata shown in the archive (title, subtitle, claps,
    responses, author link, reading time) is collected; then the story page
    itself is downloaded to gather its paragraphs and ``<h1>`` section titles.

    Previews without an author link, a reading time or a story link are
    skipped so that one malformed preview does not abort the whole run.

    :param url: URL of the archive page to scrape
    :param date_published: value stored unchanged in the ``date_published`` column
    :param timeout: seconds handed to every ``requests.get`` call, so a stalled
        connection raises instead of blocking the notebook indefinitely
    :return: ``pandas.DataFrame`` with one row per story and the columns
        date_published, title, subtitle, claps, responses, author_url,
        story_url, reading_time, number_sections, section_titles,
        number_paragraphs, paragraphs
    """
    # Class strings used to select elements of the archive markup.
    story_class = 'streamItem streamItem--postPreview js-streamItem'
    author_box_class = 'postMetaInline u-floatLeft u-sm-maxWidthFullWidth'
    claps_class = ('button button--chromeless u-baseColor--buttonNormal '
                   'js-multirecommendCountButton u-disablePointerEvents')
    responses_class = 'button button--chromeless u-baseColor--buttonNormal'
    story_link_class = 'button button--smaller button--chromeless u-baseColor--buttonNormal'

    columns = ['date_published', 'title', 'subtitle', 'claps', 'responses',
               'author_url', 'story_url', 'reading_time',
               'number_sections', 'section_titles',
               'number_paragraphs', 'paragraphs']

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    stories_data = []
    for story in soup.find_all('div', class_=story_class):
        author_box = story.find('div', class_=author_box_class)
        if author_box is None:
            continue

        author_link = author_box.find('a')
        reading_time_tag = author_box.find('span', class_='readingTime')
        story_link = story.find('a', class_=story_link_class)
        if author_link is None or reading_time_tag is None or story_link is None:
            continue

        author_url = author_link.get('href')
        story_url = story_link.get('href')
        # data cleaning: only the first whitespace-separated token is kept
        reading_time_parts = (reading_time_tag.get('title') or '').split()
        if not author_url or not story_url or not reading_time_parts:
            continue
        reading_time = reading_time_parts[0]

        title_tag = story.find('h3')
        subtitle_tag = story.find('h4')
        title = title_tag.text if title_tag is not None else '-'
        subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

        # NOTE: the fallback stays the integer 0 (unchanged for callers) while
        # scraped clap counts are strings.
        claps_tag = story.find('button', class_=claps_class)
        claps = claps_tag.text if claps_tag is not None else 0

        responses_tag = story.find('a', class_=responses_class)
        responses_text = responses_tag.text if responses_tag is not None else '0 responses'
        # data cleaning: keep the leading token, defaulting to '0' for empty text
        responses_parts = responses_text.split()
        responses = responses_parts[0] if responses_parts else '0'

        # download the story itself to collect its paragraphs and section titles
        story_page = requests.get(story_url, timeout=timeout)
        story_soup = BeautifulSoup(story_page.text, 'html.parser')

        story_paragraphs = []
        section_titles = []
        for section in story_soup.find_all('section'):
            story_paragraphs.extend(paragraph.text for paragraph in section.find_all('p'))
            section_titles.extend(sub.text for sub in section.find_all('h1'))

        # one row per story, in the same order as ``columns``
        stories_data.append([
            date_published,
            title,
            subtitle,
            claps,
            responses,
            author_url,
            story_url,
            reading_time,
            len(section_titles),
            section_titles,
            len(story_paragraphs),
            story_paragraphs,
        ])

    # write data to a data frame and return it
    return pd.DataFrame(stories_data, columns=columns)

In [25]:
# Exploratory check: print the story links found on the archive page for 2020-01-01.
url = f'https://towardsdatascience.com/archive/2020/01/01'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# story previews are selected by this class string
stories = soup.find_all('div', class_='streamItem streamItem--postPreview js-streamItem')
for story in stories:
    # find() returns None when no matching <a> exists, in which case ['href'] raises
    story_url = story.find('a', class_='button button--smaller button--chromeless u-baseColor--buttonNormal')['href']
    print(story_url)

https://towardsdatascience.com/making-python-programs-blazingly-fast-c1cd79bd1b32?source=collection_archive---------0-----------------------
https://towardsdatascience.com/implementing-a-fully-convolutional-network-fcn-in-tensorflow-2-3c46fb61de3b?source=collection_archive---------1-----------------------
https://towardsdatascience.com/6-new-features-in-python-3-8-for-python-newbies-dc2e7b804acc?source=collection_archive---------2-----------------------
https://towardsdatascience.com/how-to-be-fancy-with-python-8e4c53f47789?source=collection_archive---------3-----------------------
https://towardsdatascience.com/perfectly-privacy-preserving-ai-c14698f322f5?source=collection_archive---------4-----------------------
https://towardsdatascience.com/from-scratch-to-search-playing-with-your-data-elasticsearch-ingest-pipelines-6d054bf5d866?source=collection_archive---------5-----------------------
https://towardsdatascience.com/gan-pix2pix-generative-model-c9bf5d691bac?source=collection_archi

# Save the results in a SQLite database
After each scraping process, save the content of the data frame into a SQLite database. You can find the functions used in the module helper_functions.py.

In [3]:
import sqlite3
import helper_functions

# connect to SQLite database medium.db
# create_sqlite_connection is defined in helper_functions.py (not shown here);
# presumably it returns a sqlite3 connection - TODO confirm
con = helper_functions.create_sqlite_connection("medium.db")
# cursor used by the following cells to run SQL statements
cur = con.cursor()

In [4]:
# define sql statement to create the scratch table "test" if it does not exist
#
# A surrogate integer key is used as primary key: the scraper stores
# date_published as an 'MM/DD/YYYY' string and yields many stories per date,
# so the date can neither be an INTEGER PRIMARY KEY nor be unique.
sql_create_table_stories = """
                                CREATE TABLE IF NOT EXISTS test (

                                    id INTEGER PRIMARY KEY,
                                    date_published TEXT,
                                    title TEXT NOT NULL,
                                    subtitle TEXT,
                                    claps TEXT NOT NULL,
                                    responses INTEGER,
                                    author_url TEXT NOT NULL,
                                    reading_time INTEGER

                            );
                            """

In [5]:
# execute sql statement; CREATE TABLE IF NOT EXISTS makes re-running this cell harmless
cur.execute(sql_create_table_stories)

<sqlite3.Cursor at 0x1b73d4ccdc0>

In [6]:
# print all tables: query sqlite_master for the names of all entries of type 'table'
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('test',)]


In [7]:
# drop table "test"; the table list is printed before and after the drop
# so the effect can be checked (helpers come from helper_functions.py, not shown here)
helper_functions.print_tables_in_database(cur)
helper_functions.drop_table(cur, 'test')
helper_functions.print_tables_in_database(cur)

[('test',)]
[]


# Execute the extraction process iteratively
Each HTTP request fetches the archive page that lists all articles published on the given date. By iterating over a list of all dates in a certain time frame, we get all articles published on TowardsDataScience in that time frame.

In [16]:
from datetime import date, timedelta

# build a list with one entry for each day from sdate to edate (both inclusive)
sdate = date(2020, 1, 1)   # start date
edate = date(2020, 1, 3)   # end date

delta = edate - sdate       # as timedelta

# delta.days + 1 offsets so that the end date itself is part of the list
days = [sdate + timedelta(days=offset) for offset in range(delta.days + 1)]

In [17]:
# range(0,1,1) yields only k = 0, so just the first entry of `days` is scraped
for k in range(0,1,1):
    # zero-pad month and day to two digits for the /archive/year/month/day URL
    year = str(days[k].year)
    month = str(days[k].month).zfill(2)
    day = str(days[k].day).zfill(2)
    
    # date_published is stored as an 'MM/DD/YYYY' string
    date_published = f'{month}/{day}/{year}'
    url = f'https://towardsdatascience.com/archive/{year}/{month}/{day}'
    print("Url: " + url)
    
    # NOTE(review): df is reassigned on every iteration, so with more than one
    # day only the last day's frame would survive the loop
    df = extract_data(url, date_published)
    
    # save df to sql
    # NOTE(review): the connection created earlier in this notebook is named `con`, not `conn`
    # df.to_sql('stories', con=conn, if_exists='append')

Url: https://towardsdatascience.com/archive/2020/01/01


In [26]:
# URL of the first scraped story (row label 0), fetched with a single label-based lookup
df.loc[0, "story_url"]

'https://towardsdatascience.com/making-python-programs-blazingly-fast-c1cd79bd1b32?source=collection_archive---------0-----------------------'

# Read important data directly from story

In [30]:
import requests
from bs4 import BeautifulSoup

# Exploratory check: download one story and collect its <figcaption> elements.
# NOTE(review): `url`, `page` and `soup` are notebook globals that other cells also assign
url = "https://towardsdatascience.com/7-of-the-most-commonly-used-regression-algorithms-and-how-to-choose-the-right-one-fc3c8890f9e3"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# find_all returns every <figcaption> tag of the parsed page
figures = soup.find_all('figcaption')

In [42]:
class Story:
    '''
        Class to export all required data from a specific story.

        Attributes set by the constructor:
            page: the ``requests`` response for the story URL
            soup: ``BeautifulSoup`` parse of ``page.text``
            figure_captures: list with the inner HTML of every <figcaption>
                element of the story, in document order
    '''
    def __init__(self, url, timeout=30):
        '''
        Download and parse the story, then extract its figure captions.

        :param url: address of the story to download
        :param timeout: seconds handed to ``requests.get`` so a stalled
            connection raises instead of blocking indefinitely
        '''
        self.page = requests.get(url, timeout=timeout)
        # Parse the response fetched for *this* story; reading a notebook-global
        # `page` here would silently analyse whatever page was downloaded last.
        self.soup = BeautifulSoup(self.page.text, 'html.parser')

        ##########################################################################################################
        # Find all figure captions
        ##########################################################################################################
        # decode_contents() renders only what is inside the tag, so the result
        # does not depend on the class attribute Medium puts on <figcaption>.
        self.figure_captures = [
            figure.decode_contents() for figure in self.soup.find_all('figcaption')
        ]

In [43]:
# Build a Story for the regression-algorithms article; the constructor downloads
# and parses the page and fills story.figure_captures
story = Story("https://towardsdatascience.com/7-of-the-most-commonly-used-regression-algorithms-and-how-to-choose-the-right-one-fc3c8890f9e3")

['Regression Algorithms — Image by the author',
 'Overview of types of learning — Image by the author',
 'Linear Regression: interception term and regression coefficients — Image by the author',
 'Global trend models [Fah16, p.512]',
 'Polynomial Regression: Sample Model — Image by the author',
 'Effects of individual outliers on the linear regression model — Image by the author',
 'RANSAC algorithm — Image by the author',
 'RANSAC algorithm: Four iterations of the model building process (Min_Samples =2, Threshhold = 20) — Image by the author',
 'Decision tree for a simple two-dimensional case with a depth of one — Image by the author',
 'Random Forest: Sample Model — Image by the author',
 'A-priori Gaussian-Prozess using a Squared Exponential Kernel — Image by the autor (inspired by [Sci18n][Duv14])',
 'Squared Exponential Kernel: Influence of Hyperparameters — Image by the author',
 'A-priori Gaussian-Prozess using Rational Quadratic Kernel — Image by the autor (inspired by [Sci18n]

In [None]:
def extract_data(url, date_published, timeout=30):
    """ Scrape one Medium archive page plus every story it links to.

    The archive page at ``url`` is parsed for story previews. For each
    preview the metadata shown in the archive (title, subtitle, claps,
    responses, author link, reading time) is collected; then the story page
    itself is downloaded to gather its paragraphs and ``<h1>`` section titles.

    Previews without an author link, a reading time or a story link are
    skipped so that one malformed preview does not abort the whole run.

    :param url: URL of the archive page to scrape
    :param date_published: value stored unchanged in the ``date_published`` column
    :param timeout: seconds handed to every ``requests.get`` call, so a stalled
        connection raises instead of blocking the notebook indefinitely
    :return: ``pandas.DataFrame`` with one row per story and the columns
        date_published, title, subtitle, claps, responses, author_url,
        story_url, reading_time, number_sections, section_titles,
        number_paragraphs, paragraphs
    """
    # Class strings used to select elements of the archive markup.
    story_class = 'streamItem streamItem--postPreview js-streamItem'
    author_box_class = 'postMetaInline u-floatLeft u-sm-maxWidthFullWidth'
    claps_class = ('button button--chromeless u-baseColor--buttonNormal '
                   'js-multirecommendCountButton u-disablePointerEvents')
    responses_class = 'button button--chromeless u-baseColor--buttonNormal'
    story_link_class = 'button button--smaller button--chromeless u-baseColor--buttonNormal'

    columns = ['date_published', 'title', 'subtitle', 'claps', 'responses',
               'author_url', 'story_url', 'reading_time',
               'number_sections', 'section_titles',
               'number_paragraphs', 'paragraphs']

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')

    stories_data = []
    for story in soup.find_all('div', class_=story_class):
        author_box = story.find('div', class_=author_box_class)
        if author_box is None:
            continue

        author_link = author_box.find('a')
        reading_time_tag = author_box.find('span', class_='readingTime')
        story_link = story.find('a', class_=story_link_class)
        if author_link is None or reading_time_tag is None or story_link is None:
            continue

        author_url = author_link.get('href')
        story_url = story_link.get('href')
        # data cleaning: only the first whitespace-separated token is kept
        reading_time_parts = (reading_time_tag.get('title') or '').split()
        if not author_url or not story_url or not reading_time_parts:
            continue
        reading_time = reading_time_parts[0]

        title_tag = story.find('h3')
        subtitle_tag = story.find('h4')
        title = title_tag.text if title_tag is not None else '-'
        subtitle = subtitle_tag.text if subtitle_tag is not None else '-'

        # NOTE: the fallback stays the integer 0 (unchanged for callers) while
        # scraped clap counts are strings.
        claps_tag = story.find('button', class_=claps_class)
        claps = claps_tag.text if claps_tag is not None else 0

        responses_tag = story.find('a', class_=responses_class)
        responses_text = responses_tag.text if responses_tag is not None else '0 responses'
        # data cleaning: keep the leading token, defaulting to '0' for empty text
        responses_parts = responses_text.split()
        responses = responses_parts[0] if responses_parts else '0'

        # download the story itself to collect its paragraphs and section titles
        story_page = requests.get(story_url, timeout=timeout)
        story_soup = BeautifulSoup(story_page.text, 'html.parser')

        story_paragraphs = []
        section_titles = []
        for section in story_soup.find_all('section'):
            story_paragraphs.extend(paragraph.text for paragraph in section.find_all('p'))
            section_titles.extend(sub.text for sub in section.find_all('h1'))

        # one row per story, in the same order as ``columns``
        stories_data.append([
            date_published,
            title,
            subtitle,
            claps,
            responses,
            author_url,
            story_url,
            reading_time,
            len(section_titles),
            section_titles,
            len(story_paragraphs),
            story_paragraphs,
        ])

    # write data to a data frame and return it
    return pd.DataFrame(stories_data, columns=columns)

In [35]:
# Download and parse the regression-algorithms story; `soup` is the input for
# extract_figure_captures defined below
url = "https://towardsdatascience.com/7-of-the-most-commonly-used-regression-algorithms-and-how-to-choose-the-right-one-fc3c8890f9e3"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')


def extract_figure_captures(soup):
    """ Return the caption of every figure in a parsed story.

    :param soup: parsed document offering ``find_all`` (e.g. a ``BeautifulSoup`` object)
    :return: list with the inner HTML of each ``<figcaption>`` element, in
        document order; empty list when the document has no captions
    """
    # decode_contents() renders only what is inside the tag, so the result does
    # not depend on the class attribute Medium puts on <figcaption>.
    return [figure.decode_contents() for figure in soup.find_all('figcaption')]

['Regression Algorithms — Image by the author',
 'Overview of types of learning — Image by the author',
 'Linear Regression: interception term and regression coefficients — Image by the author',
 'Global trend models [Fah16, p.512]',
 'Polynomial Regression: Sample Model — Image by the author',
 'Effects of individual outliers on the linear regression model — Image by the author',
 'RANSAC algorithm — Image by the author',
 'RANSAC algorithm: Four iterations of the model building process (Min_Samples =2, Threshhold = 20) — Image by the author',
 'Decision tree for a simple two-dimensional case with a depth of one — Image by the author',
 'Random Forest: Sample Model — Image by the author',
 'A-priori Gaussian-Prozess using a Squared Exponential Kernel — Image by the autor (inspired by [Sci18n][Duv14])',
 'Squared Exponential Kernel: Influence of Hyperparameters — Image by the author',
 'A-priori Gaussian-Prozess using Rational Quadratic Kernel — Image by the autor (inspired by [Sci18n]

In [21]:
class SQLiteConnection:
    """ Small wrapper around a SQLite connection plus table helpers.

    The helpers are static methods that work on an explicitly passed cursor,
    so they can be called on the class or on an instance alike. All of them
    are best-effort: errors are printed instead of raised.
    """

    def __init__(self, db_file):
        """ create a database connection to the SQLite database
            specified by db_file

        The connection is stored in ``self.conn``; it stays ``None`` when the
        connection attempt fails (the error is printed).

        :param db_file: database file
        """
        # defined up front so the attribute exists even if connect() fails
        self.conn = None
        try:
            self.conn = sqlite3.connect(db_file)
        except Exception as exc:
            print("Error:", exc)

    @staticmethod
    def create_table(cur, create_table_sql):
        """ create a table from the create_table_sql statement
        :param cur: Defined cursor
        :param create_table_sql: a CREATE TABLE statement
        :return:
        """
        try:
            cur.execute(create_table_sql)
        except Exception as exc:
            print("Error:", exc)

    @staticmethod
    def drop_table(cur, table_to_drop):
        """ drop a table
        :param cur: Defined cursor
        :param table_to_drop: specify the table name you want to drop
        :return:
        """
        # NOTE: the table name is interpolated into the statement verbatim,
        # so it must come from a trusted source.
        sql_drop_table = f"drop table {table_to_drop}"

        try:
            cur.execute(sql_drop_table)
        except Exception as exc:
            print("Error:", exc)

    @staticmethod
    def print_tables_in_database(cur):
        """ print the names of all tables listed in sqlite_master
        :param cur: Defined cursor
        :return:
        """
        try:
            cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
            print(cur.fetchall())
        except Exception as exc:
            print("Error:", exc)

AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?