In [1]:
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import requests

import csv
import os
import sys
import re

In [4]:
import requests
import json
import xml.etree.ElementTree as ET
from datetime import datetime

should probably test if markdown is better than html... either way no processing for me at the end except for retrieving image!

In [3]:
def find_latest_posts(user, limit):
    """
        Get latest {limit} posts from {user} in JSON format.

        If {limit} is zero, it will fetch 1 posts
        If there is no {limit}, it will fetch the default value (10)
    """
    url = "https://medium.com/@{0}/latest".format(user)

    if limit is 0:
        params = {"limit": 1}
    elif limit:
        params = {"limit": limit}
    else:
        params = {}

    headers = {'Accept': 'application/json'}

    response = requests.get(url, params, headers=headers)
    return get_post_metadata(response.text)

In [4]:
def get_post_metadata(text):
    """
        Deletes the XML snipplet from the beginning of {text},
        as the Medium API tries to avoid json hijacking

        Looks for Post IDs in {parsed_json}["payload"]["references"]["Post"],
        and if it exists, returns the keys, otherwise an empty array.
    """
    # this is needed, because the Medium API endpoint returns xml snipplet
    # before the json to avoid json hijacking
    # 2017. 06. 07.
    #
    # Takes the substring of {text} between the first occurence of { and the
    # end
    text_without_xml = text[text.find("{"):]
    posts = []
    keys = []
    author = ""

    parsed_json = json.loads(text_without_xml)
    if "payload" in parsed_json:
        if "references" in parsed_json["payload"]:
            if "Post" in parsed_json["payload"]["references"]:
                keys = parsed_json["payload"]["references"]["Post"].keys()
            if "user" in parsed_json["payload"]:
                if "name" in parsed_json["payload"]["user"]:
                    author = parsed_json["payload"]["user"]["name"]

    for key in keys:
        post = parsed_json["payload"]["references"]["Post"][key]
        new_entry = {
            "id": post["id"],
            "title": post["title"],
            "author": author,
            # conver the timestamps from miliseconds to seconds
            "published": float(post["firstPublishedAt"]) / 1000
        }
        posts.append(new_entry)
    return posts

In [5]:
def tag_handler(tag, separator, post_html):
    """
    This function switches from a specific {tag} to a specific
    markdown {separator}

    Example 1: <em>text</em> => *text*
    Example 2: <strong>text</strong> => **text**
    """
    old_tag = tag
    close_tag = "</{0}>".format(tag)
    tag = "<{0}".format(tag)

    start = post_html.find(tag)
    end = post_html.find(close_tag) + len(close_tag)
    start_text = post_html.find(">", post_html.find(tag)) + 1
    end_text = post_html.find(close_tag)

    text = post_html[start_text:end_text]
    new_text = "{1}{0}{1}".format(text, separator)
    post_html = post_html[:start] + new_text + post_html[end:]
    if (post_html.find(tag) >= 0):
        post_html = tag_handler(old_tag, separator, post_html)
    return post_html

In [6]:
def picture_handler(tag, separator, post_html):
    """
    This function switches from a specific {tag} to a specific
    markdown image syntax

    Example:
    <figure>
        <img src="PATH_TO_IMG">
        <figcaption class="imageCaption">CAPTION</figcaption></figure>
    </figure>

    => ![CAPTION](PATH_TO_IMG)
    """
    old_tag = tag
    close_tag = "</{0}>".format(tag)
    tag = "<{0}".format(tag)

    start = post_html.find(tag)
    end = post_html.find(close_tag) + len(close_tag)
    picture_html = post_html[start:end]

    caption = ""
    link = ""
    picture_markdown = ""

    if picture_html.find("src=") >= 0:
        link = picture_html[picture_html.find(
            "src=") + 5:picture_html.find("jpeg\">") + 4]
    if picture_html.find("<figcaption") >= 0:
        caption = picture_html[picture_html.find(
            "<figcaption class=\"imageCaption\">") + 33: picture_html.find("</figcaption>")]

    if len(link) > 0:
        picture_markdown = "<{2}>![{0}]({1})</{2}>".format(caption,
                                                           link, separator)
    post_html = post_html[:start] + picture_markdown + post_html[end:]

    if (post_html.find(tag) >= 0):
        post_html = picture_handler(old_tag, separator, post_html)
    return post_html

In [7]:
def link_handler(tag, post_html):
    """
    This function switches from a specific {tag} to a specific
    markdown link syntax

    Example:
    <a href=URL>CAPTION</a> => [CAPTION](URL)
    """
    old_tag = tag
    close_tag = "</{0}>".format(tag)
    tag = "<{0}".format(tag)

    start = post_html.find(tag)
    end = post_html.find(close_tag) + len(close_tag)
    link_html = post_html[start:end]

    link_markdown = ""
    link = ""
    caption = ""

    caption = link_html[link_html.find(">") + 1: link_html.find("</a>")]
    if link_html.find("href=") >= 0:
        link = link_html[link_html.find(
            "href=") + 6:link_html.find("\" data-href=")]

    if len(caption) == 0:
        caption = link

    if len(link) > 0:
        link_markdown = "[{0}]({1})".format(caption, link)
    post_html = post_html[:start] + link_markdown + post_html[end:]

    if (post_html.find(tag) >= 0):
        post_html = link_handler(old_tag, post_html)
    return post_html

In [8]:
def new_line_handler(tag, post_html):
    """
        Replacing a tag with a tag + closing tag

        Example:

        <br> => <br></br>
    """
    start_tag = "<{0}>".format(tag)
    end_tag = "</{0}>".format(tag)

    return post_html.replace(start_tag, start_tag + end_tag)

In [9]:
def transform_html_to_markdown(response):
    """
        Get the article out of the {response} as html then convert it to markdown.

        I parse it as an xml. Because of this, I have to cut out the following part:

        <div class="section-divider">
            <hr class="section-divider">
        </div>

        as this is not a valid xml snipplet.
    """

    post_md = ""
#     post_html = response[response.find(
#         "<div class=\"section-inner sectionLayout--insetColumn\">"):response.find("</div></section>")]
    post_html = response[response.find('<article'):response.find('</article>')]

    if post_html.find("<strong") >= 0:
        post_html = tag_handler("strong", "**", post_html)

    if post_html.find("<em") >= 0:
        post_html = tag_handler("em", "*", post_html)

    if post_html.find("<figure") >= 0:
        post_html = picture_handler("figure", "p", post_html)

    if post_html.find("<a") >= 0:
        post_html = link_handler("a", post_html)
    if post_html.find("<br") >= 0:
        post_html = new_line_handler("br", post_html)
        post_html = tag_handler("br", "\n", post_html)
        
    print(post_html)
    print("bruh")

    root = ET.fromstring(post_html)

    for child in root:
        if child.tag == "h1":
            post_md += "# {0}".format(child.text)
        elif child.tag == "h2":
            post_md += "## {0}".format(child.text)
        elif child.tag == "h3":
            post_md += "### {0}".format(child.text)
        elif child.tag == "p":
            # TODO: links
            post_md += "\n\n{0}".format(child.text)
        elif child.tag == "ul":
            for item in child:
                post_md += "\n - {0}".format(item.text)
        elif child.tag == "ol":
            for index, item in enumerate(child, start=1):
                post_md += "\n {1}. {0}".format(item.text, index)
        else:
            print("\nunkown tag: ", child.tag, child.text, "\n")

    return post_md

In [10]:
def download_posts_in_html(user, posts, category):
    """
        Download a {user}'s posts. The post ids are defined in the {posts}.

        The {posts} also contains the article's title and publish that.
        These combined will be the filename for the markdown files.
    """
    html_posts = []
    main_url = "https://medium.com/@{0}/".format(user)
    print("Downloading posts from user {0}.\n".format(user))

    for post in posts:
        url = main_url + post["id"]

        # for the filename getting the date of the publication and
        date = datetime.fromtimestamp(post["published"])
        full_date = date.strftime("%Y-%m-%d %H:%M:%S")
        date = date.strftime("%Y-%m-%d")
        # dasherizing the title
        title = post["title"].lower().replace(" ", "-")
        filename = "{0}-{1}".format(date, title)
        print("Saving post {0} - {1} into {2}...\t".format(post["id"],post["title"], filename), end="")
        author = ""
        if len(post["author"]) > 0:
            author = "author: {0}".format(post["author"])

        if category is None:
            category = "medium"
        post_header = "---\nlayout: post\ntitle: {0}\ndate: {1}\ncategories: {2}\n{3}\n---\n".format(post["title"], full_date, category, author)
        try:
            post_file = open("medium_posts_markdown/{0}.md".format(filename), "w")
            response = requests.get(url)
            post_file.write(post_header + transform_html_to_markdown(response.text))
            post_file.close()
            print("done")
        except ET.ParseError:
            print("Something went wrong during the parsing of this post.")

    print("\nThe files can be found in the medium_posts_markdown folder.")
    return html_posts

In [2]:
url = "https://onezero.medium.com/facebook-insists-no-security-backdoor-is-planned-for-whatsapp-e9d7a298bb69?format=json"

In [3]:
import json
import requests

In [4]:
response = requests.get(url).text

In [5]:
data = json.loads(response[response.find('{'):])

In [6]:
s = data['payload']

In [8]:
s['value']['content']['bodyModel']['paragraphs']

[{'name': '743e',
  'type': 3,
  'text': 'Facebook Insists No Security ‘Backdoor’ Is Planned for WhatsApp',
  'markups': []},
 {'name': '6b27',
  'type': 13,
  'text': 'The company is fighting back against rumors that it would scan messages on users’ phones prior to encryption',
  'markups': []},
 {'name': '4147',
  'type': 4,
  'text': 'Photo: SOPA Images/Getty Images',
  'markups': [],
  'layout': 1,
  'metadata': {'id': '1*moEaNMSad1vss3gXQCuoeA.jpeg',
   'originalWidth': 1024,
   'originalHeight': 683,
   'isFeatured': True,
   'focusPercentX': 57,
   'focusPercentY': 68}},
 {'name': 'b427',
  'type': 1,
  'text': 'Billions of people use the messaging tool WhatsApp, which added end-to-end encryption for every form of communication available on its platform back in 2016. This ensures that conversations between users and their contacts — whether they occur via text or voice calls — are private, inaccessible even to the company itself.',
  'markups': [],
  'hasDropCap': True,
  'dropC

In [133]:
def processMarkup(p):
    markups = p['markups']
    markups.sort(key=lambda x: x['start'])
    prev = 0
    text = ''
    for mark in markups:
        start = mark['start'] + prev
        end = mark['end'] + prev
        if mark['type'] == 1: #bold
            text = p['text'][:start] + '**' + p['text'][start:end] + '**' + p['text'][end:]
            prev += 3
        elif mark['type'] == 2: #italic
            text = p['text'][:start] + '*' + p['text'][start:end] + '*' + p['text'][end:]
            prev += 2
        elif mark['type'] == 3: #anchor tag
            close_anchor = f"]({mark['href']})"
            text = p['text'][:start] + '[' + p['text'][start:end] + close_anchor + p['text'][end:]
            prev += len(close_anchor) + 1
        p['text'] = text
    return p

In [134]:
test = {'name': '257b',
  'type': 1,
  'text': 'But several recent posts published to Forbes’ blogging platform call WhatsApp’s future security into question. The posts, which were written by contributor Kalev Leetaru, allege that Facebook, WhatsApp’s parent company, plans to detect abuse by implementing a feature to scan messages directly on people’s phones before they are encrypted. The posts gained significant attention: A blog post by technologist Bruce Schneier rehashing one of the Forbes posts has the headline “Facebook Plans on Backdooring WhatsApp.”',
  'markups': [{'type': 3,
    'start': 382,
    'end': 422,
    'href': 'https://www.schneier.com/blog/archives/2019/08/facebook_plans_.html',
    'title': '',
    'rel': 'noopener',
    'anchorType': 0},
   {'type': 2, 'start': 38, 'end': 44},
   {'type': 2, 'start': 444, 'end': 451}]}

In [135]:
pls = {'name': 'd939',
  'type': 1,
  'text': 'italic. bold. link. italic bold. italic bold link.',
  'markups': [{'type': 3,
    'start': 14,
    'end': 19,
    'href': 'http://google.com',
    'title': '',
    'rel': '',
    'anchorType': 0},
   {'type': 3,
    'start': 33,
    'end': 50,
    'href': 'http://bruh.org',
    'title': '',
    'rel': '',
    'anchorType': 0},
   {'type': 1, 'start': 8, 'end': 13},
   {'type': 1, 'start': 20, 'end': 32},
   {'type': 1, 'start': 33, 'end': 50},
   {'type': 2, 'start': 0, 'end': 8},
   {'type': 2, 'start': 13, 'end': 14},
   {'type': 2, 'start': 19, 'end': 50}]}

In [137]:
processMarkup(pls)

{'name': 'd939',
 'type': 1,
 'text': '*italic. ***bold.****[ link](http://google.com)*. **italic bold.**[ italic bold link]**(http://bruh.org)***.',
 'markups': [{'type': 2, 'start': 0, 'end': 8},
  {'type': 1, 'start': 8, 'end': 13},
  {'type': 2, 'start': 13, 'end': 14},
  {'type': 3,
   'start': 14,
   'end': 19,
   'href': 'http://google.com',
   'title': '',
   'rel': '',
   'anchorType': 0},
  {'type': 2, 'start': 19, 'end': 50},
  {'type': 1, 'start': 20, 'end': 32},
  {'type': 3,
   'start': 33,
   'end': 50,
   'href': 'http://bruh.org',
   'title': '',
   'rel': '',
   'anchorType': 0},
  {'type': 1, 'start': 33, 'end': 50}]}

In [86]:
def processParagraph(p):
    if p['markups']:
        p = processMarkup(p)
    markup = ''
    if p['type'] == 1:
        markup = '\n'
    elif p['type'] == 2:
        p['text'] = '\n# ' + p['text'].replace(/\n/g, '\n# ')
    elif p['type'] == 3:
        p['text'] = '\n## ' + p['text'].replace(/\n/g, '\n## ')
    elif p['type'] == 4: #image and caption
        text = '\n![' + p['text'] + '](' + p.metadata.id + ')'
        if (p['text']):
            text += '*' + p['text'] + '*'
        p['text'] = text
    elif p['type'] == 6:
        markup = '> '
    elif p['type'] == 7: #quote
        p['text'] = '> # ' + p['text'].replace(/\n/g, '\n> # ')
    elif p['type'] == 8:
        p['text'] = '\n```\n' + p['text'] + '\n```\n'
    elif p['type'] == 9:
        markup = '\n* '
    elif p['type'] == 10:
        markup = '\n1. '
    elif p['type'] == 11:
        return getGitHubEmbed()
    elif p['type'] == 13:
        markup = '\n### '
    elif p['type'] == 15: #caption for section image
        p['text'] = '*' + p['text'] + '*'
    
    p['text'] = markup + p['text']
    if (p['alignment'] == 2 and p['type'] != 6 and p['type'] != 7)
    p['text'] = '<center>' + p['text'] + '</center>'

    return p['text']

SyntaxError: invalid syntax (<ipython-input-86-30c08ee43982>, line 8)