In [8]:
# Raw GitHub gist URL of the JSON file (ask-metafilter-simonw.json) that the
# download cell fetches.
url = 'https://gist.githubusercontent.com/simonw/29326d38d35ff1d57f1e99538d9edfff/raw/f3e2bca16ffdc077fb948bb3d240439613b62e42/ask-metafilter-simonw.json'

In [9]:
import requests

In [10]:
# Download and decode the JSON export.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting an
# error page fail later inside .json().
response = requests.get(url, timeout=30)
response.raise_for_status()
items = response.json()

In [11]:
# How many records were downloaded.
len(items)

63

In [12]:
# Each downloaded record is a single comment (it carries a 'comment_id' key),
# so refer to the list as `comments` from here on.
comments = items

In [13]:
# Rebind `items` to a fresh empty list; `comments` still refers to the
# downloaded records.
items = []

In [14]:
# Look at the first record to see which keys are available.
comments[0]

{u'comment_id': u'4551046',
 u'date': u'October 27',
 u'html': u'Tommy\u2019s Joynt is a couple of blocks away and is a San Francisco institution - great comfort food, inexpensive, crammed with personality and open late.',
 u'tags': [u'sanfrancisco', u'greatamericanmusichall', u'dinner'],
 u'time': u'8:12 PM',
 u'title': u'Late night dining near Great American Music Hall',
 u'url': u'http://ask.metafilter.com/315108/Late-night-dining-near-Great-American-Music-Hall'}

In [16]:
# Date string of the last record in the list.
comments[-1]['date']

u'January 16, 2005'

In [17]:
# Some date strings have no year ("October 27") while others include one
# ("January 16, 2005"). Append a year to the former so every date can be
# parsed unambiguously. Re-running is harmless: a patched date contains a
# comma and is skipped.
# NOTE(review): presumably 2017 is the year the data was collected - confirm.
MISSING_YEAR = '2017'
for comment in comments:
    if ',' not in comment['date']:
        comment['date'] += ', ' + MISSING_YEAR

In [18]:
# Check that the first record's date now ends with a year.
comments[0]

{u'comment_id': u'4551046',
 u'date': u'October 27, 2017',
 u'html': u'Tommy\u2019s Joynt is a couple of blocks away and is a San Francisco institution - great comfort food, inexpensive, crammed with personality and open late.',
 u'tags': [u'sanfrancisco', u'greatamericanmusichall', u'dinner'],
 u'time': u'8:12 PM',
 u'title': u'Late night dining near Great American Music Hall',
 u'url': u'http://ask.metafilter.com/315108/Late-night-dining-near-Great-American-Music-Hall'}

In [19]:
from dateutil import parser

In [20]:
# Merge the separate 'time' and 'date' strings of every comment into a single
# ISO-8601 timestamp stored under the 'datetime' key.
for comment in comments:
    when = parser.parse(comment['time'] + ' ' + comment['date'])
    comment['datetime'] = when.isoformat()

In [21]:
# The first record should now carry a 'datetime' key.
comments[0]

{u'comment_id': u'4551046',
 u'date': u'October 27, 2017',
 'datetime': '2017-10-27T20:12:00',
 u'html': u'Tommy\u2019s Joynt is a couple of blocks away and is a San Francisco institution - great comfort food, inexpensive, crammed with personality and open late.',
 u'tags': [u'sanfrancisco', u'greatamericanmusichall', u'dinner'],
 u'time': u'8:12 PM',
 u'title': u'Late night dining near Great American Music Hall',
 u'url': u'http://ask.metafilter.com/315108/Late-night-dining-near-Great-American-Music-Hall'}

In [22]:
# Spot-check a record from near the end of the list.
comments[-3]

{u'comment_id': u'329865',
 u'date': u'June 20, 2005',
 'datetime': '2005-06-20T08:18:00',
 u'html': u'The <a href="http://caib.nasa.gov/news/report/default.html">official accident report</a> is surprisingly readable - I had to look at it a while back for a university project.',
 u'tags': [u'Space', u'Shuttle', u'Columbia', u'accident', u'NASA'],
 u'time': u'8:18 AM',
 u'title': u'Space Shuttle Columbia Accident',
 u'url': u'http://ask.metafilter.com/20151/Space-Shuttle-Columbia-Accident'}

In [23]:
# Put the comments in chronological order, in place, using the ISO-8601
# 'datetime' strings as the sort key.
comments.sort(key=lambda entry: entry['datetime'])

In [29]:
# Group the comments into one item per question URL.
# `items` keeps the items in the order their first comment is encountered;
# `items_by_url` is a lookup used to find an already-created item.
items = []
items_by_url = {}
for comment in comments:
    # Deliberately not called `url`: that name holds the gist address the
    # download cell reads, and overwriting it here would make a re-run of
    # that cell fetch the wrong page.
    item_url = comment['url']
    item = items_by_url.get(item_url)
    if item is None:
        # First comment seen for this question: the item takes its datetime,
        # title and tags from this comment.
        item = {
            'url': item_url,
            'datetime': comment['datetime'],
            'title': comment['title'],
            'comments': [],
            'tags': comment['tags'],
        }
        items.append(item)
        items_by_url[item_url] = item
    item['comments'].append({
        'id': comment['comment_id'],
        'datetime': comment['datetime'],
        'html': comment['html']
    })


In [30]:
# Number of distinct question URLs.
len(items)

55

In [31]:
# First item in the grouped list.
items[0]

{'comments': [{'datetime': '2005-01-16T14:08:00',
   'html': u'I\'m a big fan of eBags.com for this kind of thing, because it lets you <a href="http://www.ebags.com/business_cases/laptop_cases/category_search/index.cfm?N=4001+2006675">pick the model of your laptop</a> and then tells you which bags it will fit in. I bought a bag from there last year and the e-commerce / delivery side of things was flawless.',
   'id': u'242583'}],
 'datetime': '2005-01-16T14:08:00',
 'tags': [u'backpacks', u'laptops', u'style', u'accessories', u'bags'],
 'title': u'I need a new backpack',
 'url': u'http://ask.metafilter.com/14075/I-need-a-new-backpack'}

In [32]:
# Last item in the grouped list.
items[-1]

{'comments': [{'datetime': '2017-10-27T20:12:00',
   'html': u'Tommy\u2019s Joynt is a couple of blocks away and is a San Francisco institution - great comfort food, inexpensive, crammed with personality and open late.',
   'id': u'4551046'}],
 'datetime': '2017-10-27T20:12:00',
 'tags': [u'sanfrancisco', u'greatamericanmusichall', u'dinner'],
 'title': u'Late night dining near Great American Music Hall',
 'url': u'http://ask.metafilter.com/315108/Late-night-dining-near-Great-American-Music-Hall'}

In [108]:

def body_for_item(item):
    """Render the HTML body for one grouped item.

    Parameters:
        item: dict with 'url' and 'title' strings and a 'comments' list.
            Each comment is a dict with 'id', 'datetime' (a string that
            dateutil can parse) and 'html' keys.

    Returns:
        A string of newline-separated <p> elements: an attribution paragraph
        linking to the question, followed by the comment text. When there are
        several comments, each later one is introduced by a
        "Then at HH:MM" paragraph linking to that comment.

    Blank paragraphs are omitted, and an item without comments yields only
    the attribution paragraph.
    """
    url = item['url']
    comments = item['comments']
    if len(comments) == 1:
        # A lone comment can be linked to directly via its fragment id.
        url += '#' + comments[0]['id']
    # NOTE(review): title and url are inserted as-is; confirm they are already
    # HTML-safe, otherwise they should be escaped here.
    header = '<p><em>My answer to <a href="%s">%s</a> on Ask MetaFilter</em></p>' % (
        url, item['title']
    )
    bits = []
    for index, comment in enumerate(comments):
        if index:
            # The separator is wrapped in double breaks so that the split
            # below turns it into a paragraph of its own.
            bits.append('<br />\n<br /><em>Then <a href="%s">at %s</a>:</em><br />\n<br />' % (
                url + '#' + comment['id'],
                parser.parse(comment['datetime']).strftime('%H:%M')
            ))
        bits.append(comment['html'])
    # Normalize line endings first so that "<br />\r\n<br />" also counts as a
    # paragraph break.
    done = '\n'.join(bits).replace('\r\n', '\n')
    # Turn <br><br> into paragraphs instead
    paragraphs = [p.strip() for p in done.split('<br />\n<br />')]
    # Skip blank chunks (e.g. from a trailing double break) so no empty
    # <p></p> elements are emitted.
    return '\n'.join([header] + ['<p>%s</p>' % p for p in paragraphs if p])


In [109]:
# Pick the first item whose title contains 'Nile' as a sample for the
# renderer; the [0] raises IndexError if no title matches.
nile = [i for i in items if 'Nile' in i['title']][0]

In [110]:
# Render the sample item to check the generated HTML. The parenthesized form
# of print behaves the same on Python 2 and is also valid on Python 3.
print(body_for_item(nile))

<p><em>My answer to <a href="http://ask.metafilter.com/167075/Nile-Cruises-Does-the-Oberoi-Shehrayar-actually-exist">Nile Cruises: Does the "Oberoi Shehrayar" actually exist?</a> on Ask MetaFilter</em></p>
<p>Yeah, that's the site he showed us I think. I'm leaning towards &quot;bog standard Nile Cruiser adopts confusing name to try and get ahead of the rest&quot; as the explanation at the moment. There are 270 cruisers on the Nile, after all.</p>
<p><em>Then <a href="http://ask.metafilter.com/167075/Nile-Cruises-Does-the-Oberoi-Shehrayar-actually-exist#2402202">at 11:54</a>:</em></p>
<p>Another data point: http://www.luxurynilecruisers.com/tariff.htm is one of the top hits on Google for that ship... but the price list hasn't been updated since 2007. Most of the reviews I've found are from years ago as well, I don't think I've found one dated 2010 or 2009 yet.</p>
<p><em>Then <a href="http://ask.metafilter.com/167075/Nile-Cruises-Does-the-Oberoi-Shehrayar-actually-exist#2402206">at 11:5

In [83]:
# Inspect the comments list of the first item.
items[0]['comments']

[{'datetime': '2005-01-16T14:08:00',
  'html': u'I\'m a big fan of eBags.com for this kind of thing, because it lets you <a href="http://www.ebags.com/business_cases/laptop_cases/category_search/index.cfm?N=4001+2006675">pick the model of your laptop</a> and then tells you which bags it will fit in. I bought a bag from there last year and the e-commerce / delivery side of things was flawless.',
  'id': u'242583'}]

In [111]:
# Render the HTML for every item and store it on the item under 'body'.
for entry in items:
    entry['body'] = body_for_item(entry)

In [85]:
def slugify(s, max_words=5):
    """Build a URL slug from the leading words of ``s``.

    The text is lower-cased, every character other than an ASCII letter,
    digit or space is removed, and the first ``max_words`` whitespace-separated
    words are joined with hyphens.

    Parameters:
        s: the text to convert, e.g. an item title.
        max_words: how many leading words to keep (default 5); pass None to
            keep them all.

    Returns:
        The slug string; empty if ``s`` contains no letters or digits.
    """
    # Imported here so the function does not rely on names that another cell
    # has to define first.
    import re
    cleaned = re.sub('[^a-zA-Z0-9 ]', '', s.lower())
    return '-'.join(cleaned.split()[:max_words])

In [55]:
import re
# Matches any single character other than an ASCII letter, digit or space.
badchar = re.compile('[^a-zA-Z0-9 ]')

In [56]:
# Try slugify on a single title.
slugify(items[0]['title'])

u'i-need-a-new-backpack'

In [57]:
# Slugs for every item title, to eyeball the results.
[slugify(i['title']) for i in items]

[u'i-need-a-new-backpack',
 u'do-content-management-systems-really',
 u'space-shuttle-columbia-accident',
 u'get-tickets-for-filming-of',
 u'can-social-bookmarking-services-prevent',
 u'what-are-some-good-software',
 u'patching-windows-xp-from-burned',
 u'so-long-safari',
 u'what-is-the-physically-smallest',
 u'sending-a-postal-letter-via',
 u'how-is-google-giving-me',
 u'how-can-my-nontechsavvy-mum',
 u'stupid-computersdo-what-i-want',
 u'san-diego-tell-me-more',
 u'problems-with-xhtml-content-type',
 u'help-me-figure-out-my',
 u'what-to-do-on-vacation',
 u'website-for-the-masses',
 u'getting-from-point-a-to',
 u'nile-cruises-does-the-oberoi',
 u'weekend-travel-in-europe',
 u'im-in-polanco-mexico-city',
 u'help-with-next-steps-for',
 u'lets-make-moving-and-packing',
 u'lovecraftinspired-fiction-and-cookbooks-unrelated',
 u'need-new-noms',
 u'difficulty-level-eating-dumplings',
 u'japanese-pantry-staples',
 u'where-should-we-stay-in',
 u'probably-need-to-gtfo-another',
 u'practical-gif

In [112]:
# Give every item a slug derived from its title and mark it as an 'entry'.
for entry in items:
    entry.update({
        'slug': slugify(entry['title']),
        'type': 'entry',
    })

In [113]:
import json
# Write the items out as indented JSON. The with-block closes (and so
# flushes) the file as soon as the write finishes, before anything else
# reads /tmp/items.json.
with open('/tmp/items.json', 'w') as out:
    out.write(json.dumps(items, indent=2))

In [114]:
# Pipe the exported JSON file into pbcopy (the macOS clipboard utility).
!cat /tmp/items.json | pbcopy

In [86]:
# Show the rendered body of one item. The parenthesized form of print behaves
# the same on Python 2 and is also valid on Python 3.
print(items[44]['body'])

<p><em>My answer to <a href="http://ask.metafilter.com/305383/Podcasts-to-love-and-fall-asleep-to#4421965">Podcasts to love and fall asleep to</a> on Ask MetaFilter</em></p>
<p>Not technically podcasts but the BBC's radio output is still free to listen to from anywhere in the world (no ads!) and helps me get to sleep most nights. I just hit play in the browser on my phone.</p>
<p>I tend to start from this page of recent comedy releases: <a href="http://www.bbc.co.uk/radio/programmes/genres/comedy/player/episodes">http://www.bbc.co.uk/radio/programmes/genres/comedy/player/episodes</a></p>
<p>Stuff is available for 30 days after it is broadcast. There are a few episodes of Cabin Pressure up at the moment which is a particular favorite.</p>
