In [1]:
import os

import dateparser
import parsel
import yaml

start = 'pages/program/content'
talks = []
for cnt, talk in enumerate(os.listdir(start), start=1):
    #print(cnt, talk)
    talk_data = {'slug': talk}
    try:
        with open(start + '/' + talk + '/content.en.yaml') as f:
            talk_data.update({'en': yaml.load(f)})
    except:
        pass
    try:
        with open(start + '/' + talk + '/content.ru.yaml') as f:
            talk_data.update({'ru': yaml.load(f)})
    except:
        pass
    try:
        with open(start + '/' + talk + '/media.md') as f:
            selector = parsel.Selector(f.read())
            talk_data['media'] = selector.xpath('//iframe/@src').extract_first()
    except:
        pass
    talks.append(talk_data)



In [2]:
import re
from urllib.parse import urlparse

for t in talks:
    #print(t['en']["speaker"], t['en'].get('title', t['ru'].get('title')), t.get('media'))
    p = urlparse(t.get('media', '')).path
    try:
        d = re.search('/embed/(?P<video_id>.+)', p).groupdict()
        t['youtube_url'] = 'https://www.youtube.com/watch?v={}'.format(d['video_id'])
    except:
        continue

In [3]:
# ---- schedule
schedule = {}
mapping = {
    'day1': dateparser.parse('2014-06-02'),
    'day2': dateparser.parse('2013-06-03'),
}
try:
    with open('pages/program/schedule/data.ru.yaml') as f:
        d = yaml.load(f)
        for day, data in d.items():
            dt = mapping[day]
            for _, t in data.items():
                for talk in t:
                    slug = talk.get('slug')
                    if slug:
                        schedule[slug.lower()] = {
                            'datetime': dt,
                            'title': talk.get('talk'),
                            'speaker': talk.get('speaker'),
                        }
except:
    print(talk)
    raise

In [4]:
set(schedule.keys()) - set(t['slug'].lower() for t in talks)

{'lightning-talks-1', 'lightning-talks-2'}

In [5]:
schedule, len(schedule)

({'asyncio': {'datetime': datetime.datetime(2013, 6, 3, 0, 0),
   'speaker': 'Андрей Светлов, LevelUp',
   'title': 'Как писать для asyncio. Советы от разработчика библиотеки.'},
  'autoscaling-on-the-cloud': {'datetime': datetime.datetime(2014, 6, 2, 0, 0),
   'speaker': 'Simone Soldateschi',
   'title': 'Auto scaling on the Cloud the right way'},
  'bdd_in_python': {'datetime': datetime.datetime(2013, 6, 3, 0, 0),
   'speaker': 'Кирилл Борисов, БАРС Груп',
   'title': 'Behavior Driven Development in Python'},
  'data_analysis': {'datetime': datetime.datetime(2014, 6, 2, 0, 0),
   'speaker': 'Роман Иманкулов, Todoist',
   'title': 'Python для анализа данных'},
  'explore-your-data': {'datetime': datetime.datetime(2013, 6, 3, 0, 0),
   'speaker': 'Honza Král',
   'title': 'From __icontains to search'},
  'hamly': {'datetime': datetime.datetime(2014, 6, 2, 0, 0),
   'speaker': 'Виктор Коцеруба, Имхонет',
   'title': 'Выжимаем максимум из шаблонизатора'},
  'jetbrains': {'datetime': date

In [6]:
set(t['slug'].lower() for t in talks)

{'asyncio',
 'autoscaling-on-the-cloud',
 'bdd_in_python',
 'data_analysis',
 'explore-your-data',
 'hamly',
 'jetbrains',
 'kivy',
 'memory',
 'open_source_advices',
 'pony_orm',
 'python3',
 'python_in_yandex',
 'python_pytest',
 'registrer',
 'scrapinghub',
 'soa',
 'the-sorry-state-of-ssl',
 'underscore',
 'workshop',
 'workshop2',
 'writing-secure-apis',
 'ya_tank'}

In [7]:
import re
from urllib.parse import urlparse

for t in talks:
    try:
        t['datetime'] = schedule[t['slug'].lower()]['datetime'].isoformat()
        t['speaker'] = schedule[t['slug'].lower()]['speaker']
        t['title'] = schedule[t['slug'].lower()]['title']
    except:
        pass

In [None]:
import json

with open('/home/paul/src/pyvideodata/pycon-ru-2014/talks.json', 'w') as f:
    json.dump(talks, f, **{
        'indent': 2,
        'separators': (',', ': '),
        'sort_keys': True,
    })

In [8]:
playlist_urls = '''https://www.youtube.com/watch?v=Ooi5ECECwkA
https://www.youtube.com/watch?v=6CeXt62Dt2A
https://www.youtube.com/watch?v=cvU6vQFugsc
https://www.youtube.com/watch?v=rhMGkTQc0wU
https://www.youtube.com/watch?v=FAK03HHsPWU
https://www.youtube.com/watch?v=LGhMQ8YoEiY
https://www.youtube.com/watch?v=zPrEgGAXdhI
https://www.youtube.com/watch?v=7Ix9aI7apmQ
https://www.youtube.com/watch?v=jRDT7kslD8A
https://www.youtube.com/watch?v=hERb4be-h0w
https://www.youtube.com/watch?v=DTm4bHPZ7Vc
https://www.youtube.com/watch?v=b_rGwaryDmI
https://www.youtube.com/watch?v=vlBzrgfGNbQ
https://www.youtube.com/watch?v=toaB4dbr0D0
https://www.youtube.com/watch?v=n4nfjM1ecgw
https://www.youtube.com/watch?v=yybXi8EsfHo
https://www.youtube.com/watch?v=P0QGFuq_M1g
https://www.youtube.com/watch?v=11hGwtHF3SU
https://www.youtube.com/watch?v=-tsHmOVlRio
https://www.youtube.com/watch?v=La75xaG6HO0
https://www.youtube.com/watch?v=TAowGjrW2Fo
https://www.youtube.com/watch?v=vKkHLNHpFUI
https://www.youtube.com/watch?v=f6-C1ycmYho
https://www.youtube.com/watch?v=-UMzPDr87JU
https://www.youtube.com/watch?v=q7AtGVx3DMM'''.splitlines()

github_urls = '''https://www.youtube.com/watch?v=rhMGkTQc0wU
https://www.youtube.com/watch?v=zPrEgGAXdhI
https://www.youtube.com/watch?v=-tsHmOVlRio
https://www.youtube.com/watch?v=toaB4dbr0D0
https://www.youtube.com/watch?v=q7AtGVx3DMM
https://www.youtube.com/watch?v=vlBzrgfGNbQ
https://www.youtube.com/watch?v=yybXi8EsfHo
https://www.youtube.com/watch?v=f6-C1ycmYho
https://www.youtube.com/watch?v=TAowGjrW2Fo
https://www.youtube.com/watch?v=jRDT7kslD8A
https://www.youtube.com/watch?v=7Ix9aI7apmQ
https://www.youtube.com/watch?v=-UMzPDr87JU
https://www.youtube.com/watch?v=FAK03HHsPWU
https://www.youtube.com/watch?v=11hGwtHF3SU
https://www.youtube.com/watch?v=n4nfjM1ecgw
https://www.youtube.com/watch?v=cvU6vQFugsc
https://www.youtube.com/watch?v=LGhMQ8YoEiY
https://www.youtube.com/watch?v=P0QGFuq_M1g
https://www.youtube.com/watch?v=vKkHLNHpFUI
https://www.youtube.com/watch?v=6CeXt62Dt2A
https://www.youtube.com/watch?v=La75xaG6HO0
https://www.youtube.com/watch?v=Ooi5ECECwkA'''.splitlines()

In [9]:
# in YouTube playlist but not in the website archive
set(playlist_urls) - set(github_urls)

{'https://www.youtube.com/watch?v=DTm4bHPZ7Vc',
 'https://www.youtube.com/watch?v=b_rGwaryDmI',
 'https://www.youtube.com/watch?v=hERb4be-h0w'}

In [10]:
# in the website archive but no in YouTube playlist
set(github_urls) - set(playlist_urls)

set()

In [11]:
import pprint

pprint.pprint(talks)

[{'datetime': '2014-06-02T00:00:00',
  'en': {'description': 'An introduction into writing secure APIs with '
                        'Python.  This covers both integral API design for '
                        'internal APIs as well as remotely exposed APIs; both '
                        'high level basics such as use of protocols (SSL and '
                        'OAuth) as well as how to structure internal security '
                        'contexts.  On top of that it shows how to take '
                        'advantage of Python features to make writing secure '
                        'code easier.\n',
         'speaker': 'Armin Ronacher',
         'speaker_desc': '[Flask](http://flask.pocoo.org/) and '
                         '[Jinja2](http://jinja.pocoo.org/) creator, [Pocoo '
                         'Team](http://www.pocoo.org/) founder, '
                         '[PSF](https://www.python.org/psf) member, developer '
                         'at [Splash Damage](http://