In [1]:
from datetime import datetime
import json
from pprint import pprint as pp
import re

from fuzzywuzzy import fuzz
from prettytable import PrettyTable


def duration_seconds(duration_string):
    fmt = '%H:%M:%S'
    delta = datetime.strptime(duration_string, fmt) \
            - datetime.strptime('00:00:00', fmt)
    return delta.total_seconds()

file_re = re.compile(r'Processing (.+)', re.I)
duration_re = re.compile(r'Duration: ([\d:]+)')
def video_durations(filename):
    with open(filename) as f:
       lines = f.readlines()
    durations = [{"video_file": file_re.search(lines[i]).group(1),
                  "duration": duration_seconds(
                        duration_re.search(lines[i+1]).group(1))}
                  for i in xrange(0, len(lines), 3)]
    return durations


def talk_string(data):
    return ' '.join((' '.join(data.get("speakers")), data.get('title')))


def print_matches(matches):
    x = PrettyTable([
        'Video title',
        'Talk speakers',
        'Talk title',
        #'URL',
        'Match ratio'])
    x.align = 'l'
    cutlimit = 48
    for video, best_match, ratio in matches:
            x.add_row([
                video['title'][:cutlimit],
                "; ".join(best_match['speakers'])[:cutlimit],
                best_match['title'][:cutlimit],
                #video['videos'][0]['src'].replace('http://video-pyconfr2015.paulla.asso.fr/', ''),
                ratio
            ])
    print x

with open('talks.json') as t:
    talks = json.load(t)

with open('videos.json') as v:
    videos = json.load(v)

durations = video_durations('video_durations.txt')
durations_dict = {d['video_file']: d['duration'] for d in durations}

matches = []
misses = []

for video in videos:
    current_talk = video['title']
    ratios = []
    for t in talks:
        ratios.append((fuzz.ratio(current_talk, talk_string(t)), t))
    best_ratio, best_match = max(ratios)
    if best_ratio >= 80:
        matches.append((video, best_match, best_ratio))
    else:
        misses.append(video)

In [2]:
def populate_video_urls(data, video, durations_dict):
    vidurls = []
    for v in video['videos']:
        _, vidfmt = v['type'].split('/')
        if vidfmt == 'ogg':
            vidfmt = 'ogv'
        data['video_{fmt}_url'.format(fmt=vidfmt)] = v['src']
        vidurls.append(v['src'])

    urlset = set(durations_dict.keys()) & set(vidurls)
    if urlset and len(urlset)==1:
        data['duration'] = int(durations_dict[urlset.pop()])
        
def merge_data(video, talk, durations_dict):
    keys = [
        # state
        #1 - live
        #2 - draft
        'state',
        
        'title', #The title of the video.

        # Short summary of the video formatted in Markdown.
        # Should be no more than a single paragraph of a few sentences.
        'summary', 

        # Longer description of the video in Markdown.
        # Outlines, linked timecodes, etc would go here.
        'description',

        # List of tags.
        'tags',

        # The title of the category.
        # The category must exist on the site.
        # If it doesn’t exist, the API will waggle its finger at you. 
        'category',

        # List of speaker names
        # If you pass in speaker names and they don’t exist, the API will create them for you.
        # If they do exist, the API will associate the video with the existing speaker objects. (Yay!)
        'speakers', 

        # notes for quality issues (audio or video) in the video
        'quality_notes',

        # the primary language the video is in
        'language',

        # text for copyright/license--for now it's loose form.
        # if null, use source video link.
        'copyright_text',

        # embed for flash player things
        'embed', 

        # url for the thumbnail
        'thumbnail_url',

        # duration in seconds
        'duration',


        # these are downloadable urls
        'video_ogv_length',
        'video_ogv_url',
        'video_ogv_download_only',
        'video_mp4_length',
        'video_mp4_url',
        'video_mp4_download_only',
        'video_webm_length',
        'video_webm_url',
        'video_webm_download_only',
        'video_flv_length',
        'video_flv_url',
        'video_flv_download_only',

        # source url in case we need to find things again
        'source_url',

        # whiteboard for editor notes
        'whiteboard',

        # when the video was originally recorded
        'recorded',

        # when the video was added to this site
        'added',

        # when the video metadata was last updated
        'updated',

        'slug',
        ]
    data = {
        'state': 2,
        'category': 'PyCon FR 2015',
        'title': talk['title'],
        'speakers': talk['speakers'],
        'language': 'French',
        'summary': talk['description_md'],
        'description': talk['abstract_md'],
        'thumbnail_url': video['video_thumbnail'],
        'video_filename_tmp': video['title'],
        'recorded': talk['date'][:10],
        'source_url': video['page_url'],
    }
    populate_video_urls(data, video, durations_dict)
    
    return data

In [3]:
def write_data_json(data, output_dir):
    filename = data.pop('video_filename_tmp')
    filename = re.sub(r'[,\': -]', '_', filename.encode('ascii', 'ignore'))
    write_to = '{odir}/{filename}.json'.format(odir=output_dir, filename=filename)
    print write_to
    with open(write_to, 'w') as of:
        json.dump(data, of)

In [4]:
merged_data = [merge_data(video, talk, durations_dict)
               for video, talk, _ in matches]

In [5]:
for d in merged_data:
    if d['title'] == u'The Art of Doing Nothing – Using profiling to speed up your code':
        d['language'] = 'English'

In [6]:
output_dir = '../pyconfr2015-steve/json/'
for d in merged_data:
    write_data_json(d, output_dir)

../pyconfr2015-steve/json//012___Boris_FELD___Micro_services_ksako.json
../pyconfr2015-steve/json//011___Nicole_Harris___Learning_Python__Learning_French.json
../pyconfr2015-steve/json//010___Guillaume_Ayoub___Du_site_dynamique_au_document_imprimable_en_Python.json
../pyconfr2015-steve/json//009___Charlie_Clark___The_Art_of_Doing_Nothing___Using_profiling_to_speed_up_your_code.json
../pyconfr2015-steve/json//008___serge_sans_paille___Plus_loin_que_la_mmoization___la_tabulation.json
../pyconfr2015-steve/json//007___ebel_franck___Scapy__application__la_scurit.json
../pyconfr2015-steve/json//006___Boris_FELD___from_Sync_to_to_Async_Python__a_AsyncIO_migration.json
../pyconfr2015-steve/json//005___Feth_Arezki___Inventaire_des_spcificits_de_Python.json
../pyconfr2015-steve/json//004___Alex_Marandon___Piges_et_bizarreries_de_(C)Python.json
../pyconfr2015-steve/json//003___Alexis_Mtaireau__Mathieu_Leplatre__Rmy_Hubscher___Kinto__stocker__synchroniser_et_partager_ses_donnes_de_manire_gnrique.j

In [7]:
misses

[{u'page_url': u'http://video-pyconfr2015.paulla.asso.fr/113_-_Patrice_Journaud,_Florent_Aide_-_Python,_routing_et_TSP.html',
  u'title': u'113 - Patrice Journaud, Florent Aide - Python, routing et TSP',
  u'video_thumbnail': u'http://video-pyconfr2015.paulla.asso.fr/113%20-%20Patrice%20Journaud,%20Florent%20Aide%20-%20Python,%20routing%20et%20TSP.mp4.jpg',
  u'videos': [{u'src': u'http://video-pyconfr2015.paulla.asso.fr/113%20-%20Patrice%20Journaud,%20Florent%20Aide%20-%20Python,%20routing%20et%20TSP.ogv',
    u'type': u'video/ogg'},
   {u'src': u'http://video-pyconfr2015.paulla.asso.fr/113%20-%20Patrice%20Journaud,%20Florent%20Aide%20-%20Python,%20routing%20et%20TSP.webm',
    u'type': u'video/webm'},
   {u'src': u'http://video-pyconfr2015.paulla.asso.fr/113%20-%20Patrice%20Journaud,%20Florent%20Aide%20-%20Python,%20routing%20et%20TSP.mp4',
    u'type': u'video/mp4'}]},
 {u'page_url': u'http://video-pyconfr2015.paulla.asso.fr/015_-_AG_AFPy.html',
  u'title': u'015 - AG AFPy',
  u'vid

In [8]:
video113 = misses[0]

In [9]:
video113_data = {
    'state': 2,
    'category': 'PyCon FR 2015',
    'title': 'Python, routing et TSP',
    'speakers': ['Patrice Journaud', 'Florent Aide'],
    'language': 'French',
    'summary': 'Python, routing et TSP',
    'video_filename_tmp': video113['title']
}
populate_video_urls(video113_data, video113, durations_dict)
pp(video113_data)

{'category': 'PyCon FR 2015',
 'duration': 1718,
 'language': 'French',
 'speakers': ['Patrice Journaud', 'Florent Aide'],
 'state': 2,
 'summary': 'Python, routing et TSP',
 'title': 'Python, routing et TSP',
 'video_filename_tmp': u'113 - Patrice Journaud, Florent Aide - Python, routing et TSP',
 'video_mp4_url': u'http://video-pyconfr2015.paulla.asso.fr/113%20-%20Patrice%20Journaud,%20Florent%20Aide%20-%20Python,%20routing%20et%20TSP.mp4',
 'video_ogv_url': u'http://video-pyconfr2015.paulla.asso.fr/113%20-%20Patrice%20Journaud,%20Florent%20Aide%20-%20Python,%20routing%20et%20TSP.ogv',
 'video_webm_url': u'http://video-pyconfr2015.paulla.asso.fr/113%20-%20Patrice%20Journaud,%20Florent%20Aide%20-%20Python,%20routing%20et%20TSP.webm'}


In [10]:
write_data_json(video113_data, '../pyconfr2015-steve/json/')

../pyconfr2015-steve/json//113___Patrice_Journaud__Florent_Aide___Python__routing_et_TSP.json
