In [1]:
import xml.dom.minidom
import xml.etree.ElementTree as ET

import zipfile
import os
from collections import defaultdict
from glob import glob
from os.path import join as pjoin

In [2]:
filepath_2016 = os.listdir("wit/XML_releases/xml/")
filepath_2015 = os.listdir("wit/XML_releases/xml-20150616/")
filepath_2014 = os.listdir("wit/XML_releases/xml-20140120/")

In [3]:
print(len(filepath_2016))
print(len(filepath_2015))
print(len(filepath_2014))

218
210
204


In [4]:
def et_to_dict(tree):
    dct = {tree.tag: {} if tree.attrib else None}
    children = list(tree)
    if children:
        dd = defaultdict(list)
        for dc in map(et_to_dict, children):
            for k, v in dc.items():
                dd[k].append(v)
        dct = {tree.tag: dd}
    if tree.attrib:
        dct[tree.tag].update((k, v) for k, v in tree.attrib.items())
    if tree.text:
        text = tree.text.strip()
        if children or tree.attrib:
            if text:
                dct[tree.tag]["text"] = text
        else:
            dct[tree.tag] = text
    return dct

### Files which are corrupted (truncated?)
* XML_releases/xml-20150616/ted_ru-20150530.zip 

    *file_id* = 37 missing *wordnum*, *charnum* & *content*
 
* ted_uk-20150530.zip
    *file_id* = 6 


In [5]:
# for file in filepath_2015:
#     if file.startswith('._ted') or 'wit3.dtd' in file:
#         continue
#     else:
#         try:
#             with zipfile.ZipFile("wit/XML_releases/xml-20150616//" + file) as zf:
#                 tree = ET.parse(zf.open(file[:-3]+"xml"))
#                 root = tree.getroot()
#                 lang1_talks = et_to_dict(root)
#         except:
#             print(file)

#### I will use the next few code blocks to test the validity of each file 

Result after running the next few blocks:

- Added checks to check if description & title fields are present. As a default behaviour also added a " " via get 

- Sometimes content & transcriptions are not present. Checked before assigning it to a variable. 

Both the above changes ensure that there are no further errors. Integrated it into the data_loader

In [6]:
path_2016 = "wit/XML_releases/xml/"
path_2015 = "wit/XML_releases/xml-20150616/"
path_2014 = "wit/XML_releases/xml-20140120/"

In [133]:
%%time
translation = list()
for file in filepath_2016:
    if file.startswith('._ted') or 'wit3.dtd' in file:
        continue
    else:
        # Get language code
        try:
            lang_code = file.split('-')[0].replace("ted_", "") 
        except:
            print(f"ERROR: Reading language code for file:: {file}")
            
        # Break if the file has malformed XML
        try:
            with zipfile.ZipFile(path_2016 + file) as zf:
                tree = ET.parse(zf.open(file[:-3]+"xml"))
                root = tree.getroot()
                lang_talks = et_to_dict(root).get('xml').get('file')
        except:
            print(f"ERROR: Malformed or truncated XML file:: {file}")
            continue
        
        lang_talksids = [talk.get("head")[0].get("talkid") for talk in lang_talks]
        
        translation = list()
        for talkid in lang_talksids:
            source = list(filter(lambda talk: talk['head'][0]['talkid'] == talkid, lang_talks))
            if len(source) == 0:
                print(f"No talks found for talkid::{talkid} and file:: {file}")
                pass
            else:
                source = source[0]
            try:

                if source['head'][0]['description']:
                    if source['head'][0]['description'][0]:
                        temp_dict = dict()
                        temp_dict['id'] = source['head'][0].get('talkid')[0] + "_1"
                        temp_dict[lang_code] =  source['head'][0].get('description', " ")[0].replace("TED Talk Subtitles and Transcript: ", "")
                        translation.append(temp_dict)
            except:
                print(f"ERROR: Reading description for talkid::{talkid}, file:: {file}")
            
            try:
                if source['head'][0]['title']:
                    if source['head'][0]['title'][0]:
                        temp_dict = dict()
                        temp_dict['id'] = source['head'][0].get('talkid')[0] + "_2"
                        temp_dict[lang_code] =  source['head'][0]['title'][0]
                        translation.append(temp_dict)
            except:
                print(f"ERROR: Reading title for talkid::{talkid}, file:: {file}")
                
            try:
                if source.get('head')[0].get('transcription'): 
                    source_transc = source.get('head')[0].get('transcription')[0].get('seekvideo')
                    transcriptions = [{'id': s.get('id'), lang_code: s.get('text')} for s in source_transc]
                    translation.extend(transcriptions)
            except:
                print(f"ERROR: Reading transcriptions for talkid::{talkid}, file:: {file}")
                
print(f"LENGTH OF TRANSLATION: {len(translation)}")

LENGTH OF TRANSLATION: 5
CPU times: user 3min 3s, sys: 6.53 s, total: 3min 9s
Wall time: 3min 9s


In [127]:
filepath = path_2015
filename = "ted_ja-20150530.zip"

with zipfile.ZipFile(filepath + filename) as zf:
    tree = ET.parse(zf.open(filename[:-3]+"xml"))
    root = tree.getroot()
    lang_talks = et_to_dict(root).get('xml').get('file')
    lang_code = filename.split('-20')[0].replace("ted_","")

print(len(lang_talks))
print(lang_code)

1766
ja


In [128]:
talkid = "1435"
source = list(filter(lambda talk: talk['head'][0]['talkid'][0] == talkid, lang_talks))
source = source[0]

In [129]:
source

defaultdict(list,
            {'head': [defaultdict(list,
                          {'url': ['http://www.ted.com/talks/reuben_margolin_sculpting_waves_in_wood_and_time'],
                           'pagesize': ['77440'],
                           'dtime': ['Mon Dec 22 11:49:33 CET 2014'],
                           'encoding': ['UTF-8'],
                           'content-type': ['text/html; charset=utf-8'],
                           'keywords': ['talks, TED Conference, art, culture, design, nature'],
                           'speaker': ['Reuben Margolin'],
                           'talkid': ['1435'],
                           'videourl': ['http://download.ted.com/talks/ReubenMargolin_2012.mp4'],
                           'videopath': ['talks/ReubenMargolin_2012.mp4'],
                           'date': ['2012/03/01'],
                           'wordnum': ['0'],
                           'charnum': ['0']})],
             'content': [None],
             'id': '697'})

In [80]:
temp_dict = dict()

temp_dict['id'] = source['head'][0].get('talkid')[0] + "_1"
temp_dict[lang_code] =  source['head'][0].get('description', " ")[0].replace("TED Talk Subtitles and Transcript: ", "")
temp_dict

{'id': '1779_1', 'fa': ' '}

In [81]:
source['head'][0].get('title', " ")

' '

In [82]:
if source['head'][0].get('title'):
    if source['head'][0].get('title')[0]:
        temp_dict = dict()
        temp_dict['id'] = source['head'][0].get('talkid')[0] + "_2"
        temp_dict[lang_code] =  source['head'][0].get('title', " ")[0]
        print(temp_dict)

In [83]:
temp_dict

{'id': '1779_1', 'fa': ' '}

In [130]:
if source.get('head')[0].get('transcription'): 
    source_transc = source.get('head')[0].get('transcription')[0].get('seekvideo')
    transcriptions = [{'id': s.get('id'), lang_code: s.get('text')} for s in source_transc]

#### As is observed the returned dict file has the followin structure

`lang1_talks['xml']['file']` --> list of all the records

`lang1_talks['xml']['language']` --> language in which the records are transcribed/translated

`lang1_talks['xml']['file'][N].keys()` :: *head*, *content*, *id*

*id*: is an integer from 1 to N

*content*: the entire content of the talk

*head*: this is the meat that I am interested in! 


In [None]:
# file_en = "ted_en-20160408.zip"
# file_hi = "ted_hi-20160408.zip"

# lang1 = "_en" # Originally done with en_hi
# lang2 = "_hi"

In [None]:
# for file in filepath_2015:
#     if file.startswith('._ted'):
#         continue
#     if lang1 in file:
#         with zipfile.ZipFile("wit/XML_releases/xml-20150616//" + file) as zf:
#             tree = ET.parse(zf.open(file[:-3]+"xml"))
#             root = tree.getroot()
#             lang1_talks = et_to_dict(root)
            
#     elif lang2 in file:
#         with zipfile.ZipFile("wit/XML_releases/xml-20150616//" + file) as zf:
#             tree = ET.parse(zf.open(file[:-3]+"xml"))
#             root = tree.getroot()
#             lang2_talks = et_to_dict(root)

### The *head* 

For some strange reason the head is a list of one. which has a __defaultdict__

`head[0].keys()` --> *url*, *pagesize*, *dtime*, *encoding*, *content-type*, *keywords*, *speaker*, *talkid*, *videourl*, *videopath*, *date*, *title*, *description*, *transcription*, *translators*, *reviewers*, *wordnum*, *charnum*

`head[0]['transcription'][0]['seekvideo']` is a list with sentence encoded with *id* & *text*

`head[0]['description']` has the description of the talk & can also be used as a training example
                         could do good to write a replace script for __"TED Talk Subtitles and Transcript: "__


In [None]:
# lang2_talks['xml']['file'][len(lang2_talks['xml']['file'])-1]['id']

In [None]:
# lang2_talks['xml']['file'][0]['head'][0].keys()

In [None]:
# lang2_talks['xml']['file'][0]['head'][0]['transcription'][0]['seekvideo'][:3]

In [None]:
# #lang2_talks['xml']['file'][0]['head'][0]['transcription'][0]['seekvideo']
# lang2_talks['xml']['file'][10]['head'][0]['description']

In [None]:
# #lets first strip the initial fluff
# lang1_talks = lang1_talks['xml']['file']
# lang2_talks = lang2_talks['xml']['file']

In [None]:
# print(len(lang2_talks)) # Validating that we have got a list with the same number of items
# print(len(lang1_talks))

In [None]:
# lang2_talks[10]['head'][0]['talkid']

In [None]:
# list(filter(lambda talk: talk['id'] == '23', lang2_talks))

In [None]:
# lang2_ids = [talk['head'][0]['talkid'] for talk in lang2_talks ]
# lang1_ids = [talk['head'][0]['talkid'] for talk in lang1_talks ]

# print(len(lang2_ids))
# print(len(lang1_ids))


In [None]:
# comm_talkids = [talkid for talkid in lang2_ids if talkid in lang1_ids]
# print(len(comm_talkids))

#### Code used to parse the XML file and create a translation pair for the given language pirs

In [None]:
# translation = list()
    
# for talkid in comm_talkids:
#     source = list(filter(lambda talk: talk['head'][0]['talkid'] == talkid, lang1_talks))
#     target = list(filter(lambda talk: talk['head'][0]['talkid'] == talkid, lang2_talks))
#     if len(source) == 0 or len(target) == 0:
#         pass
#     else:
#         source = source[0]
#         target = target[0]
        
#     if source['head'][0]['description'][0]:
#         if target['head'][0]['description'][0]:

#             temp_dict = dict()
#             temp_dict['id'] = source['head'][0].get('talkid')[0] + "_1"
#             temp_dict['en'] =  source['head'][0].get('description', " ")[0].replace("TED Talk Subtitles and Transcript: ", "")
#             temp_dict['hi'] =  target['head'][0].get('description', " ")[0].replace("TED Talk Subtitles and Transcript: ", "")
#             translation.append(temp_dict)

#     temp_dict = dict()
#     temp_dict['id'] = source['head'][0].get('talkid')[0] + "_2"
#     temp_dict['en'] =  source['head'][0]['title'][0]
#     temp_dict['hi'] =  target['head'][0]['title'][0]
#     translation.append(temp_dict)
    
#     source_transc = source.get('head')[0].get('transcription')[0].get('seekvideo')
#     target_transc = target.get('head')[0].get('transcription')[0].get('seekvideo')
    
#     transc = zip(source_transc, target_transc)
#     transcriptions = [{'id': s.get('id'), 'en': s.get('text'), 'hi': t.get('text')} for s, t in transc]
#     translation.extend(transcriptions)

In [None]:
# if target['head'][0]['description']: 
#     print("yes")

In [None]:
# len(translation)

In [None]:
# translation[:5]