In [1]:
import ast
from os.path import commonprefix
from urllib.parse import urlparse, parse_qs

import pandas as pd

In [2]:
""" Read sentence-level library """
# Forward slash instead of a hard-coded Windows "\\" separator: the same
# relative path then resolves on Windows, Linux and macOS alike.
sentences = pd.read_excel("uk_library/attenborough_sentences.xlsx")
sentences.head()

Unnamed: 0,Content,Name,Metadata,Tags
0,At the southern tip of the Australian continen...,David Attenborough's | Tasmania | Weird and Wo...,"{'start': 0, 'end': 82, 'id': 'https://www.you...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES
1,"An immense wilderness, divided by mountains.",David Attenborough's | Tasmania | Weird and Wo...,"{'start': 17, 'end': 112, 'id': 'https://www.y...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES
2,"It's a world of ancient forests, of pristine r...",David Attenborough's | Tasmania | Weird and Wo...,"{'start': 28, 'end': 116, 'id': 'https://www.y...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES
3,Its animal inhabitants are as extraordinary as...,David Attenborough's | Tasmania | Weird and Wo...,"{'start': 49, 'end': 130, 'id': 'https://www.y...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES
4,This is a land of black devils and white walla...,David Attenborough's | Tasmania | Weird and Wo...,"{'start': 60, 'end': 138, 'id': 'https://www.y...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES


In [3]:
""" Equip sentences with unique video id """
def parse_url(metadata: str):
    """Return the value stored under ``"id"`` in a stringified metadata dict.

    ``metadata`` is the text form of a Python dict literal; it is evaluated
    with ``ast.literal_eval`` and its ``"id"`` entry is returned.

    Raises:
        KeyError: if the evaluated dict has no ``"id"`` key.
    """
    metadata_dict = ast.literal_eval(metadata)
    return metadata_dict["id"]

def parse_id(url: str):
    """Return the first ``v`` query parameter of ``url``.

    For a YouTube watch link such as
    ``https://www.youtube.com/watch?v=<id>&t=0s`` this is the video id.

    Raises:
        KeyError: if the URL's query string has no ``v`` parameter.
    """
    query_string = urlparse(url).query
    video_ids = parse_qs(query_string)["v"]
    return video_ids[0]

# Sanity check of the id parser on a sample watch URL.
print(parse_id("https://www.youtube.com/watch?v=5HnoTT7so8w&t=0s"))

# Two element-wise steps: metadata string -> video URL -> video id.
sentences["Id"] = sentences["Metadata"].map(parse_url).map(parse_id)
sentences[["Id", "Name"]].head()

5HnoTT7so8w


Unnamed: 0,Id,Name
0,RMeacmRH0wA,David Attenborough's | Tasmania | Weird and Wo...
1,RMeacmRH0wA,David Attenborough's | Tasmania | Weird and Wo...
2,RMeacmRH0wA,David Attenborough's | Tasmania | Weird and Wo...
3,RMeacmRH0wA,David Attenborough's | Tasmania | Weird and Wo...
4,RMeacmRH0wA,David Attenborough's | Tasmania | Weird and Wo...


In [4]:
""" Join all sentences belonging to the same video on new lines """
def _video_title(names):
    """Derive one video title from the sentence names of a single video.

    Sentence names in the displayed data look like ``"<title>_<n>"``. A
    trailing ``_<digits>`` suffix is removed from every name before the common
    prefix is taken. Taking the common prefix first and then chopping one
    character only works when the prefix happens to end exactly at the
    underscore; it leaves ``"<title>_"`` behind for a video with a single
    sentence or when all sentence numbers share a leading digit.
    """
    titles = []
    for name in names:
        head, sep, tail = name.rpartition("_")
        # Only strip when the part after the last "_" really is a number.
        titles.append(head if sep and tail.isdigit() else name)
    return commonprefix(titles)


# One row per video id: the title, all sentences joined on new lines (in their
# original row order), and the tags/metadata of the video's first sentence.
transcripts = sentences.groupby("Id").agg({
    "Name": _video_title,
    "Content": "\n".join,
    "Tags": "first",
    "Metadata": "first"
})
transcripts.head()

Unnamed: 0_level_0,Name,Content,Tags,Metadata
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7MxeAPR-uvQ,A Perfect Planet: Prequel | New David Attenbor...,This is a perfect planet.\nLife flourishes her...,DAVID ATTENBOROUGH: FULL DOCUMENTARIES,"{'start': 0, 'end': 92, 'id': 'https://www.you..."
7yn9hZb9M2U,Empire of the Ants 2020 BBC Documentary HD,The Jura Mountains on the French-Swiss border ...,DAVID ATTENBOROUGH: FULL DOCUMENTARIES,"{'start': 0, 'end': 55, 'id': 'https://www.you..."
8J7ycovMqMo,Animals of Africa with David Attenborough - HD...,"After crossing the Mara River, the herds follo...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES,"{'start': 0, 'end': 34, 'id': 'https://www.you..."
9FqwhW0B3tY,Our Planet | High Seas | FULL EPISODE | Netflix,"Just 50 years ago, we finally ventured to the ...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES,"{'start': 0, 'end': 112, 'id': 'https://www.yo..."
9Nr_XukuCZM,Animals of Africa in HIGH DEFINITION HD Docume...,"After crossing the Mara River, the herds follo...",DAVID ATTENBOROUGH: FULL DOCUMENTARIES,"{'start': 0, 'end': 34, 'id': 'https://www.you..."


In [5]:
""" Clean the resulting data and tag the entries appropriately """
def delete_video_positions(metadata: str):
    """Return ``metadata`` without its ``"start"`` and ``"end"`` entries.

    ``metadata`` is the text form of a Python dict literal. It is evaluated
    with ``ast.literal_eval``, the two position keys are removed if present,
    and the remaining dict is returned as a string with all other entries in
    their original order.
    """
    dct = ast.literal_eval(metadata)
    # pop with a default rather than del: metadata that has already been
    # cleaned (e.g. when the cell is run a second time) passes through
    # unchanged instead of raising KeyError.
    dct.pop("start", None)
    dct.pop("end", None)
    return str(dct)

def _with_level_tag(tags: str, level_tag: str) -> str:
    """Return ``tags`` prefixed with exactly one level tag, ``level_tag``.

    Any level tags already at the front of ``tags`` are removed first, so
    applying this repeatedly (or to tags copied from an already tagged frame)
    does not stack prefixes.
    """
    known_level_tags = ("TRANSCRIPT_LEVEL, ", "SENTENCE_LEVEL, ")
    stripped = True
    while stripped:
        stripped = False
        for prefix in known_level_tags:
            if tags.startswith(prefix):
                tags = tags[len(prefix):]
                stripped = True
    return level_tag + tags


# Transcript rows describe a whole video, so the per-sentence positions are dropped.
transcripts["Metadata"] = transcripts["Metadata"].apply(delete_video_positions)

# Mark every row with the granularity it belongs to.
transcripts["Tags"] = transcripts["Tags"].apply(lambda tags: _with_level_tag(tags, "TRANSCRIPT_LEVEL, "))
sentences["Tags"] = sentences["Tags"].apply(lambda tags: _with_level_tag(tags, "SENTENCE_LEVEL, "))

In [6]:
""" Merge the two libraries and dump the result to an xlsx file """
# Stack transcript rows above sentence rows. `transcripts` is indexed by Id
# while `sentences` carries Id as a column, so the index is discarded and the
# Id column dropped to leave both levels with the same columns.
multi_level_library = (
    pd.concat([transcripts, sentences])
    .reset_index(drop=True)
    .drop(columns="Id")
)
display(multi_level_library)

# Forward slash instead of a hard-coded Windows "\\" separator: the same
# relative path then resolves on Windows, Linux and macOS alike.
# NOTE(review): to_excel also writes the 0..N index as an unnamed first
# column by default - confirm whether consumers of this file expect that.
multi_level_library.to_excel("uk_library/attenborough_transcripts_and_sentences.xlsx")

Unnamed: 0,Name,Content,Tags,Metadata
0,A Perfect Planet: Prequel | New David Attenbor...,This is a perfect planet.\nLife flourishes her...,"TRANSCRIPT_LEVEL, DAVID ATTENBOROUGH: FULL DOC...",{'id': 'https://www.youtube.com/watch?v=7MxeAP...
1,Empire of the Ants 2020 BBC Documentary HD,The Jura Mountains on the French-Swiss border ...,"TRANSCRIPT_LEVEL, DAVID ATTENBOROUGH: FULL DOC...",{'id': 'https://www.youtube.com/watch?v=7yn9hZ...
2,Animals of Africa with David Attenborough - HD...,"After crossing the Mara River, the herds follo...","TRANSCRIPT_LEVEL, DAVID ATTENBOROUGH: FULL DOC...",{'id': 'https://www.youtube.com/watch?v=8J7yco...
3,Our Planet | High Seas | FULL EPISODE | Netflix,"Just 50 years ago, we finally ventured to the ...","TRANSCRIPT_LEVEL, DAVID ATTENBOROUGH: FULL DOC...",{'id': 'https://www.youtube.com/watch?v=9FqwhW...
4,Animals of Africa in HIGH DEFINITION HD Docume...,"After crossing the Mara River, the herds follo...","TRANSCRIPT_LEVEL, DAVID ATTENBOROUGH: FULL DOC...",{'id': 'https://www.youtube.com/watch?v=9Nr_Xu...
...,...,...,...,...
8122,Empire of the Ants 2020 BBC Documentary HD_367,"With the help of Frank and Cam, they took us i...","SENTENCE_LEVEL, DAVID ATTENBOROUGH: FULL DOCUM...","{'start': 3502, 'end': 3537, 'id': 'https://ww..."
8123,Empire of the Ants 2020 BBC Documentary HD_368,"And remarkably, using a tiny lens just like th...","SENTENCE_LEVEL, DAVID ATTENBOROUGH: FULL DOCUM...","{'start': 3507, 'end': 3537, 'id': 'https://ww..."
8124,Empire of the Ants 2020 BBC Documentary HD_369,"Searching out the true story of Jumbo, the fir...","SENTENCE_LEVEL, DAVID ATTENBOROUGH: FULL DOCUM...","{'start': 3521, 'end': 3537, 'id': 'https://ww..."
8125,Empire of the Ants 2020 BBC Documentary HD_370,Who'd be a Ranganathan when your dad tells all?,"SENTENCE_LEVEL, DAVID ATTENBOROUGH: FULL DOCUM...","{'start': 3528, 'end': 3537, 'id': 'https://ww..."
