# Import Packages

In [1]:
import pandas as pd
import spacy
from spacy_langdetect import LanguageDetector

# Read in Data

In [2]:
talk_df = pd.read_csv('full_data.csv', index_col = 0)

# Check for missing data

In [3]:
talk_df.head()

Unnamed: 0,date,speaker,title,url,length,summ,tags,views,transcript,date_recorded,upload_date,occupation,bio,comments
0,Jan 2020,Ipsita Dasgupta,"To challenge the status quo, find a ""co-conspi...",/talks/ipsita_dasgupta_to_challenge_the_status...,11:03,"In a complex and changing world, how can we ma...","['innovation', 'collaboration', 'society', 'so...",599446,So I've been thinking about how to explain thi...,2019-09-24,2020-01-02,"Business executive, ""co-conspirator""",Ipsita Dasgupta drives the consumption of ente...,7
1,Jan 2020,Rod Phillips,A brief history of alcohol,/talks/rod_phillips_a_brief_history_of_alcohol,4:56,Nobody knows exactly when humans began to crea...,"['TED-Ed', 'education', 'animation', 'history'...",501290,This chimpanzee stumbles across a windfall of ...,2020-01-02,2020-01-02,,,--
2,Jan 2020,Pat Mitchell,Dangerous times call for dangerous women,/talks/pat_mitchell_dangerous_times_call_for_d...,17:14,Pat Mitchell has nothing left to prove and muc...,"['women', 'women in business', 'community', 'a...",461705,"Recently, I've been declaring to anyone who wo...",2019-12-04,2020-01-02,Dangerous woman,Pat Mitchell is a lifelong advocate for women ...,14
3,Dec 2019,Cara E. Yar Khan,The beautiful balance between courage and fear,/talks/cara_e_yar_khan_the_beautiful_balance_b...,9:55,After being diagnosed with a rare genetic cond...,"['fear', 'personal growth', 'health', 'life', ...",880662,"When we're young, we're innocently brave, and ...",2019-12-04,2019-12-23,Human rights and disability activist,Cara E. Yar Khan is an international human rig...,17
4,Dec 2019,Valorie Kondos Field,Why winning doesn't always equal success,/talks/valorie_kondos_field_why_winning_doesn_...,15:49,Valorie Kondos Field knows a lot about winning...,"['success', 'sports', 'leadership', 'empathy',...",956498,"OK, I have a question for all of us. You ready...",2019-12-04,2019-12-20,Gymnastics coach,Valorie Kondos Field is the retired head coach...,18


In [4]:
talk_df.transcript.isna().sum()

595

In [5]:
for index in range(0, len(talk_df)):
    if not isinstance(talk_df.transcript[index], str):
        talk_df.transcript[index] = '--'

In [6]:
(talk_df.transcript == '').sum()

0

In [7]:
(talk_df.transcript == '--').sum()

595

In [30]:
no_transcript = talk_df[talk_df.transcript == '--']

for url in no_transcript.url:
    print('https://www.ted.com' + url)

https://www.ted.com/talks/jean_manuel_izaret_a_new_netflix_style_pricing_model_that_could_make_medical_treatments_affordable_for_all
https://www.ted.com/talks/juniper_fitzgerald_why_we_need_to_stop_stigmatizing_mothers_who_do_sex_work
https://www.ted.com/talks/dorsa_amir_how_the_industrial_revolution_changed_childhood
https://www.ted.com/talks/adar_cohen_3_ways_to_lead_tough_unavoidable_conversations
https://www.ted.com/talks/martha_redbone_sleep_sleep_beauty_bright
https://www.ted.com/talks/maria_popova_an_excerpt_from_figuring
https://www.ted.com/talks/morley_follow_the_sound
https://www.ted.com/talks/david_carroll_how_i_sued_cambridge_analytica_over_my_personal_data
https://www.ted.com/talks/chris_fisher_why_we_should_archive_everything_on_the_planet
https://www.ted.com/talks/ella_al_shamahi_why_archaeology_needs_to_transcend_borders
https://www.ted.com/talks/andrew_nemr_the_sounds_and_sights_of_tap_dance
https://www.ted.com/talks/hiromi_ozaki_how_i_bring_myth_and_magic_to_life
http

In [8]:
transcripts = talk_df[talk_df.transcript != '--']['transcript']

In [9]:
len(transcripts)

3649

In [34]:
transcripts

0       So I've been thinking about how to explain thi...
1       This chimpanzee stumbles across a windfall of ...
2       Recently, I've been declaring to anyone who wo...
3       When we're young, we're innocently brave, and ...
4       OK, I have a question for all of us. You ready...
5       I wanted to be a psychologist since I was a te...
6       In the 4th century BCE, a banker’s son threw t...
7       Chris Anderson: Nick Bostrom. So, you have alr...
8       In June of 2017, I volunteered with a group at...
9       So, on April 23 of 2013, the Associated Press ...
10      A mother and her son trek across an endless de...
11      We are all atomically connected. Fundamentally...
12      In her Auntie An-mei’s home, Jing-Mei reluctan...
13      Now, I know it might be easy to think that mic...
14      Aquay Wunne Kesuk. Kelsey Leonard Nooweesuonk....
15      I am an ideas activist. That means I fight for...
16      William Golding was losing his faith in humani...
17      It was

# Check for English Transcripts

In [32]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
text = 'This is an english text.'
doc = nlp(text)
# document level language detection. Think of it like average language of the document!
print(doc._.language)

{'language': 'en', 'score': 0.9999953402501132}


In [37]:
docs = []

for transcript in transcripts:
    doc = nlp(transcript)
    print(doc._.language)
    docs.append(doc)

{'language': 'en', 'score': 0.9999975966759548}
{'language': 'en', 'score': 0.9999958509616962}
{'language': 'en', 'score': 0.999997216310683}
{'language': 'en', 'score': 0.99999461017091}
{'language': 'en', 'score': 0.9999984817468119}
{'language': 'en', 'score': 0.9999954464958546}
{'language': 'en', 'score': 0.9999972905363708}
{'language': 'en', 'score': 0.9999972577275744}
{'language': 'en', 'score': 0.9999961068862551}
{'language': 'en', 'score': 0.9999968914905872}
{'language': 'en', 'score': 0.9999980306146714}
{'language': 'en', 'score': 0.9999977074050481}
{'language': 'en', 'score': 0.9999956406337589}
{'language': 'en', 'score': 0.9999976965252378}
{'language': 'en', 'score': 0.9999963855461114}
{'language': 'en', 'score': 0.9999968428658197}
{'language': 'en', 'score': 0.9999974116981157}
{'language': 'en', 'score': 0.9999964233933782}
{'language': 'en', 'score': 0.9999935177601322}
{'language': 'en', 'score': 0.9999964227394472}
{'language': 'en', 'score': 0.9999973562562

{'language': 'en', 'score': 0.9999956904762035}
{'language': 'en', 'score': 0.9999963418689591}
{'language': 'en', 'score': 0.9999944574645903}
{'language': 'en', 'score': 0.9999974080170946}
{'language': 'en', 'score': 0.9999972045678902}
{'language': 'en', 'score': 0.9999972271309495}
{'language': 'en', 'score': 0.9999977531580689}
{'language': 'en', 'score': 0.9999974948820577}
{'language': 'en', 'score': 0.99999701586922}
{'language': 'en', 'score': 0.9999963863588668}
{'language': 'en', 'score': 0.9999972580159557}
{'language': 'en', 'score': 0.9999981491468183}
{'language': 'en', 'score': 0.9999935891932683}
{'language': 'en', 'score': 0.9999953811419069}
{'language': 'en', 'score': 0.9999983622605249}
{'language': 'en', 'score': 0.9999957940044397}
{'language': 'en', 'score': 0.9999969510922069}
{'language': 'en', 'score': 0.9999978847311737}
{'language': 'en', 'score': 0.9999971097846152}
{'language': 'en', 'score': 0.9999957585840689}
{'language': 'en', 'score': 0.999998313350

{'language': 'en', 'score': 0.9999962366999964}
{'language': 'en', 'score': 0.9999970665643992}
{'language': 'en', 'score': 0.9999978241653835}
{'language': 'en', 'score': 0.9999973378661318}
{'language': 'en', 'score': 0.9999976955509018}
{'language': 'en', 'score': 0.9999962996662789}
{'language': 'en', 'score': 0.9999965246739952}
{'language': 'en', 'score': 0.9999966581265153}
{'language': 'en', 'score': 0.999997073225509}
{'language': 'en', 'score': 0.9999957433019113}
{'language': 'en', 'score': 0.999998566635903}
{'language': 'en', 'score': 0.9999982486971428}
{'language': 'en', 'score': 0.9999977698306011}
{'language': 'en', 'score': 0.9999973440015092}
{'language': 'en', 'score': 0.999998474168744}
{'language': 'en', 'score': 0.999997466612486}
{'language': 'en', 'score': 0.9999973413907193}
{'language': 'en', 'score': 0.9999966040331978}
{'language': 'en', 'score': 0.9999969301193545}
{'language': 'en', 'score': 0.999996827692029}
{'language': 'en', 'score': 0.999998503404639

{'language': 'en', 'score': 0.9999972820536833}
{'language': 'en', 'score': 0.9999969961250925}
{'language': 'en', 'score': 0.999997714447935}
{'language': 'en', 'score': 0.999996371019338}
{'language': 'en', 'score': 0.9999963505402498}
{'language': 'en', 'score': 0.9999987776098228}
{'language': 'en', 'score': 0.9999983935979495}
{'language': 'en', 'score': 0.9999974639107522}
{'language': 'en', 'score': 0.9999988315900886}
{'language': 'en', 'score': 0.9999962339924451}
{'language': 'en', 'score': 0.9999972838823501}
{'language': 'en', 'score': 0.9999976569650435}
{'language': 'en', 'score': 0.9999975945359694}
{'language': 'en', 'score': 0.9999975745436597}
{'language': 'en', 'score': 0.9999956403176222}
{'language': 'en', 'score': 0.9999988034350478}
{'language': 'en', 'score': 0.9999960830952844}
{'language': 'en', 'score': 0.9999973183828027}
{'language': 'en', 'score': 0.9999970680350109}
{'language': 'en', 'score': 0.9999981713598185}
{'language': 'en', 'score': 0.999995173562

{'language': 'en', 'score': 0.9999973777589656}
{'language': 'en', 'score': 0.999996829052266}
{'language': 'en', 'score': 0.9999976332961162}
{'language': 'en', 'score': 0.999994216702077}
{'language': 'en', 'score': 0.9999969640438673}
{'language': 'en', 'score': 0.9999975199390827}
{'language': 'en', 'score': 0.9999988008151144}
{'language': 'en', 'score': 0.9999972177655034}
{'language': 'en', 'score': 0.9999974695945442}
{'language': 'en', 'score': 0.9999966418679611}
{'language': 'en', 'score': 0.9999961554391339}
{'language': 'en', 'score': 0.9999960008867748}
{'language': 'en', 'score': 0.9999984622196906}
{'language': 'en', 'score': 0.9999961856579311}
{'language': 'en', 'score': 0.999995866024452}
{'language': 'en', 'score': 0.9999971769645092}
{'language': 'en', 'score': 0.9999973113410525}
{'language': 'en', 'score': 0.9999967533236419}
{'language': 'en', 'score': 0.9999958226793669}
{'language': 'en', 'score': 0.9999964055169497}
{'language': 'en', 'score': 0.9999973264778

{'language': 'en', 'score': 0.9999965484242246}
{'language': 'en', 'score': 0.9999970936619148}
{'language': 'en', 'score': 0.9999949406068225}
{'language': 'en', 'score': 0.9999978649782808}
{'language': 'en', 'score': 0.9999963706919799}
{'language': 'en', 'score': 0.9999959053808665}
{'language': 'en', 'score': 0.9999966943046517}
{'language': 'en', 'score': 0.9999966445540545}
{'language': 'en', 'score': 0.9999991522278935}
{'language': 'en', 'score': 0.9999983919560943}
{'language': 'en', 'score': 0.9999960592328524}
{'language': 'en', 'score': 0.999996160634201}
{'language': 'en', 'score': 0.9999963433863235}
{'language': 'en', 'score': 0.9999965665470655}
{'language': 'en', 'score': 0.99999661394342}
{'language': 'en', 'score': 0.9999980584954787}
{'language': 'en', 'score': 0.9999974574797521}
{'language': 'en', 'score': 0.9999973377983745}
{'language': 'en', 'score': 0.9999966969987795}
{'language': 'en', 'score': 0.9999981467618037}
{'language': 'en', 'score': 0.9999961419883

{'language': 'en', 'score': 0.9999968988267022}
{'language': 'en', 'score': 0.999997649621141}
{'language': 'en', 'score': 0.999996420369013}
{'language': 'en', 'score': 0.9999971255460158}
{'language': 'en', 'score': 0.9999963652678312}
{'language': 'en', 'score': 0.9999967077296454}
{'language': 'en', 'score': 0.99999697849226}
{'language': 'en', 'score': 0.9999965499629672}
{'language': 'en', 'score': 0.9999961456474777}
{'language': 'en', 'score': 0.9999964180737188}
{'language': 'en', 'score': 0.999996685761996}
{'language': 'en', 'score': 0.9999964222171593}
{'language': 'en', 'score': 0.9999972303553384}
{'language': 'en', 'score': 0.9999973768658712}
{'language': 'en', 'score': 0.9999974524225459}
{'language': 'en', 'score': 0.9999970981283564}
{'language': 'en', 'score': 0.9999972955901639}
{'language': 'en', 'score': 0.999997925827055}
{'language': 'en', 'score': 0.9999979023771609}
{'language': 'en', 'score': 0.9999963061181909}
{'language': 'en', 'score': 0.9999987325945963

{'language': 'en', 'score': 0.9999973033844027}
{'language': 'en', 'score': 0.9999985875952062}
{'language': 'en', 'score': 0.9999975845632507}
{'language': 'en', 'score': 0.9999976052610973}
{'language': 'en', 'score': 0.9999975678551989}
{'language': 'en', 'score': 0.999996933196811}
{'language': 'en', 'score': 0.9999972904580449}
{'language': 'en', 'score': 0.9999966095571122}
{'language': 'en', 'score': 0.9999981853271456}
{'language': 'en', 'score': 0.999998130467985}
{'language': 'en', 'score': 0.9999974359487661}
{'language': 'en', 'score': 0.9999978940247756}
{'language': 'en', 'score': 0.9999976789629103}
{'language': 'en', 'score': 0.9999979030922581}
{'language': 'en', 'score': 0.9999985842268915}
{'language': 'en', 'score': 0.9999976116011113}
{'language': 'en', 'score': 0.9999969697415797}
{'language': 'en', 'score': 0.9999985874641403}
{'language': 'en', 'score': 0.9999965785213071}
{'language': 'en', 'score': 0.9999981895359301}
{'language': 'en', 'score': 0.999995222863

{'language': 'en', 'score': 0.9999983707787689}
{'language': 'en', 'score': 0.9999982563029466}
{'language': 'en', 'score': 0.9999981854987243}
{'language': 'en', 'score': 0.9999973184817033}
{'language': 'en', 'score': 0.9999969612687158}
{'language': 'en', 'score': 0.9999950374534313}
{'language': 'en', 'score': 0.9999977852486823}
{'language': 'en', 'score': 0.9999962219825886}
{'language': 'en', 'score': 0.9999962364100952}
{'language': 'en', 'score': 0.9999964869829874}
{'language': 'en', 'score': 0.9999963964169025}
{'language': 'en', 'score': 0.9999975327696844}
{'language': 'en', 'score': 0.9999988108949827}
{'language': 'en', 'score': 0.999996407826204}
{'language': 'en', 'score': 0.9999970988500836}
{'language': 'en', 'score': 0.9999965519852196}
{'language': 'en', 'score': 0.9999957890958927}
{'language': 'en', 'score': 0.9999954917533243}
{'language': 'en', 'score': 0.9999983319004031}
{'language': 'en', 'score': 0.9999955838432766}
{'language': 'en', 'score': 0.99999827248

{'language': 'en', 'score': 0.9999965140885447}
{'language': 'en', 'score': 0.9999969049983889}
{'language': 'en', 'score': 0.9999978986626346}
{'language': 'en', 'score': 0.9999967221630707}
{'language': 'en', 'score': 0.9999980577802678}
{'language': 'en', 'score': 0.9999981793275536}
{'language': 'en', 'score': 0.9999976415060625}
{'language': 'en', 'score': 0.9999972552546832}
{'language': 'en', 'score': 0.9999977261335318}
{'language': 'en', 'score': 0.9999955165701684}
{'language': 'en', 'score': 0.9999971895216148}
{'language': 'en', 'score': 0.9999956576249682}
{'language': 'en', 'score': 0.9999967744154782}
{'language': 'en', 'score': 0.9999974181619146}
{'language': 'en', 'score': 0.9999973092793932}
{'language': 'en', 'score': 0.9999970444072248}
{'language': 'en', 'score': 0.9999976766922758}
{'language': 'en', 'score': 0.9999987013536118}
{'language': 'en', 'score': 0.9999977517231091}
{'language': 'en', 'score': 0.9999964259021895}
{'language': 'en', 'score': 0.9999972607

{'language': 'en', 'score': 0.9999972382035683}
{'language': 'en', 'score': 0.999997237774876}
{'language': 'en', 'score': 0.9999963897011327}
{'language': 'en', 'score': 0.9999987409355956}
{'language': 'en', 'score': 0.9999971585144242}
{'language': 'en', 'score': 0.9999959036245204}
{'language': 'en', 'score': 0.9999964412286135}
{'language': 'en', 'score': 0.9999959314857627}
{'language': 'en', 'score': 0.9999958937997894}
{'language': 'en', 'score': 0.9999970527367945}
{'language': 'en', 'score': 0.9999967924145317}
{'language': 'en', 'score': 0.99999598886346}
{'language': 'en', 'score': 0.9999968366268501}
{'language': 'en', 'score': 0.9999969029982386}
{'language': 'en', 'score': 0.9999981446058523}
{'language': 'en', 'score': 0.9999963934565915}
{'language': 'en', 'score': 0.9999969887541432}
{'language': 'en', 'score': 0.9999961930320316}
{'language': 'en', 'score': 0.9999990728956862}
{'language': 'en', 'score': 0.9999966947368998}
{'language': 'en', 'score': 0.9999962225625

{'language': 'en', 'score': 0.9999972203323859}
{'language': 'en', 'score': 0.9999949489867245}
{'language': 'en', 'score': 0.9999964607746044}
{'language': 'en', 'score': 0.999996774214326}
{'language': 'en', 'score': 0.9999972105786233}
{'language': 'en', 'score': 0.9999986291487852}
{'language': 'en', 'score': 0.9999966735475946}
{'language': 'en', 'score': 0.9999987732352248}
{'language': 'en', 'score': 0.9999963785580037}
{'language': 'en', 'score': 0.9999969441837375}
{'language': 'en', 'score': 0.9999965413780909}
{'language': 'en', 'score': 0.9999971595317901}
{'language': 'en', 'score': 0.9999974601506163}
{'language': 'en', 'score': 0.9999935128523912}
{'language': 'en', 'score': 0.9999980334945615}
{'language': 'en', 'score': 0.9999950768019574}
{'language': 'en', 'score': 0.9999968408462421}
{'language': 'en', 'score': 0.9999976265362459}
{'language': 'en', 'score': 0.9999953345948589}
{'language': 'en', 'score': 0.9999989602631676}
{'language': 'en', 'score': 0.99999706469

{'language': 'en', 'score': 0.9999972946185455}
{'language': 'en', 'score': 0.9999973032185907}
{'language': 'en', 'score': 0.9999984083034261}
{'language': 'en', 'score': 0.9999967352608907}
{'language': 'en', 'score': 0.9999950574959968}
{'language': 'en', 'score': 0.9999959419337651}
{'language': 'en', 'score': 0.9999964764608393}
{'language': 'en', 'score': 0.9999970617689751}
{'language': 'en', 'score': 0.9999970671093028}
{'language': 'en', 'score': 0.9999977977885237}
{'language': 'en', 'score': 0.9999994837219902}
{'language': 'en', 'score': 0.9999947157654895}
{'language': 'en', 'score': 0.9999981944385147}
{'language': 'en', 'score': 0.9999979760402149}
{'language': 'en', 'score': 0.9999969308550526}
{'language': 'en', 'score': 0.9999948016218453}
{'language': 'en', 'score': 0.999996441814701}
{'language': 'en', 'score': 0.9999975659216442}
{'language': 'en', 'score': 0.9999983273680897}
{'language': 'en', 'score': 0.9999974376960573}
{'language': 'en', 'score': 0.99999883839

{'language': 'en', 'score': 0.9999986044420832}
{'language': 'en', 'score': 0.9999985968327219}
{'language': 'en', 'score': 0.9999945358772842}
{'language': 'en', 'score': 0.9999971187854559}
{'language': 'en', 'score': 0.999997587300534}
{'language': 'en', 'score': 0.9999973904549492}
{'language': 'en', 'score': 0.999997214747887}
{'language': 'en', 'score': 0.9999972757063631}
{'language': 'en', 'score': 0.9999993065344492}
{'language': 'en', 'score': 0.9999967149871537}
{'language': 'en', 'score': 0.9999960540594085}
{'language': 'en', 'score': 0.9999974102008777}
{'language': 'en', 'score': 0.9999960748649891}
{'language': 'en', 'score': 0.9999974918959023}
{'language': 'en', 'score': 0.9999965149527129}
{'language': 'en', 'score': 0.9999972650374072}
{'language': 'en', 'score': 0.9999982015099614}
{'language': 'en', 'score': 0.9999959788855619}
{'language': 'en', 'score': 0.9999973121101778}
{'language': 'en', 'score': 0.9999978421865086}
{'language': 'en', 'score': 0.999998727432

{'language': 'en', 'score': 0.9999955559287625}
{'language': 'en', 'score': 0.999996569754938}
{'language': 'en', 'score': 0.9999956503610297}
{'language': 'en', 'score': 0.9999957731975064}
{'language': 'en', 'score': 0.9999946978624403}
{'language': 'en', 'score': 0.9999964734474407}
{'language': 'en', 'score': 0.9999971122834204}
{'language': 'en', 'score': 0.9999970461355777}
{'language': 'en', 'score': 0.9999956727096728}
{'language': 'en', 'score': 0.9999961578222791}
{'language': 'en', 'score': 0.9999978540754826}
{'language': 'en', 'score': 0.9999975134916355}
{'language': 'en', 'score': 0.9999967028451483}
{'language': 'en', 'score': 0.999997378704744}
{'language': 'en', 'score': 0.9999985269510485}
{'language': 'en', 'score': 0.9999972661702639}
{'language': 'en', 'score': 0.9999988658248509}
{'language': 'en', 'score': 0.9999978756538859}
{'language': 'en', 'score': 0.9999968524335969}
{'language': 'en', 'score': 0.9999967403675138}
{'language': 'en', 'score': 0.999996205705

{'language': 'en', 'score': 0.9999969213805463}
{'language': 'en', 'score': 0.9999973377291222}
{'language': 'en', 'score': 0.9999979653558919}
{'language': 'en', 'score': 0.9999969284543828}
{'language': 'en', 'score': 0.9999979798323629}
{'language': 'en', 'score': 0.9999962088517733}
{'language': 'en', 'score': 0.9999981982434851}
{'language': 'en', 'score': 0.9999966537013412}
{'language': 'en', 'score': 0.9999987712300115}
{'language': 'en', 'score': 0.9999975187216024}
{'language': 'en', 'score': 0.9999973745601525}
{'language': 'en', 'score': 0.9999957305815057}
{'language': 'en', 'score': 0.99999732963603}
{'language': 'en', 'score': 0.999998947558705}
{'language': 'en', 'score': 0.9999959984039248}
{'language': 'en', 'score': 0.999998643027541}
{'language': 'en', 'score': 0.9999985887062215}
{'language': 'en', 'score': 0.9999966724542663}
{'language': 'en', 'score': 0.9999969956606368}
{'language': 'en', 'score': 0.9999948218093198}
{'language': 'en', 'score': 0.99999793455620

{'language': 'en', 'score': 0.9999963885477055}
{'language': 'en', 'score': 0.9999993506907342}
{'language': 'en', 'score': 0.9999982466611077}
{'language': 'en', 'score': 0.9999980011547295}
{'language': 'en', 'score': 0.9999963075656408}
{'language': 'en', 'score': 0.9999973266885973}
{'language': 'en', 'score': 0.9999973402596127}
{'language': 'en', 'score': 0.9999975386985881}
{'language': 'en', 'score': 0.9999960609832295}
{'language': 'en', 'score': 0.9999959120311515}
{'language': 'en', 'score': 0.9999968155596075}
{'language': 'en', 'score': 0.9999968116432374}
{'language': 'en', 'score': 0.9999989932644604}
{'language': 'en', 'score': 0.9999990290010496}
{'language': 'en', 'score': 0.9999974426592653}
{'language': 'en', 'score': 0.9999966427308664}
{'language': 'en', 'score': 0.9999978850230549}
{'language': 'en', 'score': 0.9999968191137854}
{'language': 'en', 'score': 0.9999983848079994}
{'language': 'en', 'score': 0.9999996416816647}
{'language': 'en', 'score': 0.9999929284

{'language': 'en', 'score': 0.9999975940872785}
{'language': 'en', 'score': 0.99999807752014}
{'language': 'en', 'score': 0.9999966277262687}
{'language': 'en', 'score': 0.9999962464217532}
{'language': 'en', 'score': 0.9999967672604444}
{'language': 'en', 'score': 0.9999982908966145}
{'language': 'en', 'score': 0.9999976565241474}
{'language': 'en', 'score': 0.9999980101823552}
{'language': 'en', 'score': 0.9999963928241367}
{'language': 'en', 'score': 0.999997489563606}
{'language': 'en', 'score': 0.9999959333873158}
{'language': 'en', 'score': 0.9999960932612194}
{'language': 'en', 'score': 0.9999963897163334}
{'language': 'en', 'score': 0.9999964789809561}
{'language': 'en', 'score': 0.9999952236067278}
{'language': 'en', 'score': 0.9999979846756607}
{'language': 'en', 'score': 0.9999979253829062}
{'language': 'en', 'score': 0.9999972102829807}
{'language': 'en', 'score': 0.9999955917367124}
{'language': 'en', 'score': 0.9999963531400141}
{'language': 'en', 'score': 0.9999970546155

{'language': 'en', 'score': 0.9999975691879445}
{'language': 'en', 'score': 0.999995905839382}
{'language': 'en', 'score': 0.999997169138596}
{'language': 'en', 'score': 0.9999956658856874}
{'language': 'en', 'score': 0.999997057084645}
{'language': 'en', 'score': 0.9999977576024273}
{'language': 'en', 'score': 0.999996348893542}
{'language': 'en', 'score': 0.999996193832792}
{'language': 'en', 'score': 0.9999947736167643}
{'language': 'en', 'score': 0.9999973632041999}
{'language': 'en', 'score': 0.9999960190672373}
{'language': 'en', 'score': 0.999998033248793}
{'language': 'en', 'score': 0.9999982014672086}
{'language': 'en', 'score': 0.999997218650102}
{'language': 'en', 'score': 0.9999950540630445}
{'language': 'en', 'score': 0.9999949819288978}
{'language': 'en', 'score': 0.9999968296478593}
{'language': 'en', 'score': 0.9999976662413744}
{'language': 'en', 'score': 0.9999979946382995}
{'language': 'en', 'score': 0.9999966792967717}
{'language': 'en', 'score': 0.9999972861236779}

{'language': 'en', 'score': 0.9999969647687972}
{'language': 'en', 'score': 0.9999962086247199}
{'language': 'en', 'score': 0.9999969896945391}
{'language': 'en', 'score': 0.9999974423101202}
{'language': 'en', 'score': 0.9999967203478859}
{'language': 'en', 'score': 0.9999967230906419}
{'language': 'en', 'score': 0.9999960346620212}
{'language': 'en', 'score': 0.9999948780023473}
{'language': 'en', 'score': 0.9999948756905885}
{'language': 'en', 'score': 0.9999965073654262}
{'language': 'en', 'score': 0.9999968389879322}
{'language': 'en', 'score': 0.9999966201266373}
{'language': 'en', 'score': 0.9999946487056492}
{'language': 'en', 'score': 0.9999967425334934}
{'language': 'en', 'score': 0.9999979075753407}
{'language': 'en', 'score': 0.9999984427078245}
{'language': 'en', 'score': 0.9999979512173431}
{'language': 'en', 'score': 0.9999971710171388}
{'language': 'en', 'score': 0.9999984999185534}
{'language': 'en', 'score': 0.9999955324420737}
{'language': 'en', 'score': 0.9999963353

{'language': 'en', 'score': 0.9999979090694879}
{'language': 'en', 'score': 0.9999944353948385}
{'language': 'en', 'score': 0.9999957568315754}
{'language': 'en', 'score': 0.9999963857254928}
{'language': 'en', 'score': 0.9999976761359626}
{'language': 'en', 'score': 0.999998550341579}
{'language': 'en', 'score': 0.9999964195016552}
{'language': 'en', 'score': 0.9999985235541393}
{'language': 'en', 'score': 0.9999972634552609}
{'language': 'en', 'score': 0.9999949158504766}
{'language': 'en', 'score': 0.999996843651605}
{'language': 'en', 'score': 0.9999973704451666}
{'language': 'en', 'score': 0.9999984138291917}
{'language': 'en', 'score': 0.9999972066488698}
{'language': 'en', 'score': 0.9999971084964955}
{'language': 'en', 'score': 0.9999979642269742}
{'language': 'en', 'score': 0.9999977541332737}
{'language': 'en', 'score': 0.999998673537398}
{'language': 'en', 'score': 0.9999951496931743}
{'language': 'en', 'score': 0.9999965889702576}
{'language': 'en', 'score': 0.9999982199038

{'language': 'en', 'score': 0.9999983511682191}
{'language': 'en', 'score': 0.9999968640493484}
{'language': 'en', 'score': 0.9999975985741372}
{'language': 'en', 'score': 0.9999986062003852}
{'language': 'en', 'score': 0.9999963790000421}
{'language': 'en', 'score': 0.9999963207816047}
{'language': 'en', 'score': 0.9999971603017798}
{'language': 'en', 'score': 0.9999967143198223}
{'language': 'en', 'score': 0.9999963471496086}
{'language': 'en', 'score': 0.9999949486925712}
{'language': 'en', 'score': 0.999997629085908}
{'language': 'en', 'score': 0.9999977666691058}
{'language': 'en', 'score': 0.9999961558285237}
{'language': 'en', 'score': 0.9999954523185124}
{'language': 'en', 'score': 0.9999958210021778}
{'language': 'en', 'score': 0.9999984008909476}
{'language': 'en', 'score': 0.9999978624439333}
{'language': 'en', 'score': 0.9999977319449322}
{'language': 'en', 'score': 0.9999968261453019}
{'language': 'en', 'score': 0.999996984850178}
{'language': 'en', 'score': 0.999998855999

In [39]:
# These three are ted talks that are strictly musical or visual in nature, no words
for index in range(0, len(docs)):
    if docs[index]._.language['score'] < 0.9:
        print(index)

1236
2556
3528


In [50]:
for index in range(0, len(docs)):
    if docs[index]._.language['language'] != 'en':
        print(index)

In [10]:
transcripts.reset_index(inplace = True, drop = True)

In [11]:
transcripts[1236]

'(Guitar music starts)(Cheers)(Cheers)(Music ends)'

In [12]:
transcripts[2556]

'(Music)(Applause)(Music)(Applause)(Music)(Applause)(Music)(Applause)'

In [13]:
transcripts[3528]

'(Applause)(Music)(Applause)'

In [14]:
has_transcript = talk_df[talk_df.transcript != '--']

In [15]:
has_transcript.reset_index(inplace = True, drop = True)

In [16]:
has_transcript.iloc[1236]

date                                                      Feb 2017
speaker                                         Rodrigo y Gabriela
title                  An electrifying acoustic guitar performance
url              /talks/rodrigo_y_gabriela_an_electrifying_acou...
length                                                        4:17
summ             Guitar duo Rodrigo y Gabriela combine furiousl...
tags             ['music', 'live music', 'performance', 'guitar...
views                                                      2681680
transcript       (Guitar music starts)(Cheers)(Cheers)(Music ends)
date_recorded                                           2015-03-16
upload_date                                             2017-02-14
occupation                                              Guitar duo
bio              Rodrigo y Gabriela fuse metal, jazz and world ...
comments                                                        39
Name: 1236, dtype: object

In [17]:
has_transcript.iloc[2556]

date                                                      Jun 2012
speaker                                            Quixotic Fusion
title                                           Dancing with light
url                      /talks/quixotic_fusion_dancing_with_light
length                                                       12:22
summ             Quixotic Fusion is an ensemble of artists that...
tags             ['culture', 'dance', 'entertainment', 'technol...
views                                                      1589575
transcript       (Music)(Applause)(Music)(Applause)(Music)(Appl...
date_recorded                                           2012-02-28
upload_date                                             2012-06-01
occupation                                    Performance ensemble
bio              Anthony Magliano and Mica Thomas are the found...
comments                                                       109
Name: 2556, dtype: object

In [18]:
has_transcript.iloc[3528]

date                                                      Oct 2007
speaker                                              Kenichi Ebina
title                                               My magic moves
url                            /talks/kenichi_ebina_my_magic_moves
length                                                        3:32
summ             Kenichi Ebina moves his body in a manner that ...
tags             ['culture', 'dance', 'entertainment', 'perform...
views                                                      1921963
transcript                             (Applause)(Music)(Applause)
date_recorded                                           2007-03-03
upload_date                                             2007-10-03
occupation                                                  Dancer
bio              2013 "America's Got Talent" winner Kenichi Ebi...
comments                                                       103
Name: 3528, dtype: object

### Drop Ted Talks that don't have transcript

In [19]:
has_transcript.drop([1236, 2556, 3528], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [20]:
has_transcript.shape

(3646, 14)

# Convert duration to seconds

In [22]:
has_transcript.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3646 entries, 0 to 3648
Data columns (total 14 columns):
date             3646 non-null object
speaker          3646 non-null object
title            3646 non-null object
url              3646 non-null object
length           3646 non-null object
summ             3646 non-null object
tags             3646 non-null object
views            3646 non-null object
transcript       3646 non-null object
date_recorded    3646 non-null object
upload_date      3646 non-null object
occupation       3223 non-null object
bio              3235 non-null object
comments         3646 non-null object
dtypes: object(14)
memory usage: 427.3+ KB


In [21]:
has_transcript.head()

Unnamed: 0,date,speaker,title,url,length,summ,tags,views,transcript,date_recorded,upload_date,occupation,bio,comments
0,Jan 2020,Ipsita Dasgupta,"To challenge the status quo, find a ""co-conspi...",/talks/ipsita_dasgupta_to_challenge_the_status...,11:03,"In a complex and changing world, how can we ma...","['innovation', 'collaboration', 'society', 'so...",599446,So I've been thinking about how to explain thi...,2019-09-24,2020-01-02,"Business executive, ""co-conspirator""",Ipsita Dasgupta drives the consumption of ente...,7
1,Jan 2020,Rod Phillips,A brief history of alcohol,/talks/rod_phillips_a_brief_history_of_alcohol,4:56,Nobody knows exactly when humans began to crea...,"['TED-Ed', 'education', 'animation', 'history'...",501290,This chimpanzee stumbles across a windfall of ...,2020-01-02,2020-01-02,,,--
2,Jan 2020,Pat Mitchell,Dangerous times call for dangerous women,/talks/pat_mitchell_dangerous_times_call_for_d...,17:14,Pat Mitchell has nothing left to prove and muc...,"['women', 'women in business', 'community', 'a...",461705,"Recently, I've been declaring to anyone who wo...",2019-12-04,2020-01-02,Dangerous woman,Pat Mitchell is a lifelong advocate for women ...,14
3,Dec 2019,Cara E. Yar Khan,The beautiful balance between courage and fear,/talks/cara_e_yar_khan_the_beautiful_balance_b...,9:55,After being diagnosed with a rare genetic cond...,"['fear', 'personal growth', 'health', 'life', ...",880662,"When we're young, we're innocently brave, and ...",2019-12-04,2019-12-23,Human rights and disability activist,Cara E. Yar Khan is an international human rig...,17
4,Dec 2019,Valorie Kondos Field,Why winning doesn't always equal success,/talks/valorie_kondos_field_why_winning_doesn_...,15:49,Valorie Kondos Field knows a lot about winning...,"['success', 'sports', 'leadership', 'empathy',...",956498,"OK, I have a question for all of us. You ready...",2019-12-04,2019-12-20,Gymnastics coach,Valorie Kondos Field is the retired head coach...,18


In [23]:
has_transcript.length.isna().sum()

0

In [24]:
has_transcript['duration'] = [int(length.split(':')[0]) * 60 + int(length.split(':')[1]) for length in has_transcript.length]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
has_transcript.head()

Unnamed: 0,date,speaker,title,url,length,summ,tags,views,transcript,date_recorded,upload_date,occupation,bio,comments,duration
0,Jan 2020,Ipsita Dasgupta,"To challenge the status quo, find a ""co-conspi...",/talks/ipsita_dasgupta_to_challenge_the_status...,11:03,"In a complex and changing world, how can we ma...","['innovation', 'collaboration', 'society', 'so...",599446,So I've been thinking about how to explain thi...,2019-09-24,2020-01-02,"Business executive, ""co-conspirator""",Ipsita Dasgupta drives the consumption of ente...,7,663
1,Jan 2020,Rod Phillips,A brief history of alcohol,/talks/rod_phillips_a_brief_history_of_alcohol,4:56,Nobody knows exactly when humans began to crea...,"['TED-Ed', 'education', 'animation', 'history'...",501290,This chimpanzee stumbles across a windfall of ...,2020-01-02,2020-01-02,,,--,296
2,Jan 2020,Pat Mitchell,Dangerous times call for dangerous women,/talks/pat_mitchell_dangerous_times_call_for_d...,17:14,Pat Mitchell has nothing left to prove and muc...,"['women', 'women in business', 'community', 'a...",461705,"Recently, I've been declaring to anyone who wo...",2019-12-04,2020-01-02,Dangerous woman,Pat Mitchell is a lifelong advocate for women ...,14,1034
3,Dec 2019,Cara E. Yar Khan,The beautiful balance between courage and fear,/talks/cara_e_yar_khan_the_beautiful_balance_b...,9:55,After being diagnosed with a rare genetic cond...,"['fear', 'personal growth', 'health', 'life', ...",880662,"When we're young, we're innocently brave, and ...",2019-12-04,2019-12-23,Human rights and disability activist,Cara E. Yar Khan is an international human rig...,17,595
4,Dec 2019,Valorie Kondos Field,Why winning doesn't always equal success,/talks/valorie_kondos_field_why_winning_doesn_...,15:49,Valorie Kondos Field knows a lot about winning...,"['success', 'sports', 'leadership', 'empathy',...",956498,"OK, I have a question for all of us. You ready...",2019-12-04,2019-12-20,Gymnastics coach,Valorie Kondos Field is the retired head coach...,18,949


In [26]:
has_transcript.to_csv('has_transcript.csv')

### Drop Ted Talks that don't have views listed or 0 views

In [27]:
has_transcript.views.value_counts()

--          39
0            8
1477907      2
1367741      2
2393761      2
1014756      1
1946761      1
657035       1
2622571      1
2035973      1
1288142      1
1048772      1
1441928      1
1753981      1
1992913      1
1308264      1
362095       1
2726173      1
3737550      1
5101478      1
1303476      1
790363       1
1605628      1
1981445      1
3013782      1
1723666      1
1617695      1
1543941      1
867110       1
11788235     1
            ..
667540       1
1394472      1
7766684      1
517391       1
4275390      1
618705       1
3169127      1
484988       1
2169686      1
2227977      1
1347395      1
1757855      1
1050151      1
346978       1
24255597     1
1872524      1
1182517      1
9612471      1
2001358      1
1028565      1
1572030      1
6632702      1
2111755      1
895723       1
3072362      1
1749826      1
2465206      1
1585904      1
958577       1
18011        1
Name: views, Length: 3598, dtype: int64

In [28]:
has_views = has_transcript[(has_transcript.views != '--')]

In [29]:
has_views.views = has_views.views.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [30]:
has_views = has_views[has_views.views > 0]

In [31]:
has_views.to_csv('has_views.csv')