# Preprocessing TED Talk Corpus Data

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import re
import textstat
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [None]:
# read the raw transcript table and the raw talk-metadata table
tr = pd.read_csv(filepath_or_buffer='../data/raw/transcript_data.csv')
td = pd.read_csv(filepath_or_buffer='../data/raw/talk_data.csv')

## Merge Data

In [None]:
# talks whose titles are not well-defined are excluded from both tables
rem_talks = ['None', 'My wish', 'If superpowers were real', 'Ugly History', 'Demo']

# the talk table stores the title in `talk_name`, the transcript table in `title`
td = td.loc[~td['talk_name'].isin(rem_talks)]
tr = tr.loc[~tr['title'].isin(rem_talks)]

In [None]:
# join talk metadata with transcripts on the talk title (pandas' default
# inner join), then discard the redundant `title` key column
ted = pd.merge(td, tr, left_on='talk_name', right_on='title').drop(columns=['title'])

## Filter Data

**Filtering operations**

1. Remove TED-Ed videos as these are animated and not delivered on the stage
2. Remove talks whose event name does not contain "TED"
3. Remove talks that are too long (set the cut-off point at 30 minutes or 1800 seconds)
4. Remove talks that can't be placed in a specific category of TED talks (Check https://en.wikipedia.org/wiki/TED_(conference))
5. Remove talks with transcripts of 15 sentences or fewer (by inspection, it was found that these were artistic performances)

In [None]:
# remove ted ed videos
ted = ted[ted['event'] != 'TED-Ed']

# remove talks that do not have the name TED in the event title.
# A boolean mask replaces the temporary `ted_ev` column, so nothing is
# written into a filtered slice of the frame. The match is a plain,
# case-sensitive substring test (regex=False), and na=False treats a
# missing event name as "no match" instead of raising a TypeError.
ted = ted[ted['event'].str.contains('TED', regex=False, na=False)]

# remove very long talks (anything above 1800 seconds, i.e. 30 minutes)
ted = ted[ted['duration'] <= 1800]

In [None]:
# every distinct event name still present in the corpus
events = list(ted.event.unique())


def _events_with(*needles):
    """
    Return the events whose name contains a needle, grouped needle by needle
    (an event matching several needles appears once per matching needle)
    """

    return [e for needle in needles for e in events if needle in e]


# categories
events_tedx = _events_with('TEDx')
events_salon = _events_with('Salon')
events_women = _events_with('TEDWomen')
events_summit = _events_with('Summit')
events_youth = _events_with('TED-Ed Weekend', 'Youth')  # TEDYouth renamed to TED-Ed Weekend, check site
events_partnerships = _events_with('TED@')
events_med = _events_with('TEDMED')
events_global = _events_with('Global', 'TEDIndia')
events_active = _events_with('Active')  # stopped in 2015
events_res = ['TED Residency']
events_main = _events_with('TED1', 'TED2', 'TED Talks Education')

# special: every category other than the main conference and TEDx
events_sp = (
    events_res
    + events_youth
    + events_active
    + events_global
    + events_women
    + events_partnerships
    + events_med
    + events_summit
    + events_salon
)

# events to remove: those that matched none of the categories above
events_rem = list(set(events) - set(events_main + events_sp + events_tedx))

# keep only the talks whose event belongs to at least one category
ted = ted[~ted.event.isin(events_rem)]

In [None]:
# derive a categorical feature holding the TED category of each talk

def assign_category(event):
    """
    Return the category label of an event name.

    Categories are checked in the order listed below and the first one
    whose event list contains `event` wins; None is returned when the
    event is in none of the lists.
    """

    # (label, member events) pairs in lookup order
    categories = (
        ('TEDx', events_tedx),
        ('TED Salon', events_salon),
        ('TED Women', events_women),
        ('TED Summit', events_summit),
        ('TED-Ed Weekend', events_youth),
        ('TED Institute', events_partnerships),
        ('TED MED', events_med),
        ('TED Global', events_global),
        ('TED Active', events_active),
        ('TED Residency', events_res),
        ('TED Main', events_main),
    )

    return next(
        (label for label, members in categories if event in members),
        None,
    )

ted['event_cat'] = ted['event'].apply(assign_category)

## Calculate basic descriptive statistics

The purpose of this section is to continue the exploratory data analysis by computing basic text statistics for each transcript (syllable, word, and sentence counts) and using them to clean the data so that only entries that are actually talks remain. For example, if a transcript contains very few sentences, it most likely belongs to a musical performance rather than a talk.

In [None]:
# a parenthesised run of ASCII letters (possibly empty), e.g. "(Laughter)";
# markers containing spaces or other characters are not matched
pattern = r"(\([A-Za-z]*\))"


def rep_markers(x):
    """
    Strip single-word transcript markers such as (Laughter) from the text
    """

    cleaned = re.sub(pattern, '', x)
    return cleaned

def syll_cnt(x):
    """
    Syllable count of the text once markers are removed by rep_markers
    """

    text = rep_markers(x)
    return textstat.syllable_count(text)

def word_cnt(x):
    """
    Word count (textstat lexicon count) of the text once markers are removed by rep_markers
    """

    text = rep_markers(x)
    return textstat.lexicon_count(text)

def sent_cnt(x):
    """
    Sentence count of the text once markers are removed by rep_markers
    """

    text = rep_markers(x)
    return textstat.sentence_count(text)

In [None]:
# text units: one count column per transcript statistic
for col, counter in (('syll', syll_cnt), ('words', word_cnt), ('sent', sent_cnt)):
    ted[col] = ted['transcript'].apply(counter)

# remove talks that are performances (only transcripts with more than 15 sentences are kept)
ted = ted.loc[ted['sent'] > 15]

## Process date and time

In [None]:
def get_year(x):
    """
    Return the year of a date-like object (anything with strftime) as an int
    """

    year_text = x.strftime("%Y")
    return int(year_text)

def get_month(x):
    """
    Return the month number (1-12) of a date-like object (anything with strftime) as an int
    """

    month_text = x.strftime("%m")
    return int(month_text)

# parse the recording date; the publication time is parsed as epoch seconds
ted['recorded_at'] = pd.to_datetime(ted['recorded_at'])
published = pd.to_datetime(ted['published on'], unit="s")
ted['published on'] = published

# publication year and month as separate integer features
ted["p_year"] = published.apply(get_year)
ted["p_month"] = published.apply(get_month)

In [None]:
# write the preprocessed corpus to the interim data folder; the row index
# is kept as the first CSV column (the pandas default, spelled out here)
ted.to_csv(path_or_buf='../data/interim/ted_preprocessed.csv', index=True)