In [2]:
import re
import pandas as pd
import numpy as np
import nltk

### Load in all transcripts with meta data

In [63]:
# load meta data of talks
df_meta = pd.read_csv('./data/ted_main.csv')
df_meta.head(1)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110


In [64]:
# load full transcripts of talks
df_scripts = pd.read_csv('./data/transcripts.csv')
df_scripts.head(1)

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...


In [65]:
# join meta data with transcripts for all talks
df_all = df_meta.merge(df_scripts, on='url')
print(len(df_all))
df_all.head(1)

2467


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...


In [66]:
# select talks tagged with "education"
df_edu = df_all[df_all['tags'].apply(lambda x: 'education' in x)]
print(len(df_edu))
df_edu.head(1)

147


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...


### Basic processing on the transcripts

In [67]:
docs_all_raw = list(df_all['transcript'])
docs_edu_raw = list(df_edu['transcript'])

In [68]:
len(docs_all_raw[0])

17409

* Remove non-talk words, such as descriptions of audience interaction "(Applause)" "(Laughter)"
* Condense all whitespaces
* Remove non-alpha chars
* Use lemmatizer/stemmer

In [69]:
from nltk.tokenize import sent_tokenize

def cleanup(text, stem_method):
    
    # remove things like '(Applause)', '(Laughter)', etc.
    new_text = re.sub(r'\([\w ]+\)', r'', text)
    
    # remove first two and last one sentence
    new_text = new_text.replace('.','. ')
    new_text = new_text.replace('?','? ')
    new_text = new_text.replace('!','! ')
    new_text = new_text.replace('"','" ')
    sent_list = sent_tokenize(new_text)[2:-1]
    
    # for each sentence, condense all whitespace and remove non-alpha chars
    new_sent_list = []
    for sent in sent_list:
        content = re.sub('\s+', ' ', sent.lower())  # condense all whitespace
        content = re.sub('[^A-Za-z ]+', '', content)  # remove non-alpha chars
    
        # lemmatize words
        words = content.split()
        if stem_method == 'lemma':
            stemmer = nltk.stem.WordNetLemmatizer()
            words = [stemmer.lemmatize(word) for word in words]
        elif stem_method == 'porter':
            stemmer = nltk.stem.PorterStemmer()
            words = [stemmer.stem(word) for word in words]
        new_sent = ' '.join(words)
        new_sent_list.append(new_sent)
    
    return '. '.join(new_sent_list)

In [70]:
docs_edu_porter = [cleanup(doc,'porter') for doc in docs_edu_raw]
docs_all_porter = [cleanup(doc,'porter') for doc in docs_all_raw]
docs_edu_lemma = [cleanup(doc,'lemma') for doc in docs_edu_raw]
docs_all_lemma = [cleanup(doc,'lemma') for doc in docs_all_raw]

In [79]:
# docs_all_lemma[1]

In [82]:
# df = df_all.iloc[:,:]
# df['transcript'] = docs_all_lemma
# df.head()

In [81]:
# df.to_pickle('./data/df_all_lemma.pkl')

### EDA: LDA with TF terms, NMF with TF-IDF terms

In [46]:
docs = docs_edu_lemma

In [47]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

n_samples = 2000
n_features = 1000
n_components = 15
n_top_words = 10
n_gram = 1
alpha = 0.1
stop_choice= 'english'#stop_list[:100]

max_df = 0.3
min_df = 3
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                   ngram_range=(n_gram,n_gram),
                                   max_features=n_features,
                                   stop_words=stop_choice)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                ngram_range=(n_gram,n_gram),
                                max_features=n_features,
                                stop_words=stop_choice)
t0 = time()
tf = tf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=alpha, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=alpha,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf-idf features for NMF...
done in 0.191s.
Extracting tf features for LDA...
done in 0.212s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.091s.

Topics in NMF model (Frobenius norm):
Topic #0: art grade math stuff public information america adult challenge business
Topic #1: video online interactive feedback lecture interaction camera peer youtube discussion
Topic #2: plastic bag water ocean local plant nuclear collect significant map
Topic #3: english india slum delhi language village experiment grandmother indian cloud
Topic #4: food lunch cook meal plant eat sugar dollar farmer feeding
Topic #5: game player team pie reward video brain playing task win
Topic #6: baby ball pregnant evidence hypothesis toy broccoli brain mother blue
Topic #7: project opensource code robot scratch bird engineering block game tool
Topic #8: daughter father afghanistan men village mother sister refugee brother training
Topic #9: m

### Methodology conclusion
* Use TF-IDF with NMF (Frobenius norm) gives the best coherent topic (in both cases: all talks and edu talks)
* Lemmatizer works better than stemmer
* However, lemmatizer does not distinguish verbs in different tenses
* Consider Word2Vec to combine terms into groups, eg. {africa, african}
* Need to refine stop-word list