In [1]:
import re
import pandas as pd
import numpy as np
import nltk

### Get cleaned transcripts

In [48]:
df_all = pd.read_pickle('./data/df_all_lemma.pkl')
print(len(df_all))
df_all.head(1)

2467


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,it been great hasnt it. ive been blown away by...


In [51]:
df_all.iloc[1].tags

"['alternative energy', 'cars', 'climate change', 'culture', 'environment', 'global issues', 'science', 'sustainability', 'technology']"

In [3]:
df_all.transcript[0]

'it been great hasnt it. ive been blown away by the whole thing. in fact im leaving. there have been three theme running through the conference which are relevant to what i want to talk about. one is the extraordinary evidence of human creativity in all of the presentation that weve had and in all of the people here. just the variety of it and the range of it. the second is that it put u in a place where we have no idea whats going to happen in term of the future. no idea how this may play out. i have an interest in education. actually what i find is everybody ha an interest in education. dont you. i find this very interesting. if youre at a dinner party and you say you work in education actually youre not often at dinner party frankly. if you work in education youre not asked. and youre never asked back curiously. thats strange to me. but if you are and you say to somebody you know they say what do you do. and you say you work in education you can see the blood run from their face. th

In [4]:
# select talks tagged with "education"
df_edu = df_all[df_all['tags'].apply(lambda x: 'education' in x)]
print(len(df_edu))

147


In [5]:
docs_all = list(df_all['transcript'])
docs_edu = list(df_edu['transcript'])

In [6]:
# docs_edu[0]

### Get word count for all talks (stop_all) and for edu talks (stop_edu)

In [7]:
# Get most common 5000 words in English
df_5000 = pd.read_pickle('./data/common_5000.pkl')
common_5000 = set(s[3:] for s in list(df_5000['   Word']))
df_5000.head(1)

Unnamed: 0,Rank,Word,Part of speech,Frequency,Dispersion
0,1,the,a,22038615,0.98


In [8]:
from collections import Counter

def get_word_count(docs):

    # Get the content
    content = ' '.join(docs)
    content = re.sub('[^A-Za-z ]+', '', content)  # remove non-alpha chars
    words = content.split()

    # Start counting
    word_count = Counter(words)
    
    return word_count

In [9]:
# get word count for all talks
word_count_all = get_word_count(docs_all)
# get word count for all edu talks
word_count_edu = get_word_count(docs_edu)

In [12]:
# Get most common 5000 words in all TED talks
# Get most common 5000 words in all TED-edu talks
N = 5000
common_all = [word for (word, count) in word_count_all.most_common(N)]
common_edu = [word for (word, count) in word_count_edu.most_common(N)]

### NMF and LDA with standard stop words 'english'

In [35]:
docs = docs_all

# df_temp = df_all[df_all['tags'].apply(lambda x: 'children' in x)]
# docs = list(df_temp['transcript'])
# len(docs)

140

In [36]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

n_samples = 2000
n_features = 1000
n_components = 15
n_top_words = 10
n_gram = 1
alpha = 0.1
stop_choice= 'english'

max_df = 0.3
min_df = 3
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                   ngram_range=(n_gram,n_gram),
                                   max_features=n_features,
                                   stop_words=stop_choice)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                ngram_range=(n_gram,n_gram),
                                max_features=n_features,
                                stop_words=stop_choice)
t0 = time()
tf = tf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=alpha, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=alpha,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf-idf features for NMF...
done in 0.180s.
Extracting tf features for LDA...
done in 0.188s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.055s.

Topics in NMF model (Frobenius norm):
Topic #0: men money class film stuff building car fear art american
Topic #1: baby statistic ball hypothesis language fetus brain milk evidence birth
Topic #2: father dance photograph dad type door watched needed wear authority
Topic #3: autism individual gene genetic cause disorder diagnosis condition brain identify
Topic #4: grade college successful childhood success score summer wait principle principal
Topic #5: refugee camp war parenting coach conflict crisis support grandmother journey
Topic #6: game project video code scratch engineering write interactive cancer paper
Topic #7: english india computer slum experiment educational google area village internet
Topic #8: laptop drop project government rest village teaching televi

In [None]:
# n_samples = 2000
# n_features = 1000
# n_components = 15
# n_top_words = 10
# n_gram = 1
# alpha = 0.1
# stop_choice= 'english'#stop_list[:100]

# max_df = 0.3
# min_df = 3

### NMF and LDA with customized stop words, based on word counts

In [41]:
stops_custom = common_all[:300]# + list(set(common_edu).difference(set(common_5000)))
print(stops_custom)

['the', 'and', 'to', 'a', 'of', 'that', 'in', 'it', 'i', 'is', 'you', 'we', 'this', 'so', 'wa', 'for', 'are', 'have', 'but', 'they', 'on', 'with', 'what', 'can', 'about', 'there', 'be', 'not', 'at', 'all', 'my', 'one', 'do', 'people', 'were', 'like', 'if', 'from', 'our', 'or', 'now', 'just', 'these', 'an', 'he', 'when', 'because', 'thing', 'me', 'out', 'by', 'how', 'them', 'know', 'very', 'more', 'up', 'going', 'had', 'think', 'their', 'time', 'who', 'u', 'your', 'get', 'year', 'thats', 'see', 'would', 'which', 'here', 'really', 'way', 'then', 'world', 'some', 'im', 'ha', 'dont', 'make', 'go', 'into', 'will', 'actually', 'where', 'well', 'want', 'could', 'other', 'no', 'right', 'been', 'look', 'life', 'say', 'those', 'said', 'work', 'something', 'first', 'than', 'need', 'also', 'two', 'take', 'she', 'new', 'even', 'come', 'most', 'lot', 'over', 'much', 'got', 'kind', 'little', 'only', 'back', 'his', 'youre', 'day', 'many', 'theyre', 'did', 'every', 'why', 'good', 'around', 'her', 'let'

In [42]:
from nltk.corpus import stopwords

stops_standard = stopwords.words('english')
stop_list = stops_standard + stops_custom

In [58]:
# docs = docs_edu

df_temp = df_all[df_all['tags'].apply(lambda x: 'children' in x)]
docs = list(df_temp['transcript'])
len(docs)

140

In [59]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

n_samples = 2000
n_features = 2000
n_components = 15
n_top_words = 10
n_gram = 1
alpha = 0.1
stop_choice= stop_list

max_df = 0.3
min_df = 3
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                   ngram_range=(n_gram,n_gram),
                                   max_features=n_features,
                                   stop_words=stop_choice)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                ngram_range=(n_gram,n_gram),
                                max_features=n_features,
                                stop_words=stop_choice)
t0 = time()
tf = tf_vectorizer.fit_transform(docs)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=alpha, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=alpha,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf-idf features for NMF...
done in 0.161s.
Extracting tf features for LDA...
done in 0.180s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=2000...
done in 0.058s.

Topics in NMF model (Frobenius norm):
Topic #0: men stuff class american father film classroom dream art health
Topic #1: baby statistic ball language milk evidence hypothesis fetus detector breast
Topic #2: war arm weapon zone notice armed dying aid roughly security
Topic #3: autism individual gene disorder genetic diagnosis cause spectrum contact condition
Topic #4: college summer scholar grade empowered york principal chore lowincome childhood
Topic #5: refugee camp war syria muslim leadership parenting coach conflict displaced
Topic #6: game video write cancer award interactive player classroom sequence elementary
Topic #7: laptop display drop rest government market village teaching television itll
Topic #8: electrical fire intelligent circuit energy injury inventi

Topic #0: design computer project art machine video object image space book
Topic #1: universe planet earth star galaxy mar space light sun solar
Topic #2: water energy forest plant oil climate tree carbon fuel river
Topic #3: patient cancer disease health doctor drug medical treatment tumor care
Topic #4: data information algorithm web phone map computer decision google machine
Topic #5: city car street space neighborhood community urban map york architecture
Topic #6: brain neuron memory behavior signal activity region pattern consciousness study
Topic #7: robot machine leg intelligence animal control task lab computer moving
Topic #8: music sound song play hear playing piece listen video played
Topic #9: cell dna gene genome virus tissue molecule bacteria disease genetic
Topic #10: government company dollar business africa global economy market social economic
Topic #11: ocean animal fish sea coral shark specie planet creature ice
Topic #12: student teacher education classroom learning class teaching teach math english
Topic #13: girl men mother father boy god parent book war shes
Topic #14: food plant eat farmer feed animal grow growing healthy waste

Topic #0: design computer project machine art video image object space piece
Topic #1: universe planet earth star galaxy mar space solar light sun
Topic #2: cancer patient disease doctor health drug treatment medical care surgery
Topic #3: government company dollar business africa global economy market social economic
Topic #4: data information algorithm computer web phone map decision google machine
Topic #5: city space street neighborhood community urban design map york architecture
Topic #6: robot machine leg intelligence animal control task computer lab moving
Topic #7: music sound song play hear playing piece listen video played
Topic #8: cell dna gene genome tissue virus molecule disease animal bacteria
Topic #9: language english book chinese sound write speak read writing letter
Topic #10: water ocean animal sea fish specie forest ice coral earth
Topic #11: car vehicle mile road drive oil energy driving ca fuel
Topic #12: student teacher education learning class teaching teach math college university
Topic #13: girl men mother father boy god parent war shes felt
Topic #14: food plant eat farmer feed grow waste growing energy healthy


Topic #0: men money class film stuff building car fear art american
Topic #1: baby statistic ball hypothesis language fetus brain milk evidence birth
Topic #2: father dance photograph dad type door watched needed wear authority
Topic #3: autism individual gene genetic cause disorder diagnosis condition brain identify
Topic #4: grade college successful childhood success score summer wait principle principal
Topic #5: refugee camp war parenting coach conflict crisis support grandmother journey
Topic #6: game project video code scratch engineering write interactive cancer paper
Topic #7: english india computer slum experiment educational google area village internet
Topic #8: laptop drop project government rest village teaching television certainly scale
Topic #9: brain disorder helmet skull adolescence football injury cortex gray adolescent
Topic #10: brother yearold song fell pound excited wed wrote key discovered
Topic #11: sex sexual pregnancy pregnant men pleasure partner worth conversation desire
Topic #12: lunch food lady breakfast feeding dollar meal chicken billion eat
Topic #13: nature forest plant specie tree park wild animal national planet
Topic #14: water bottle planet content data method global crisis drink sun

Topic #0: class men college grade american stuff art dream film classroom
Topic #1: baby statistic ball language evidence fetus milk birth consciousness genetic
Topic #2: war arm zone notice dying aid died forced peace democracy
Topic #3: autism individual disorder gene diagnosis genetic cause condition treatment expert
Topic #4: lunch lady feeding chicken meal billion breakfast feed eat staff
Topic #5: refugee camp war coach parenting conflict support crisis grandmother journey
Topic #6: game video write cancer engineering interactive classroom code title player
Topic #7: laptop government drop rest village teaching television certainly scale ok
Topic #8: fire fear energy injury record safety prevent outside began digital
Topic #9: india english slum experiment educational google village area internet urban
Topic #10: father dance photograph dad door type watched needed wear authority
Topic #11: brother yearold song fell pound excited wed key wrote discovered
Topic #12: dna monkey gene animal disease stress textbook signal develop poor
Topic #13: forest nature plant specie tree park wild animal planet national
Topic #14: sex sexual pregnancy pregnant men pleasure partner worth conversation desire
