In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import pickle
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.probability import FreqDist

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

from sklearn.pipeline import Pipeline

from gensim import corpora, models, similarities, matutils

import re
import string

from wordcloud import WordCloud, STOPWORDS

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors



# call glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
from gensim.scripts.glove2word2vec import glove2word2vec


In [2]:

glove_file = datapath('/Users/robinleoknauth/downloads/glove/glove.6B.300d.txt')
tmp_file = get_tmpfile("glove_word2vec.txt")

glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)

In [4]:
df_merged = pd.read_pickle("./data/df_merged_cleaned.pkl")

In [40]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
clean_parens = lambda x: re.sub(r'\([^)]+\)', ' ', x)

# df_merged['text'] = df_merged.transcript.map(clean_parens).map(punc_lower).map(alphanumeric)

### Tokenization

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
df_merged['tokenized_text'] = df_merged['text'].apply(word_tokenize)

In [7]:
df_merged.head(3)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript,text,tokenized_text
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...,good morning how are you it s been great ha...,"[good, morning, how, are, you, it, s, been, gr..."
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre...",thank you so much chris and it s truly a gre...,"[thank, you, so, much, chris, and, it, s, trul..."
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292,"(Music: ""The Sound of Silence,"" Simon & Garfun...",hello voice mail my old friend i ve called ...,"[hello, voice, mail, my, old, friend, i, ve, c..."


### Removing Stop Words

In [8]:
df_merged['tokenized_text'] = df_merged['tokenized_text'].apply(lambda x: [word for word in x if word not in stop_words])



## Lemmatization

In [9]:
lemmatizer=WordNetLemmatizer()


df_merged['lemmatized_text'] = df_merged['tokenized_text'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

In [10]:
df_merged.tail(2)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,...,related_talks,speaker_occupation,tags,title,url,views,transcript,text,tokenized_text,lemmatized_text
2465,32,In an unmissable talk about race and politics ...,1100,TEDxMileHigh,1499472000,1,Theo E.J. Wilson,Theo E.J. Wilson: A black man goes undercover ...,1,1506024042,...,"[{'id': 2512, 'hero': 'https://pe.tedcdn.com/i...",Public intellectual,"['Internet', 'TEDx', 'United States', 'communi...",A black man goes undercover in the alt-right,https://www.ted.com/talks/theo_e_j_wilson_a_bl...,419309,I took a cell phone and accidentally made myse...,i took a cell phone and accidentally made myse...,"[took, cell, phone, accidentally, made, famous...","[took, cell, phone, accidentally, made, famous..."
2466,8,With more than half of the world population li...,519,TED2017,1492992000,1,Karoliina Korppoo,Karoliina Korppoo: How a video game might help...,1,1506092422,...,"[{'id': 2682, 'hero': 'https://pe.tedcdn.com/i...",Game designer,"['cities', 'design', 'future', 'infrastructure...",How a video game might help us build better ci...,https://www.ted.com/talks/karoliina_korppoo_ho...,391721,"We humans are becoming an urban species, so ci...",we humans are becoming an urban species so ci...,"[humans, becoming, urban, species, cities, nat...","[human, becoming, urban, specie, city, natural..."


join lemmatized words to string for processing.

In [11]:
df_merged['lemma_text_string'] = df_merged['lemmatized_text'].apply(', '.join)

In [None]:
# df_topics_token = pipe.transform(df_merged['tokenized_text']) 
# df_topics_token = pd.DataFrame(df_topics_token, columns=[str(t_words[i]) for i in range(0,10)])
# df_topics_token.head()

In [12]:
def display_topics(model, feature_names, number_top_words):
        for ix, topic in enumerate(model.components_):
            print("Topic: ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-number_top_words - 1:-1]]))
    

In [13]:
def create_topics(model, feature_names, number_top_words):
    result = list()
    for ix, topic in enumerate(model.components_):
#         result.append(" ".join([feature_names[i]
#                     for i in topic.argsort()[:-number_top_words - 1:-1]]))
        result.append([feature_names[i]
                    for i in topic.argsort()[:-number_top_words - 1:-1]])
    return result

In [14]:
def make_topic_mod_lda(data = df_merged['lemma_text_string'], topics = 5,
                       iters = 10, ngram_min = 1,
                       ngram_max = 3, max_df=0.35,
                       min_df = 0.1,
                       max_feats=5000, number_top_words = 20,
                       seed = 0):
    
    """ 
    vectorizer - turn words into numbers for each document
    use Latent Dirichlet Allocation to produce topics
    """
    
    
    vectorizer = CountVectorizer(ngram_range = (ngram_min , ngram_max), 
                             stop_words ='english', 
                             max_df = max_df, 
                             max_features = max_feats)
    
#     vectorizer = TfidfVectorizer(stop_words="english",
# #                         use_idf=True,
#                         ngram_range = (ngram_min , ngram_max),
#                         min_df = min_df,
                             
#                         max_df = max_df,
#                         max_features = max_feats,
#                         )  
    
    
    #  `fit (train), then transform` to convert text to a bag of words

    vect_data = vectorizer.fit_transform(data)
    
    
    
    lda = LatentDirichletAllocation(n_components = topics,
                                    max_iter = iters,
                                    random_state = seed,
                                    learning_method = 'online',
                                    n_jobs =- 1,
                                    )
    
    lda_data = lda.fit_transform(vect_data)
    
    display_topics(lda, vectorizer.get_feature_names(), number_top_words)
    
    return vectorizer, vect_data, lda, lda_data

In [138]:
vectorizer_lda, vect_data, lda_model_lemma, lda_data_lemma = make_topic_mod_lda(data = df_merged['lemma_text_string'],
                                    topics=20,
                                    iters=100,
                                    ngram_min=1, 
                                    ngram_max=1, 
                                    max_df=0.5, 
                                    min_df=0.1,
                                    max_feats=2000
                                                                               )

Topic  0
water ocean earth sea foot surface ice planet mar robot fish air fly mile space cloud coral meter shark satellite
Topic  1
city design building space project create built build community street designer material public architecture house art live wall form york
Topic  2
africa african country aid south knowledge poor village continent leader nigeria poverty kenya region opportunity west community farmer east rural
Topic  3
car ca em road power drive hour vehicle mile driving yeah driver energy technology speed percent electric chris engine traffic
Topic  4
political group power society american social rule democracy believe community war culture movement muslim history reason moral election value wrong
Topic  5
child school kid student teacher education family parent learning learn class high old percent teach young college help girl classroom
Topic  6
company money dollar business percent market job product value cost million economy pay buy innovation industry example choice

In [139]:
pd.DataFrame(lda_data_lemma).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,8.3e-05,8.3e-05,8.3e-05,8.3e-05,8.3e-05,0.27804,0.0246,0.021508,8.3e-05,0.072261,0.024452,0.208052,8.3e-05,8.3e-05,0.129159,0.11061,0.130402,8.3e-05,8.3e-05,8.3e-05
1,0.000108,0.038549,0.02402,0.097305,0.128625,0.000108,0.188545,0.000108,0.000108,0.000108,0.000108,0.234207,0.232263,0.000108,0.000108,0.042473,0.000108,0.000108,0.012819,0.000108
2,7.2e-05,7.2e-05,7.2e-05,0.016996,7.2e-05,7.2e-05,0.137799,7.2e-05,7.2e-05,7.2e-05,7.2e-05,0.111167,7.2e-05,0.414792,0.190844,0.07374,0.011755,7.2e-05,0.02547,0.016641
3,6.1e-05,0.313576,0.048592,0.015511,0.059395,0.026528,0.107254,6.1e-05,6.1e-05,6.1e-05,6.1e-05,0.123169,0.143243,6.1e-05,6.1e-05,6.1e-05,0.023341,6.1e-05,6.1e-05,0.138786
4,7.3e-05,7.3e-05,0.132381,7.3e-05,7.3e-05,0.10077,0.037371,0.018579,7.3e-05,7.3e-05,7.3e-05,7.3e-05,7.3e-05,0.129651,7.3e-05,7.3e-05,7.3e-05,0.019522,0.011534,0.549311


In [227]:
topic_ind = np.argmax(lda_data_lemma, axis=1)
topic_ind.shape
y = topic_ind

# create text labels for plotting
tsne_labels = pd.DataFrame(y)

In [18]:
# topic_names = tsne_labels
# topic_names[topic_names==0] = "History"
# topic_names[topic_names==1] = "Medicine, Vaccines, Global Health"
# topic_names[topic_names==2] = "Education"
# topic_names[topic_names==3] = "Family"
# topic_names[topic_names==4] = "Politics, War"
# topic_names[topic_names==5] = "Technolgy"
# topic_names[topic_names==6] = "Gender"
# topic_names[topic_names==7] = "Astronomy, Quantum Physics"
# topic_names[topic_names==8] = "Machine Learning, AI"

# topic_names[topic_names==9] = "Gaming, Music, Video"
# topic_names[topic_names==10] = "Tech, Business"
# topic_names[topic_names==11] = "Biology, Genetics"

# topic_names[topic_names==12] = "Medicine, Healthcare"
# topic_names[topic_names==13] = "Energy, Transportation, Climate Change"

# topic_names[topic_names==14] = "Astronomy, Space Travel"
# topic_names[topic_names==15] = "Art, Language, Literature"  
# topic_names[topic_names==16] = "Environmentalism, Oceans"
# topic_names[topic_names==17] = "Mindfulness, Culture, Self Care"
# topic_names[topic_names==18] = "Urban Dev, Architecture"
# topic_names[topic_names==19] = "Economy, Global Econ, Development"

In [145]:
topics_lda = create_topics(lda_model_lemma, vectorizer_lda.get_feature_names(), 20)

In [146]:
topics_lda

[['water',
  'ocean',
  'earth',
  'sea',
  'foot',
  'surface',
  'ice',
  'planet',
  'mar',
  'robot',
  'fish',
  'air',
  'fly',
  'mile',
  'space',
  'cloud',
  'coral',
  'meter',
  'shark',
  'satellite'],
 ['city',
  'design',
  'building',
  'space',
  'project',
  'create',
  'built',
  'build',
  'community',
  'street',
  'designer',
  'material',
  'public',
  'architecture',
  'house',
  'art',
  'live',
  'wall',
  'form',
  'york'],
 ['africa',
  'african',
  'country',
  'aid',
  'south',
  'knowledge',
  'poor',
  'village',
  'continent',
  'leader',
  'nigeria',
  'poverty',
  'kenya',
  'region',
  'opportunity',
  'west',
  'community',
  'farmer',
  'east',
  'rural'],
 ['car',
  'ca',
  'em',
  'road',
  'power',
  'drive',
  'hour',
  'vehicle',
  'mile',
  'driving',
  'yeah',
  'driver',
  'energy',
  'technology',
  'speed',
  'percent',
  'electric',
  'chris',
  'engine',
  'traffic'],
 ['political',
  'group',
  'power',
  'society',
  'american',
  'so

In [219]:
tsne_labels.head(7)


Unnamed: 0,0
0,child school kid student teacher education fam...
1,story man home wanted old took told saw family...
2,data technology computer information machine u...
3,city design building space project create buil...
4,country government state percent global united...
5,woman love men feel girl experience live frien...
6,story man home wanted old took told saw family...


In [196]:
# remove_whitespace = lambda x: re.sub('^\s+|\s+$|\s+(?=\s)', ' ', x)

In [213]:
remove_whitespace = lambda x: re.sub(' +', ' ', x)

In [228]:
topic_names = tsne_labels

for i in list( range(0, len(topics_lda))):
    topic_names[topic_names == i] = ' '.join(topics_lda[ i ])


topic_names.head()

Unnamed: 0,0
0,child school kid student teacher education fam...
1,story man home wanted old took told saw family...
2,data technology computer information machine u...
3,city design building space project create buil...
4,country government state percent global united...


In [209]:
# topic_names['topic_names'] = topic_names[0]

In [211]:
# topic_names['topic_names'] = topic_names.topic_names.map(remove_whitespace)

In [231]:
# topic_names.head()

In [230]:
# topic_names.to_pickle('./data/topic_names.pkl')

In [22]:
df_merged.tags

0       ['children', 'creativity', 'culture', 'dance',...
1       ['alternative energy', 'cars', 'climate change...
2       ['computers', 'entertainment', 'interface desi...
3       ['MacArthur grant', 'activism', 'business', 'c...
4       ['Africa', 'Asia', 'Google', 'demo', 'economic...
5       ['business', 'culture', 'entertainment', 'goal...
6       ['Christianity', 'God', 'atheism', 'comedy', '...
7       ['architecture', 'collaboration', 'culture', '...
8       ['God', 'TED Brain Trust', 'atheism', 'brain',...
9       ['Christianity', 'God', 'culture', 'happiness'...
10      ['activism', 'architecture', 'collaboration', ...
11      ['TED Prize', 'art', 'culture', 'entertainment...
12      ['TED Prize', 'collaboration', 'disease', 'ebo...
13      ['demo', 'design', 'interface design', 'techno...
14      ['children', 'design', 'education', 'entrepren...
15      ['entertainment', 'music', 'performance', 'vio...
16      ['creativity', 'entertainment', 'music', 'perf...
17      ['MacA

In [30]:
model[topics_lda[0][0]]

array([-6.7916e-01, -1.2207e-01, -2.2908e-01,  3.4682e-01,  3.0810e-01,
       -4.8354e-01, -1.2872e-01, -4.6677e-02, -3.6068e-02, -2.1610e+00,
        2.7579e-01,  1.1901e-01,  1.6028e-01, -6.1450e-02,  1.1554e-01,
       -3.5981e-01,  4.8871e-01,  7.9839e-02,  3.3019e-02,  2.9834e-01,
       -9.9063e-02,  6.6332e-01,  3.3691e-01,  5.7280e-02, -2.9128e-01,
        2.7535e-01,  5.6892e-01,  2.5413e-01, -4.3233e-02,  2.0520e-02,
        5.0779e-02,  5.4505e-01, -4.3760e-01,  1.0415e-01, -6.3288e-01,
        1.4536e-01,  3.9711e-01,  4.3029e-01, -1.2240e-01, -6.6137e-02,
        1.0925e-01, -5.0684e-01,  3.9058e-01, -5.6568e-01, -4.3140e-01,
       -1.4120e-01, -1.9887e-01,  3.1592e-01, -3.6014e-01, -4.1200e-02,
        6.4403e-02,  8.0507e-02, -1.7772e-02, -6.5428e-01,  1.6071e-01,
       -3.9521e-01,  1.4980e-01, -5.0789e-01, -4.1719e-01,  4.7289e-02,
        4.4494e-01, -2.7687e-01,  2.2294e-01,  1.7580e-02, -2.1816e-01,
       -3.1820e-01,  2.9193e-01,  6.4577e-01,  2.4003e-01, -3.70

In [220]:
df_merged.iloc[0].tags

"['children', 'creativity', 'culture', 'dance', 'education', 'parenting', 'teaching']"

In [87]:
tags = df_merged.tags

In [88]:
# tags = tags.str.split(',')

In [89]:
tags = tags.apply(punc_lower)

In [90]:
tags = tags.apply(lambda x: x.strip())

In [91]:
tags = tags.apply(lambda x: re.sub('^\s+|\s+$', '', x))

In [92]:
tags = tags.apply(lambda x: re.sub(' +', ' ', x))

In [93]:
tags = tags.apply(lambda x: x.split(' '))

In [94]:
tags[0]

['children',
 'creativity',
 'culture',
 'dance',
 'education',
 'parenting',
 'teaching']

## remove words from tags that are not in word2vec

In [125]:
l = list( range(0, 2467) )

In [124]:
for i in l:
    tags[i] = [word for word in model.vocab]

## calculate centroids for the tags

NOTE: This might take a long time. With a 2017 MacBood Pro it took around 3.5 hours. You might want to just import the pickle instead.

In [126]:
# centroids = list()
# for i in l:
#     centroids.append(np.mean(model[tags[i]], axis=0))
    
# centroids = np.array(centroids)
# centroids = centroids.reshape((2467, 300))

In [127]:
centroids

array([[ 9.2382111e-02, -8.3333418e-02, -4.7429472e-05, ...,
         2.1872786e-01,  1.8184374e-01, -6.7022957e-02],
       [ 9.2382111e-02, -8.3333418e-02, -4.7429472e-05, ...,
         2.1872786e-01,  1.8184374e-01, -6.7022957e-02],
       [ 9.2382111e-02, -8.3333418e-02, -4.7429472e-05, ...,
         2.1872786e-01,  1.8184374e-01, -6.7022957e-02],
       ...,
       [ 9.2382111e-02, -8.3333418e-02, -4.7429472e-05, ...,
         2.1872786e-01,  1.8184374e-01, -6.7022957e-02],
       [ 9.2382111e-02, -8.3333418e-02, -4.7429472e-05, ...,
         2.1872786e-01,  1.8184374e-01, -6.7022957e-02],
       [ 9.2382111e-02, -8.3333418e-02, -4.7429472e-05, ...,
         2.1872786e-01,  1.8184374e-01, -6.7022957e-02]], dtype=float32)

In [129]:
centroids = pd.DataFrame(centroids)

In [130]:
# centroids.to_pickle('./data/centrodis_tags.pkl')

## calculate centroids for the topics of LDA

In [149]:
len_topics_lda = list( range(0, len(topics_lda)))
topic_centroids = list()
for i in len_topics_lda:
    topic_centroids.append(np.mean(model[topics_lda[i]], axis=0))
    
topic_centroids = np.array(topic_centroids)
topic_centroids = topic_centroids.reshape((len(len_topics_lda), 300))

In [161]:
topic_centroids[0]

array([ 1.91942956e-02, -2.94518527e-02, -1.77461311e-01, -1.98520631e-01,
       -1.79088175e-01,  1.19112037e-01,  9.54154693e-03,  1.51460171e-01,
        1.85122803e-01, -1.14885700e+00,  3.40971947e-01, -9.17031989e-02,
        4.19294536e-02, -9.66227502e-02,  1.39629720e-02,  1.98253140e-01,
        9.09077935e-03,  2.91654259e-01, -1.43830642e-01,  4.42994982e-01,
       -2.54677534e-01,  1.33470193e-01, -4.68761101e-02,  2.79659092e-01,
        5.14540598e-02,  9.01829079e-02,  7.17936605e-02,  2.07484007e-01,
       -2.74153322e-01,  1.41550899e-01,  1.91401213e-01,  1.42637357e-01,
       -4.24818814e-01, -2.66560435e-01, -5.92478514e-02,  1.03942834e-01,
        8.48085620e-03, -1.21008590e-01, -3.17262001e-02,  4.37371075e-01,
       -2.69010752e-01,  1.46936670e-01,  1.06667958e-01,  2.61291325e-01,
       -1.68888390e-01,  9.48779956e-02,  3.81530732e-01,  1.32891461e-01,
        1.67832226e-01, -6.17890060e-02,  8.38654581e-03,  5.88715682e-03,
       -3.20000462e-02, -

In [164]:
model.most_similar(positive=[topic_centroids[0]], topn=5)

[('ocean', 0.7241881489753723),
 ('sea', 0.6944864988327026),
 ('surface', 0.6720656156539917),
 ('earth', 0.6614603400230408),
 ('water', 0.6562643647193909)]

In [168]:
topic_names = []

In [170]:
le = list( range(0, len(topic_centroids)) )

In [172]:
for i in le:
    print(model.most_similar(positive=[topic_centroids[i]], topn = 3))
    topic_names.append(model.most_similar(positive=[topic_centroids[i]], topn = 3))

[('ocean', 0.7241881489753723), ('sea', 0.6944864988327026), ('surface', 0.6720656156539917)]
[('building', 0.7508234977722168), ('new', 0.6651170253753662), ('built', 0.6584510803222656)]
[('africa', 0.7412379384040833), ('country', 0.7243189811706543), ('african', 0.7052187919616699)]
[('car', 0.7246485948562622), ('driving', 0.7131131887435913), ('vehicle', 0.677789568901062)]
[('political', 0.7203155755996704), ('that', 0.6875946521759033), ('what', 0.6868358850479126)]
[('school', 0.7647520899772644), ('students', 0.7405155897140503), ('teacher', 0.7326071262359619)]
[('cost', 0.7275770306587219), ('companies', 0.7208005785942078), ('market', 0.7180787324905396)]
[('animal', 0.7121767401695251), ('animals', 0.7056883573532104), ('bird', 0.6857052445411682)]
[('cells', 0.7295016646385193), ('genetic', 0.7097228169441223), ('genes', 0.7032840251922607)]
[('kind', 0.6999107003211975), ('how', 0.697600245475769), ('mind', 0.6975491046905518)]
[('earth', 0.7260396480560303), ('planet',

## Saving the lda_model as pickle

In [152]:
pkl_filepath_model = "./data/lda_model.pkl"  
with open(pkl_filepath_model, 'wb') as file:  
    pickle.dump(lda_model_lemma, file)

In [153]:
pkl_filepath_lda_data = "./data/lda_model_data.pkl"  
with open(pkl_filepath_lda_data, 'wb') as file:  
    pickle.dump(lda_data_lemma, file)

In [165]:
pkl_filepath_vectorizer = "./data/vectorizer.pkl"  
with open(pkl_filepath_vectorizer, 'wb') as file:  
    pickle.dump(vectorizer_lda, file)