In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import pickle
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.probability import FreqDist

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA

from sklearn.manifold import TSNE

from sklearn.pipeline import Pipeline

from gensim import corpora, models, similarities, matutils

import re
import string

from wordcloud import WordCloud, STOPWORDS

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors



# call glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
from gensim.scripts.glove2word2vec import glove2word2vec


In [2]:

glove_file = datapath('/Users/robinleoknauth/downloads/glove/glove.6B.300d.txt')
tmp_file = get_tmpfile("glove_word2vec.txt")

glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)

In [4]:
df_merged = pd.read_pickle("./data/df_merged_cleaned.pkl")

### Tokenization

In [5]:
stop_words = set(stopwords.words('english'))

In [6]:
df_merged['tokenized_text'] = df_merged['text'].apply(word_tokenize)

In [7]:
df_merged.head(3)

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript,text,tokenized_text
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...,good morning how are you it s been great ha...,"[good, morning, how, are, you, it, s, been, gr..."
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre...",thank you so much chris and it s truly a gre...,"[thank, you, so, much, chris, and, it, s, trul..."
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292,"(Music: ""The Sound of Silence,"" Simon & Garfun...",hello voice mail my old friend i ve called ...,"[hello, voice, mail, my, old, friend, i, ve, c..."


### Removing Stop Words

In [8]:
df_merged['tokenized_text'] = df_merged['tokenized_text'].apply(lambda x: [word for word in x if word not in stop_words])



## Lemmatization

In [None]:
lemmatizer=WordNetLemmatizer()


df_merged['lemmatized_text'] = df_merged['tokenized_text'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])

In [None]:
df_merged.tail(2)

join lemmatized words to string for processing.

In [None]:
df_merged['lemma_text_string'] = df_merged['lemmatized_text'].apply(', '.join)

In [None]:
df_topics_token = pipe.transform(df_merged['tokenized_text']) 
df_topics_token = pd.DataFrame(df_topics_token, columns=[str(t_words[i]) for i in range(0,10)])
df_topics_token.head()

In [None]:
def display_topics(model, feature_names, number_top_words):
        for ix, topic in enumerate(model.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-number_top_words - 1:-1]]))
    

In [None]:
def create_topics(model, feature_names, number_top_words):
    result = list()
    for ix, topic in enumerate(model.components_):
#         result.append(" ".join([feature_names[i]
#                     for i in topic.argsort()[:-number_top_words - 1:-1]]))
        result.append([feature_names[i]
                    for i in topic.argsort()[:-number_top_words - 1:-1]])
    return result

In [None]:
def make_topic_mod_lda(data = df_merged['lemma_text_string'], topics = 5,
                       iters = 10, ngram_min = 1,
                       ngram_max = 3, max_df=0.35,
                       min_df = 0.1,
                       max_feats=5000, number_top_words = 20,
                       seed = 0):
    
    """ 
    vectorizer - turn words into numbers for each document
    use Latent Dirichlet Allocation to produce topics
    """
    
    
    vectorizer = CountVectorizer(ngram_range = (ngram_min , ngram_max), 
                             stop_words ='english', 
                             max_df = max_df, 
                             max_features = max_feats)
    
#     vectorizer = TfidfVectorizer(stop_words="english",
# #                         use_idf=True,
#                         ngram_range = (ngram_min , ngram_max),
#                         min_df = min_df,
                             
#                         max_df = max_df,
#                         max_features = max_feats,
#                         )  
    
    
    #  `fit (train), then transform` to convert text to a bag of words

    vect_data = vectorizer.fit_transform(data)
    
    
    
    lda = LatentDirichletAllocation(n_components = topics,
                                    max_iter = iters,
                                    random_state = seed,
                                    learning_method = 'online',
                                    n_jobs =- 1,
                                    )
    
    lda_data = lda.fit_transform(vect_data)
    
    display_topics(lda, vectorizer.get_feature_names(), number_top_words)
    
    return vectorizer, vect_data, lda, lda_data

In [None]:
vectorizer_lda, vect_data, lda_model_lemma, lda_data_lemma = make_topic_mod_lda(data = df_merged['lemma_text_string'],
                                    topics=20,
                                    iters=100,
                                    ngram_min=1, 
                                    ngram_max=2, 
                                    max_df=0.5, 
                                    min_df=0.1,
                                    max_feats=2000
                                                                               )

In [None]:
pd.DataFrame(lda_data_lemma).head()

In [None]:
topic_ind = np.argmax(lda_data_lemma, axis=1)
topic_ind.shape
y = topic_ind

# create text labels for plotting
tsne_labels = pd.DataFrame(y)

In [None]:
# topic_names = tsne_labels
# topic_names[topic_names==0] = "History"
# topic_names[topic_names==1] = "Medicine, Vaccines, Global Health"
# topic_names[topic_names==2] = "Education"
# topic_names[topic_names==3] = "Family"
# topic_names[topic_names==4] = "Politics, War"
# topic_names[topic_names==5] = "Technolgy"
# topic_names[topic_names==6] = "Gender"
# topic_names[topic_names==7] = "Astronomy, Quantum Physics"
# topic_names[topic_names==8] = "Machine Learning, AI"

# topic_names[topic_names==9] = "Gaming, Music, Video"
# topic_names[topic_names==10] = "Tech, Business"
# topic_names[topic_names==11] = "Biology, Genetics"

# topic_names[topic_names==12] = "Medicine, Healthcare"
# topic_names[topic_names==13] = "Energy, Transportation, Climate Change"

# topic_names[topic_names==14] = "Astronomy, Space Travel"
# topic_names[topic_names==15] = "Art, Language, Literature"  
# topic_names[topic_names==16] = "Environmentalism, Oceans"
# topic_names[topic_names==17] = "Mindfulness, Culture, Self Care"
# topic_names[topic_names==18] = "Urban Dev, Architecture"
# topic_names[topic_names==19] = "Economy, Global Econ, Development"

In [None]:
topics_lda = create_topics(lda_model_lemma, vectorizer_lda.get_feature_names(), 20)

In [None]:
topics_lda

In [None]:
tsne_labels


In [None]:
df_merged.tags

In [233]:
model[topics_lda[0]]

array([[-0.43036 , -0.40622 , -0.18279 , ...,  0.028737, -0.42117 ,
         0.27851 ],
       [-0.67916 , -0.12207 , -0.22908 , ...,  0.027425, -0.28656 ,
        -0.030284],
       [-0.67832 , -0.28658 , -0.28904 , ..., -0.1705  , -0.52997 ,
         0.29491 ],
       ...,
       [-0.09774 , -0.53084 , -0.28962 , ...,  0.40103 , -0.11882 ,
         0.1739  ],
       [-0.55598 ,  0.027967, -0.32289 , ...,  0.17415 , -0.12442 ,
         0.1537  ],
       [-0.033888, -0.11745 , -0.023725, ..., -0.1686  ,  0.055808,
         0.25512 ]], dtype=float32)