# Import Packages

In [2]:
# NLP Packages
import nltk
from nltk.corpus import wordnet
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import re 
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, train_test_split

In [3]:
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn

import plotly.offline as py
import plotly.graph_objects as go
py.init_notebook_mode(connected=False)

In [4]:
# Standard Python packages
import numpy as np
import pandas as pd

# Read in Data

In [9]:
# Load from file
import pickle
with open('final_raw_data.pkl', 'rb') as file:
    talk_df = pickle.load(file)

In [10]:
talk_df.head()

Unnamed: 0,date,speaker,title,url,length,summ,tags,views,transcript,date_recorded,upload_date,occupation,bio,comments,event,duration,tag_len,transcript_cc,transcript_wc
0,Jan 2020,Ipsita Dasgupta,"To challenge the status quo, find a ""co-conspi...",/talks/ipsita_dasgupta_to_challenge_the_status...,11:03,"In a complex and changing world, how can we ma...","[innovation, collaboration, society, social ch...",599446.0,So I've been thinking about how to explain thi...,2019-09-24,2020-01-02,"Business executive, ""co-conspirator""",Ipsita Dasgupta drives the consumption of ente...,7.0,TED@BCG Mumbai,663,10,10379,1874
1,Jan 2020,Rod Phillips,A brief history of alcohol,/talks/rod_phillips_a_brief_history_of_alcohol,4:56,Nobody knows exactly when humans began to crea...,"[TED-Ed, education, animation, history, cultur...",501290.0,This chimpanzee stumbles across a windfall of ...,2020-01-02,2020-01-02,,,,TED-Ed,296,7,4107,649
2,Jan 2020,Pat Mitchell,Dangerous times call for dangerous women,/talks/pat_mitchell_dangerous_times_call_for_d...,17:14,Pat Mitchell has nothing left to prove and muc...,"[women, women in business, community, activism...",461705.0,"Recently, I've been declaring to anyone who wo...",2019-12-04,2020-01-02,Dangerous woman,Pat Mitchell is a lifelong advocate for women ...,14.0,TEDWomen 2019,1034,12,10691,1884
3,Dec 2019,Cara E. Yar Khan,The beautiful balance between courage and fear,/talks/cara_e_yar_khan_the_beautiful_balance_b...,9:55,After being diagnosed with a rare genetic cond...,"[fear, personal growth, health, life, humanity...",880662.0,"When we're young, we're innocently brave, and ...",2019-12-04,2019-12-23,Human rights and disability activist,Cara E. Yar Khan is an international human rig...,17.0,TEDWomen 2019,595,7,7658,1373
4,Dec 2019,Valorie Kondos Field,Why winning doesn't always equal success,/talks/valorie_kondos_field_why_winning_doesn_...,15:49,Valorie Kondos Field knows a lot about winning...,"[success, sports, leadership, empathy, compass...",956498.0,"OK, I have a question for all of us. You ready...",2019-12-04,2019-12-20,Gymnastics coach,Valorie Kondos Field is the retired head coach...,18.0,TEDWomen 2019,949,11,10161,1793


In [11]:
talk_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3646 entries, 0 to 3648
Data columns (total 19 columns):
date             3646 non-null object
speaker          3646 non-null object
title            3646 non-null object
url              3646 non-null object
length           3646 non-null object
summ             3646 non-null object
tags             3646 non-null object
views            3607 non-null float64
transcript       3646 non-null object
date_recorded    3557 non-null datetime64[ns]
upload_date      3557 non-null datetime64[ns]
occupation       3223 non-null object
bio              3235 non-null object
comments         3008 non-null float64
event            3646 non-null object
duration         3646 non-null int64
tag_len          3646 non-null int64
transcript_cc    3646 non-null int64
transcript_wc    3646 non-null int64
dtypes: datetime64[ns](2), float64(2), int64(4), object(11)
memory usage: 569.7+ KB


# Train, Test, Split

In [12]:
X = talk_df.transcript # Independent Variable
y = talk_df.views      # Dependent Variable

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [21]:
X_train.to_csv('train_data.csv', header = True)

In [22]:
X_test.to_csv('test_data.csv', header = True)

In [25]:
X_train.shape

(2916,)

In [26]:
X_test.shape

(730,)

# Tokenize Transcripts

## Helper Functions

In [14]:
# Adds in space between punctuation and words
def add_spaces(text):
    
    # Find instances of punctuation followed by two letters
    cleanr = re.compile('[.!?,;][A-Za-z][A-Za-z]')
    no_spaces = re.findall(cleanr, text)

    if len(no_spaces) > 0:
        for match in no_spaces:
            punc = match[0] # get the punctuation mark
            word = match[1:] # get the start of the word
            
            if punc != '?':
                text = re.sub(f"{match}", f"{punc} {word}", text)
            
            # Special case of ?, cannot be escaped
            else:
                text = re.sub(f"[?]{word}", f"? {word}", text)
    
    return text

In [15]:
# Handles numbers with 1) commas, 2) before hyphens, 3) in places (1st, 2nd, 3rd, etc.)

def handle_numbers(text):
    
    # Handle numbers with commas
    clean_commas = re.compile('[0-9]+,[0-9][0-9][0-9]')
    num_commas = re.findall(clean_commas, text)

    if len(num_commas) > 0:
        for match in num_commas:
            replace_str = match.replace(',', '')
            text = re.sub(f"{match}", f"{replace_str}", text)
    
    # Handle numbers with hyphens
    clean_numbers = re.compile('[0-9]+-')
    numbers = re.findall(clean_numbers, text)

    if len(numbers) > 0:
        for match in numbers:
            text = re.sub(f"{match}", "# ", text)
    
    # Handle 1st, 2nd, 3rd with hyphens
    text = text.replace('1st-', '1st ').replace('2nd-', '2nd ').replace('3rd-', '3rd ')
    
    # Handle -th with hyphens
    clean_places = re.compile('[0-9]th-')
    places_hyphen = re.findall(clean_places, text)

    if len(places_hyphen) > 0:
        for match in places_hyphen:
            text = re.sub(f"{match}", f"{match[:-1]} ", text)
    
    return text

In [16]:
def handle_parentheses(text):
    
    # Add spaces before and after parentheses
    text = text.replace(')', ')\n').replace('(', ' (')
    
    # Find all parenthetical phrases
    clean_parentheses = re.compile('\(.*\)')
    parentheses = re.findall(clean_parentheses, text)
    
    if len(parentheses) > 0:
        for match in parentheses:
            try:
                text = re.sub(f'{match}', ' ', text)
            except:
                text = re.sub('(Applause.)', '. ', text)
                text = re.sub('(Laughter.)', '. ', text)
                text = re.sub('(Music.)', '. ', text)
                text = text.replace('(Applause.', '. ')
                text = text.replace('(Laughter.', '. ')
                text = text.replace('(Music.', '. ')
                text = re.sub('\(|\)', '. ', text)
    
    text = text.replace('( )', '')
    
    return ' '.join(text.split())


invalid escape sequence \(


invalid escape sequence \(


invalid escape sequence \(


invalid escape sequence \(


invalid escape sequence \(


invalid escape sequence \(


invalid escape sequence \(


invalid escape sequence \(



## Stop Words & Punctuation

In [17]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Add some more stopwords
stop_words = list(stop_words)
stop_words.extend(['yeah', 'ya', 'ah', 'um', 'oh', 'actually', 'literally', 'like', 's', 'applause'])

## Tokenization Function

In [18]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

In [19]:
def spacy_tokenizer(text):
    
    # Remove .., ..., ....
    no_ellipses = text.replace('....', '').replace('...', '').replace('..','').replace('…', '')
    
    # Remove parenthetical phrases
    no_parentheses = handle_parentheses(no_ellipses)
    
    # Add missing spaces after punctuation
    with_spaces = add_spaces(no_parentheses)
    
    # Handle numbers with commas
    clean_numbers = handle_numbers(with_spaces)
    
    # Remove quotation marks
    no_quotes = clean_numbers.replace('\"', ' ').replace('”', ' ').replace('’', '')
    
    # Address hyphenation issue -- need to revisit
    no_ism = no_quotes.replace('-ism', 'ism')
    no_dash = no_ism.replace('–', ' ').replace('—', ' ').replace('-', '')
    
    # Remove music notes
    no_notes = no_dash.replace('♪', '').replace('♫', '')
    
    # SPECIFIC RULE
    no_spec = no_notes.replace('R and D', 'research and development').replace('R & D', 'research and development')
    
    # Replace all whitespace with one space
    cleantext = ' '.join(no_spec.split())
    cleantext = cleantext.strip()
    
    # Creating our token object, which is used to create documents with linguistic annotations.
    # we disabled the parser and ner parts of the pipeline in order to speed up parsing
    mytokens = nlp(cleantext.lower(), disable=['parser', 'ner'])
    
    # Lemmatizing each token and converting each token into lowercase
    lemmas = []
    for word in mytokens:
        if word.pos_ == 'NOUN':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.NOUN))
        elif word.pos_ == 'VERB':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.VERB))
        elif word.pos_ == 'ADV':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.ADV))
        elif word.pos_ == 'ADJ':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.ADJ))

    lemmas = [word for word in lemmas if word not in stop_words and word not in punctuations]
    
    # return preprocessed list of tokens
    return lemmas

# Create Tokens For Training Set

In [20]:
X_train_lemmas = X_train.apply(spacy_tokenizer)

In [19]:
X_train_lemmas

2987    [bug, lover, childhood, way, late, bachelor, m...
2285    [ancient, great, idea, universe, simple, mind,...
1802    [let, south, south, direction, way, kilometer,...
2163    [cool, thing, game, throne, inhabitant, real, ...
1583    [tell, parent, gay, thing, bring, mind, sexual...
1816    [expose, tear, gas, tear, gas, sorry, hear, kn...
2354    [good, sex, fade, couple, continue, love, good...
1361    [funny, thing, forget, mother, day, tell, stor...
576     [childhood, memory, stand, rest, time, brave, ...
3178    [writer, journalist, insanely, curious, person...
2417    [today, talk, sketch, thing, electrical, engin...
1475    [talk, today, story, condition, believe, possi...
3510    [thing, hold, want, start, today, talk, struct...
528     [way, tell, story, naturally, change, define, ...
176     [climate, change, hear, nearly, psychological,...
3635    [excited, today, stuff, ready, come, lab, glad...
422     [believe, world, good, place, try, design, wor...
680     [socia

In [44]:
X_train_gensim = list(X_train_lemmas)

In [47]:
X_train_gensim[0]

['bug',
 'lover',
 'childhood',
 'way',
 'late',
 'bachelor',
 'major',
 'zoology',
 'kind',
 'fell',
 'love',
 'bug',
 'zoology',
 'course',
 'discipline',
 'entomology',
 'science',
 'insect',
 'think',
 'practical',
 'help',
 'science',
 'entomology',
 'world',
 'plant',
 'protection',
 'plant',
 'protection',
 'insect',
 'bad',
 'bug',
 'plant',
 'protection',
 'come',
 'discipline',
 'biological',
 'pest',
 'control',
 'define',
 'use',
 'live',
 'organism',
 'reduce',
 'population',
 'noxious',
 'plant',
 'pest',
 'discipline',
 'plant',
 'protection',
 'aim',
 'reduction',
 'chemical',
 'biological',
 'pest',
 'control',
 'way',
 'good',
 'bug',
 'talk',
 'exist',
 'world',
 'thousand',
 'thousand',
 'year',
 'long',
 'long',
 'time',
 'year',
 'people',
 'start',
 'people',
 'know',
 'exploit',
 'use',
 'biological',
 'control',
 'phenomenon',
 'fact',
 'natural',
 'control',
 'phenomenon',
 'need',
 'biological',
 'control',
 'phenomenon',
 'backyard',
 'magnify',
 'glass',
 '

# Gensim

In [333]:
from gensim.models import CoherenceModel, LdaModel, HdpModel
from gensim.corpora import Dictionary

In [334]:
dictionary = Dictionary(X_train_gensim)
dictionary.filter_extremes(no_below=5, no_above=0.7)
corpus = [dictionary.doc2bow(text) for text in X_train_gensim]

In [337]:
test_gensim_lda = LdaModel(corpus=corpus, id2word=dictionary, iterations=25, num_topics=15)

In [338]:
test_gensim_cm = CoherenceModel(model=test_gensim_lda, texts=X_train_gensim, dictionary=dictionary, coherence='c_v')

In [339]:
test_gensim_cm.get_coherence()

0.25898325206422596

# SKLearn

## Create String of Tokens

In [21]:
def get_new_transcript(lemmas):
    transcript = ''
    for lemma in lemmas:
        transcript += lemma.strip() + ' '
    return transcript.strip()

In [22]:
X_train_tok_trans = X_train_lemmas.apply(get_new_transcript)

In [20]:
X_train_tok_trans

2987    bug lover childhood way late bachelor major zo...
2285    ancient great idea universe simple mind need e...
1802    let south south direction way kilometer room c...
2163    cool thing game throne inhabitant real languag...
1583    tell parent gay thing bring mind sexual orient...
1816    expose tear gas tear gas sorry hear know toxic...
2354    good sex fade couple continue love good intima...
1361    funny thing forget mother day tell story compl...
576     childhood memory stand rest time brave parent ...
3178    writer journalist insanely curious person year...
2417    today talk sketch thing electrical engineer me...
1475    talk today story condition believe possible st...
3510    thing hold want start today talk structure pol...
528     way tell story naturally change define rule tr...
176     climate change hear nearly psychological impac...
3635    excited today stuff ready come lab glad guy pe...
422     believe world good place try design world star...
680     social

# Vectorize Tokens

## CountVectorizer

In [23]:
cv = CountVectorizer(max_df = 0.7, min_df = 4)

In [24]:
dtm_cv_train = cv.fit_transform(X_train_tok_trans)

In [18]:
len(cv.get_feature_names())

15146

## TfidfVectorizer

In [30]:
tfidf = TfidfVectorizer(max_features = 1000, max_df = 0.6, min_df = 4)

In [31]:
dtm_tfidf_train = tfidf.fit_transform(X_train_tok_trans)

# Latent Semantic Analysis (Truncated SVD)

In [24]:
# Returns dataframe of top n words
def topic_top_words(model, feature_names, n):
    topic_word_dict = {}
    
    for topic_idx, topic in enumerate(model.components_):
        
        topic_word_dict['Topic ' + str(topic_idx + 1)] = [feature_names[i] for i in topic.argsort()[:-n - 1:-1]]
    
    return pd.DataFrame.from_dict(topic_word_dict, orient='index', columns = range(1, n + 1))

## CountVectorizer

In [33]:
lsa_cv = TruncatedSVD(n_components = 20, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [34]:
lsa_cv.fit(dtm_cv_train)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=3, random_state=42,
             tol=0.0)

In [340]:
topic_top_words(lsa_cv, cv.get_feature_names(), 20)

NameError: name 'lsa_cv' is not defined

## TfidfVectorizer

In [36]:
lsa_tfidf = TruncatedSVD(n_components = 20, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [37]:
lsa_tfidf.fit(dtm_tfidf_train)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=3, random_state=42,
             tol=0.0)

# LDA (n = 25)

In [25]:
lda_cv_25 = LatentDirichletAllocation(n_components=25,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      )

In [26]:
lda_cv_25.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=25, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [40]:
lda_cv_25.score(dtm_cv_train)

-12601995.233764943

In [41]:
lda_cv_25.perplexity(dtm_cv_train)

2716.9177544742574

In [341]:
topic_top_words(lda_cv_25, cv.get_feature_names(), 10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Topic 0,brain,human,tell,kind,try,talk,life,question,different,let
Topic 1,light,energy,earth,planet,space,universe,car,big,life,small
Topic 2,cell,brain,disease,animal,body,human,patient,system,specie,new
Topic 3,political,democracy,vote,election,government,politics,medium,citizen,ant,party
Topic 4,country,percent,change,company,problem,new,money,today,happen,lot
Topic 5,day,story,kid,little,tell,love,life,try,school,great
Topic 6,data,technology,computer,robot,machine,information,internet,build,video,phone
Topic 7,dandy,emperor,community,change,problem,life,city,job,government,day
Topic 8,algorithm,score,programmer,rectangle,data,prevalence,math,match,fitness,area
Topic 9,language,word,write,book,speak,letter,learn,sentence,english,speaker


# Latent Dirichlet Allocation (n = 20)

In [27]:
lda_cv = LatentDirichletAllocation(n_components=20,           # Number of topics
                                   max_iter=10,               # Max learning iterations
                                   learning_method='online',   
                                   random_state=42,          # Random state
                                   batch_size=100,            # n docs in each learning iter
                                   evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                   n_jobs = -1,               # Use all available CPUs
                                   )

In [28]:
lda_cv.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [343]:
topic_top_words(lda_cv, cv.get_feature_names(), 10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Topic 0,life,tell,brain,talk,feel,human,mean,happen,try,ask
Topic 1,light,energy,earth,planet,universe,space,water,life,big,carbon
Topic 2,brain,human,technology,computer,system,robot,new,different,data,kind
Topic 3,political,government,medium,democracy,vote,power,election,country,politics,citizen
Topic 4,country,percent,company,change,problem,new,lot,money,today,happen
Topic 5,little,day,story,tell,love,kind,try,life,great,talk
Topic 6,internet,web,online,privacy,site,email,data,website,ork,surveillance
Topic 7,interviewer,edi,emperor,dandy,illusionist,orchard,chant,wilderness,baking,bce
Topic 8,patient,health,cancer,disease,doctor,care,medical,treatment,hospital,medicine
Topic 9,school,student,language,learn,kid,word,teacher,teach,book,education


In [146]:
# Helper functions for dtm style
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)


In [236]:
def print_dtm(topic_model, dtm):
    # Create Document - Topic Matrix
    output = topic_model.transform(dtm)

    # column names
    topicnames = ["Topic" + str(i) for i in range(topic_model.n_components)]

    # index names
    docnames = ["Doc" + str(i) for i in range(dtm.shape[0])]

    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(output, columns=topicnames, index=docnames)

    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)

    # Get next dominant topic for each document
    sorted_topics = np.argsort(df_document_topic.values, axis = 1)
    secondary_topic = [doc[-2] for doc in sorted_topics]

    # Get third dominant topic for each document
    tertiary_topic = [doc[-3] for doc in sorted_topics]

    df_document_topic['dominant_topic'] = dominant_topic
    df_document_topic['secondary_topic'] = secondary_topic
    df_document_topic['tertiary_topic'] = tertiary_topic
    
    return df_document_topic

In [237]:
lda_cv_table = print_dtm(lda_cv, dtm_cv_train)

In [239]:
lda_cv_table.head(100).style.applymap(color_green).applymap(make_bold) # apply style

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic,secondary_topic,tertiary_topic
Doc0,0.228186,0.0814962,0.385412,6.94444e-05,6.94444e-05,6.94444e-05,0.00590307,6.94444e-05,6.94444e-05,6.94444e-05,6.94444e-05,0.163519,6.94444e-05,6.94444e-05,0.0177241,6.94444e-05,6.94444e-05,0.0616797,0.0552463,6.94444e-05,2,0,11
Doc1,0.000149254,0.997164,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,0.000149254,1,19,0
Doc2,0.0189717,0.345509,0.0281151,9.80392e-05,0.234154,0.23682,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,9.80392e-05,0.131571,9.80392e-05,0.00358472,1,5,4
Doc3,0.0252674,0.0152286,0.000247525,0.000247525,0.000247525,0.155415,0.000247525,0.000247525,0.000247525,0.785081,0.000247525,0.000247525,0.000247525,0.000247525,0.000247525,0.000247525,0.000247525,0.000247525,0.000247525,0.0152957,9,5,0
Doc4,9.4162e-05,9.4162e-05,9.4162e-05,0.0388515,0.225465,0.228777,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,9.4162e-05,0.5054,9.4162e-05,9.4162e-05,17,5,4
Doc5,0.0731958,9.2081e-05,0.0918831,0.461307,0.174571,0.130472,9.2081e-05,9.2081e-05,9.2081e-05,9.2081e-05,9.2081e-05,9.2081e-05,9.2081e-05,9.2081e-05,0.0286004,9.2081e-05,9.2081e-05,0.0387735,9.2081e-05,9.2081e-05,3,4,5
Doc6,0.539395,6.11995e-05,6.11995e-05,6.11995e-05,0.00849211,0.128325,0.0016121,6.11995e-05,6.11995e-05,0.00929402,6.11995e-05,6.11995e-05,6.11995e-05,6.11995e-05,0.0665479,6.11995e-05,6.11995e-05,0.244175,0.00142436,6.11995e-05,0,17,5
Doc7,0.14196,0.000219298,0.000219298,0.000219298,0.197489,0.521944,0.0783997,0.000219298,0.000219298,0.000219298,0.000219298,0.000219298,0.000219298,0.0364315,0.000219298,0.000219298,0.000219298,0.000219298,0.000219298,0.0207054,5,4,0
Doc8,9.38086e-05,9.38086e-05,0.226391,0.0218058,0.43642,0.158382,9.38086e-05,9.38086e-05,9.38086e-05,9.38086e-05,9.38086e-05,9.38086e-05,9.38086e-05,9.38086e-05,0.147462,9.38086e-05,9.38086e-05,0.0082257,9.38086e-05,9.38086e-05,4,2,5
Doc9,0.535676,0.0589875,0.000144928,0.024265,0.0733187,0.117105,0.000144928,0.000144928,0.000144928,0.000144928,0.000144928,0.000144928,0.000144928,0.000144928,0.0417063,0.000144928,0.000144928,0.147058,0.000144928,0.000144928,0,17,5


In [168]:
unique, counts = np.unique(dominant_topic, return_counts=True)
print(np.asarray((unique, counts)).T)

[[  0 561]
 [  1 186]
 [  2 475]
 [  3  16]
 [  4 684]
 [  5 562]
 [  6   1]
 [  9  41]
 [ 11  27]
 [ 13   7]
 [ 14 122]
 [ 17 216]
 [ 18   2]
 [ 19  16]]


In [169]:
unique, counts = np.unique(secondary_topic, return_counts=True)
print(np.asarray((unique, counts)).T)

[[  1 199]
 [  2 598]
 [  3  22]
 [  4 785]
 [  5 686]
 [  6   2]
 [  8   8]
 [  9  65]
 [ 11  36]
 [ 13  19]
 [ 14 136]
 [ 17 327]
 [ 18   4]
 [ 19  29]]


# LDA (n = 17)

In [29]:
lda_cv_17 = LatentDirichletAllocation(n_components=17,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      )

In [30]:
lda_cv_17.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=17, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [136]:
topic_top_words(lda_cv_17, cv.get_feature_names(), 15)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Topic 0,life,tell,talk,brain,try,human,kind,mean,happen,question,feel,lot,let,ask,different
Topic 1,light,energy,earth,planet,space,car,big,universe,water,little,kind,life,mean,lot,small
Topic 2,cell,brain,human,robot,body,system,technology,new,different,animal,able,little,disease,machine,specie
Topic 3,political,medium,democracy,vote,government,election,politics,power,news,party,journalist,citizen,internet,twitter,social
Topic 4,country,percent,change,today,talk,problem,happen,child,community,new,let,lot,mean,help,system
Topic 5,day,life,story,tell,love,little,feel,talk,child,live,family,friend,walk,ask,man
Topic 6,woman,men,sex,female,male,girl,sexual,body,partner,gay,violence,talk,victim,desire,young
Topic 7,edi,emperor,socket,tk,amputate,prosthesis,com,dandy,amputee,leone,orchard,unpredictability,sierra,gyrus,fusiform
Topic 8,company,data,business,money,computer,dollar,technology,internet,information,product,market,pay,create,innovation,value
Topic 9,school,student,kid,language,learn,game,word,book,teacher,teach,education,read,class,child,write


# LDA (n = 16)

In [31]:
lda_cv_16 = LatentDirichletAllocation(n_components=16,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      )

In [32]:
lda_cv_16.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=16, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [139]:
topic_top_words(lda_cv_16, cv.get_feature_names(), 15)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
Topic 0,brain,human,tell,life,try,kind,different,happen,let,talk,question,lot,feel,mean,little
Topic 1,light,energy,earth,planet,universe,space,big,life,car,star,carbon,mean,climate,kind,little
Topic 2,cell,human,water,system,body,technology,new,animal,data,little,able,different,specie,ocean,life
Topic 3,political,government,democracy,medium,vote,election,politics,information,internet,citizen,party,journalist,power,news,twitter
Topic 4,country,percent,change,company,problem,happen,new,today,talk,lot,money,let,mean,system,big
Topic 5,life,woman,day,tell,story,love,child,kid,talk,feel,family,school,live,ask,little
Topic 6,sex,woman,female,male,men,sexual,body,egg,partner,smell,desire,hormone,beetle,fusion,sperm
Topic 7,emperor,edi,wilderness,cervantes,dandy,drowning,lance,landmine,orchard,government,unpredictability,kit,bce,lofty,job
Topic 8,patient,cancer,health,disease,doctor,treatment,medicine,medical,care,tumor,patent,breast,woman,cure,physician
Topic 9,language,word,book,game,write,read,computer,learn,student,teacher,letter,page,example,teach,new


# LDA (n = 15)

In [33]:
lda_cv_15 = LatentDirichletAllocation(n_components=15,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      )

In [34]:
lda_cv_15.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=15, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [250]:
import pickle

In [281]:
# Save to lda model
pkl_filename = "lda_15.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(lda_cv_15, file)
    
# Save vectorizer
cv_filename = 'cv.pkl'
with open(cv_filename, 'wb') as file:
    pickle.dump(cv, file)

# Format Results of Document-Topic Matrix from LDA

In [272]:
lda_cv_15_table = print_dtm(lda_cv_15, dtm_cv_train)

In [242]:
lda_cv_15_table.head(100).style.applymap(color_green).applymap(make_bold)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,dominant_topic,secondary_topic,tertiary_topic
Doc0,0.232992,0.0691214,0.44905,9.25929e-05,0.0237104,9.25929e-05,0.020657,9.25926e-05,0.00159042,9.25929e-05,9.25926e-05,0.15121,0.0137167,9.25932e-05,0.037396,2,0,11
Doc1,0.000199005,0.99031,0.00710259,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,0.000199005,1,2,0
Doc2,0.0319699,0.308662,0.000130719,0.000130719,0.273386,0.276192,0.000130719,0.000130719,0.000130719,0.000130719,0.000130719,0.000130719,0.0897555,0.000130719,0.0188582,1,5,4
Doc3,0.114946,0.000330034,0.000330034,0.000330034,0.000330034,0.114692,0.000330034,0.000330033,0.000330033,0.766402,0.000330033,0.000330034,0.000330033,0.000330033,0.000330034,9,0,5
Doc4,0.0370656,0.00012555,0.000125549,0.140017,0.343803,0.464657,0.000125549,0.000125549,0.000125549,0.000125549,0.000125549,0.000125549,0.0132019,0.000125549,0.000125549,5,4,3
Doc5,0.0352352,0.000122775,0.102792,0.520933,0.165625,0.152066,0.000122775,0.000122775,0.000122775,0.000122775,0.000122775,0.000122775,0.000122775,0.000122775,0.0222429,3,4,5
Doc6,0.572122,0.00687903,8.15995e-05,8.15996e-05,0.0562653,0.244913,0.00636477,8.15993e-05,8.15994e-05,0.0123951,8.15994e-05,8.15994e-05,0.0295505,8.15995e-05,0.070939,0,5,14
Doc7,0.000292399,0.000292398,0.000292398,0.000292399,0.243331,0.557705,0.121288,0.000292398,0.000292398,0.000292398,0.000292398,0.000292398,0.000292398,0.07446,0.000292398,5,4,6
Doc8,0.000125078,0.0511372,0.142892,0.014686,0.47389,0.125654,0.0515937,0.000125078,0.000125078,0.000125079,0.000125078,0.000125078,0.000125078,0.000125078,0.139146,4,2,14
Doc9,0.423786,0.079312,0.000193237,0.0214904,0.194217,0.240919,0.000193237,0.000193237,0.000193237,0.0132403,0.000193237,0.000193238,0.00535079,0.000193237,0.020332,0,5,4


In [153]:
X_train_df_lda_15 = pd.DataFrame(X_train)

In [158]:
len(X_train_df_lda_15)

2916

In [274]:
lda_cv_15_table.index = X_train_df_lda_15.index

In [273]:
lda_cv_15_table.columns = ['00_general', '01_science', '02_technology', '03_politics', '04_problems', 
                           '05_interpersonal', '06_AI', '07_miscellaneous', '08_healthcare', '09_linguistics/humanities', 
                           '10_space', '11_agriculture/nature', '12_gender/sexuality', '13_audio/visual', '14_urban planning/design', 
                           'dominant_topic', 'secondary_topic', 'tertiary_topic']

In [275]:
X_train_df_lda_top = pd.concat([X_train_df_lda_15, lda_cv_15_table], axis = 1)

In [276]:
X_train_df_lda_top.head()

Unnamed: 0,transcript,00_general,01_science,02_technology,03_politics,04_problems,05_interpersonal,06_AI,07_miscellaneous,08_healthcare,09_linguistics/humanities,10_space,11_agriculture/nature,12_gender/sexuality,13_audio/visual,14_urban planning/design,dominant_topic,secondary_topic,tertiary_topic
2987,"I'm a bug lover, myself — not from childhood, ...",0.232992,0.069121,0.44905,9.3e-05,0.02371,9.3e-05,0.020657,9.3e-05,0.00159,9.3e-05,9.3e-05,0.15121,0.013717,9.3e-05,0.037396,2,0,11
2285,The ancient Greeks had a great idea: The unive...,0.000199,0.99031,0.007103,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,1,2,0
1802,Let's go south. All of you are actually going ...,0.03197,0.308662,0.000131,0.000131,0.273386,0.276192,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.089755,0.000131,0.018858,1,5,4
2163,"To many, one of the coolest things about ""Game...",0.114946,0.00033,0.00033,0.00033,0.00033,0.114692,0.00033,0.00033,0.00033,0.766402,0.00033,0.00033,0.00033,0.00033,0.00033,9,0,5
1583,"Jenni Chang: When I told my parents I was gay,...",0.037066,0.000126,0.000126,0.140017,0.343803,0.464657,0.000126,0.000126,0.000126,0.000126,0.000126,0.000126,0.013202,0.000126,0.000126,5,4,3


In [277]:
X_train_df_lda_top.to_csv('final_lda.csv')

In [308]:
topics_df = pd.concat([X_train_df_lda_top, talk_df[['speaker', 'title', 'summ', 'comments', 'views', 'date_recorded', 'upload_date', 'duration', 'tag_len', 'occupation', 'bio']]], axis=1, join='inner')

In [322]:
topics_df = pd.concat([X_train_df_lda_top, talk_event], axis = 1, join = 'inner')

In [323]:
topics_df.head()

Unnamed: 0,transcript,00_general,01_science,02_technology,03_politics,04_problems,05_interpersonal,06_AI,07_miscellaneous,08_healthcare,09_linguistics/humanities,10_space,11_agriculture/nature,12_gender/sexuality,13_audio/visual,14_urban planning/design,dominant_topic,secondary_topic,tertiary_topic,event
2987,"I'm a bug lover, myself — not from childhood, ...",0.232992,0.069121,0.44905,9.3e-05,0.02371,9.3e-05,0.020657,9.3e-05,0.00159,9.3e-05,9.3e-05,0.15121,0.013717,9.3e-05,0.037396,2,0,11,TED2012
2285,The ancient Greeks had a great idea: The unive...,0.000199,0.99031,0.007103,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,0.000199,1,2,0,TED@Unilever
1802,Let's go south. All of you are actually going ...,0.03197,0.308662,0.000131,0.000131,0.273386,0.276192,0.000131,0.000131,0.000131,0.000131,0.000131,0.000131,0.089755,0.000131,0.018858,1,5,4,TEDxBeaconStreet
2163,"To many, one of the coolest things about ""Game...",0.114946,0.00033,0.00033,0.00033,0.00033,0.114692,0.00033,0.00033,0.00033,0.766402,0.00033,0.00033,0.00033,0.00033,0.00033,9,0,5,TED2014
1583,"Jenni Chang: When I told my parents I was gay,...",0.037066,0.000126,0.000126,0.140017,0.343803,0.464657,0.000126,0.000126,0.000126,0.000126,0.000126,0.000126,0.013202,0.000126,0.000126,5,4,3,TED-Ed


In [324]:
topics_df.to_csv('train_data_with_topics.csv')

In [303]:
topic_top_words(lda_cv_15, cv.get_feature_names(), 50).to_csv('lda_15_topics_words.csv')

In [304]:
topic_top_words(lda_cv_15, cv.get_feature_names(), 50)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
Topic 0,life,tell,brain,human,talk,feel,experience,study,ask,person,...,love,answer,reason,attention,point,hand,friend,important,heart,second
Topic 1,light,energy,earth,planet,space,universe,big,water,car,life,...,problem,mile,new,fact,bit,matter,place,point,ocean,particle
Topic 2,cell,brain,human,technology,system,different,new,data,little,body,...,area,bit,dna,ocean,simple,mean,blood,form,structure,type
Topic 3,political,government,power,medium,war,democracy,country,vote,internet,politics,...,change,view,violence,fact,religious,terrorist,nation,force,revolution,military
Topic 4,country,percent,change,company,problem,new,talk,today,lot,happen,...,build,pay,future,value,technology,example,try,high,kind,society
Topic 5,day,tell,story,life,love,little,kid,talk,feel,try,...,family,book,help,realize,hear,stand,remember,turn,watch,experience
Topic 6,robot,machine,computer,game,build,intelligence,leg,control,ant,car,...,goal,send,wheel,password,autonomous,fun,user,privacy,force,fall
Topic 7,socket,tk,amputate,prosthesis,amputee,dandy,leone,sierra,blister,emperor,...,new,sector,happen,decade,young,lot,economic,today,country,state
Topic 8,cancer,patient,health,disease,doctor,medicine,medical,tumor,treatment,surgery,...,metastasis,ovarian,cause,bmi,pollinator,survival,flop,outcome,internship,procedure
Topic 9,language,word,book,write,read,letter,learn,speak,example,different,...,culture,phrase,spelling,writer,spell,common,history,arabic,kind,form


## Test on testing set

In [264]:
X_test_lemmas = X_test.apply(spacy_tokenizer) # Get test sets lemmas

In [265]:
X_test_tok_trans = X_test_lemmas.apply(get_new_transcript) # Get test sets new transcript

In [266]:
dtm_cv_test = cv.transform(X_test_tok_trans) # Get vectorized data

In [283]:
lda_cv_15_table_test = print_dtm(lda_cv_15, dtm_cv_test) # Apply lda and get results

In [285]:
lda_cv_15_table_test.columns = lda_cv_15_table.columns

In [327]:
lda_cv_15_table_test.index = X_test.index
X_test_df_lda_top = pd.concat([X_test, lda_cv_15_table_test], axis = 1, join = 'inner')

In [328]:
topics_test_df = pd.concat([X_test_df_lda_top, talk_df[['speaker', 'title', 'summ', 'comments', 'views', 'date_recorded', 'upload_date', 'duration', 'tags', 'tag_len', 'occupation', 'bio']]], axis=1, join='inner')

In [329]:
topics_test_df = pd.concat([topics_test_df, talk_event], axis = 1, join = 'inner')

In [350]:
topics_test_df

Unnamed: 0,transcript,00_general,01_science,02_technology,03_politics,04_problems,05_interpersonal,06_AI,07_miscellaneous,08_healthcare,...,comments,views,date_recorded,upload_date,duration,tags,tag_len,occupation,bio,event
415,My subject is the future of life on Earth. Thi...,0.142867,0.358765,0.375429,0.000081,0.081337,0.000081,0.000081,0.000081,0.000081,...,,,2017-04-22,2019-02-15,1010,"['science', 'future', 'history', 'paleontology...",5,Paleontologist - Head of the School of Biologi...,Michael Benton is a paleontologist who has mad...,TEDWomen 2018
2930,I'm going to have a pretty simple idea that I'...,0.016962,0.059396,0.095658,0.000142,0.179244,0.361579,0.087735,0.000142,0.000142,...,99.0,823506.0,2011-01-09,2011-02-01,707,"['cities', 'collaboration', 'computers', 'curi...",11,Tech publisher,"A technology and publishing enthusiast, Dale D...",TEDSalon London Spring 2012
3197,I have a very difficult task. I'm a spectrosco...,0.000097,0.809492,0.156424,0.002677,0.020076,0.000097,0.000097,0.000097,0.000097,...,71.0,658151.0,2009-07-23,2009-10-01,952,"['astronomy', 'chemistry', 'life', 'science', ...",8,Astrophysicist,Garik Israelian's stargazing on the Canary Isl...,TEDGlobal 2011
298,Black holes are among the most destructive obj...,0.026053,0.934478,0.028500,0.000214,0.000214,0.000214,0.000214,0.000214,0.000214,...,,841088.0,2019-05-16,2019-05-16,289,"['animation', 'TED-Ed', 'space', 'universe', '...",7,,,TED Salon: Belonging
1875,I have the feeling that we can all agree that ...,0.042291,0.000123,0.013015,0.294276,0.503375,0.079493,0.000123,0.000123,0.000123,...,134.0,1300909.0,2014-10-07,2014-10-08,804,"['government', 'open-source', 'politics', 'tec...",4,Democracy activist,Using software to inspire public debate and en...,Mission Blue II
2730,"Ladies and gentlemen, gather around. I would l...",0.058451,0.000498,0.109763,0.074474,0.000498,0.667681,0.000498,0.000498,0.000498,...,132.0,1342613.0,2011-05-24,2011-11-23,231,"['creativity', 'design', 'entertainment', 'sto...",5,Storyteller,Joe Sabia investigates new ways to tell storie...,TEDxRiodelaPlata
32,Corn currently accounts for more than one tent...,0.032942,0.080558,0.150901,0.010845,0.265777,0.000216,0.000216,0.000216,0.000216,...,,322551.0,2019-11-26,2019-12-02,300,"['animation', 'education', 'TED-Ed', 'agricult...",12,,,TED-Ed
2866,"Mark Zuckerberg, a journalist was asking him a...",0.010824,0.000184,0.336912,0.175805,0.171785,0.288408,0.000184,0.000184,0.000184,...,518.0,5262389.0,2011-03-03,2011-05-02,544,"['culture', 'global issues', 'journalism', 'po...",5,Online democracy advocate,"As a cofounder of Upworthy, and the author of ...",TEDxCambridge
2632,"You know, I had a real rough time in school wi...",0.005304,0.487911,0.265689,0.000144,0.000144,0.164252,0.005989,0.000144,0.000144,...,62.0,679513.0,2012-03-12,2012-03-13,507,"['TED-Ed', 'biology', 'deextinction', 'explora...",10,Oceanographer,"A pioneer in ocean exploration, David Gallo is...",TED-Ed
2900,There's a beautiful statement on the screen th...,0.044135,0.368093,0.145710,0.000085,0.021430,0.206059,0.000085,0.000085,0.000085,...,130.0,629271.0,2010-10-10,2011-03-17,1011,"['TEDx', 'architecture', 'design']",3,Lighting designer,Rogier van der Heide creates architectural spa...,TED-Ed


In [331]:
topics_test_df.to_csv('test_data_with_topics.csv')

# LDA (n = 13)

In [35]:
lda_cv_13 = LatentDirichletAllocation(n_components=13,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      )

In [36]:
lda_cv_13.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=13, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [39]:
topic_top_words(lda_cv_13, cv.get_feature_names(), 10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Topic 0,brain,life,patient,disease,body,health,human,study,tell,cancer
Topic 1,water,city,energy,space,light,earth,planet,building,big,place
Topic 2,technology,cell,human,computer,different,kind,new,data,system,little
Topic 3,political,government,power,medium,democracy,vote,war,internet,law,country
Topic 4,country,percent,change,company,problem,child,talk,new,community,today
Topic 5,tell,life,story,day,love,little,talk,feel,try,kind
Topic 6,robot,leg,body,arm,robotics,walk,robotic,control,drone,autonomous
Topic 7,rope,anatomy,oyster,emperor,illustration,bce,roman,gladiator,infest,inflated
Topic 8,patent,algorithm,bee,gold,sloth,breast,failure,medicine,pharmaceutical,company
Topic 9,language,word,book,write,read,learn,speak,letter,example,text


# LDA (n = 10)

In [37]:
lda_cv_10 = LatentDirichletAllocation(n_components=10,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      )

In [38]:
lda_cv_10.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [56]:
lda_cv_10_table, dominant_topic_10, secondary_topic_10 = print_dtm(lda_cv_10, dtm_cv_train)

In [40]:
topic_top_words(lda_cv_10, cv.get_feature_names(), 10)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Topic 0,brain,patient,body,disease,life,health,human,cancer,study,tell
Topic 1,energy,city,space,light,earth,building,planet,build,big,universe
Topic 2,technology,kind,different,human,computer,new,system,data,little,cell
Topic 3,political,government,medium,democracy,internet,vote,information,election,power,politics
Topic 4,country,woman,child,change,talk,community,family,school,live,life
Topic 5,tell,life,day,love,story,little,feel,talk,try,kid
Topic 6,water,food,animal,specie,ocean,plant,fish,tree,eat,live
Topic 7,ork,orld,kit,ore,ory,orking,ortant,oyster,rape,com
Topic 8,company,percent,money,dollar,business,problem,number,lot,market,let
Topic 9,language,music,sound,word,student,play,learn,voice,teacher,hear


# LDA Vis
## n = 25

In [None]:
pyLDAvis.enable_notebook()
panel_lda_25 = pyLDAvis.sklearn.prepare(lda_cv, dtm_cv_train, cv, mds='tsne')
panel_lda_25

## n = 20

In [69]:
pyLDAvis.enable_notebook()
panel_lda_20 = pyLDAvis.sklearn.prepare(lda_cv, dtm_cv_train, cv, mds='tsne')
panel_lda_20


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [91]:
pyLDAvis.save_html(panel_lda_20, 'tm_20.html')

## n = 17

In [70]:
pyLDAvis.enable_notebook()
panel_lda_17 = pyLDAvis.sklearn.prepare(lda_cv_17, dtm_cv_train, cv, mds='tsne')
panel_lda_17


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [89]:
pyLDAvis.save_html(panel_lda_17, 'tm_17.html')

## n = 16

In [140]:
pyLDAvis.enable_notebook()
panel_lda_16 = pyLDAvis.sklearn.prepare(lda_cv_16, dtm_cv_train, cv, mds='tsne')
panel_lda_16


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [142]:
pyLDAvis.save_html(panel_lda_16, 'tm_16.html')

## n = 15

In [143]:
pyLDAvis.enable_notebook()
panel_lda_15 = pyLDAvis.sklearn.prepare(lda_cv_15, dtm_cv_train, cv, mds='tsne')
panel_lda_15


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [72]:
pyLDAvis.save_html(panel_lda_15, 'tm_15.html')

## n = 13

In [71]:
pyLDAvis.enable_notebook()
panel_lda_13 = pyLDAvis.sklearn.prepare(lda_cv_13, dtm_cv_train, cv, mds='tsne')
panel_lda_13


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [90]:
pyLDAvis.save_html(panel_lda_13, 'tm_13.html')

## n = 10

In [57]:
pyLDAvis.enable_notebook()
panel_lda_10 = pyLDAvis.sklearn.prepare(lda_cv_10, dtm_cv_train, cv, mds='tsne')
panel_lda_10


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





# Prior Vis

In [170]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_cv, dtm_cv_train, cv, mds='tsne')
panel


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [185]:
panel_ = pyLDAvis.sklearn.prepare(lda_cv, dtm_cv_train, cv)
panel_


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





# LDA (n = 15)

In [187]:
lda_cv_15 = LatentDirichletAllocation(n_components=15,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      )

In [188]:
lda_cv_15.fit(dtm_cv_train)

LatentDirichletAllocation(batch_size=100, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=15, n_jobs=-1,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [191]:
display_topics(lda_cv_15, cv.get_feature_names(), 15)

Topic 0:
life tell brain human talk feel experience study ask person happen question mean try child
Topic 1:
light energy earth planet space universe big water car life mean kind little star carbon
Topic 2:
cell brain human technology system different new data computer little body kind able information animal
Topic 3:
political government power medium war democracy country vote internet politics election information law citizen religion
Topic 4:
country percent change company problem new talk today lot happen money let community big child
Topic 5:
day tell story life love little kid talk feel try ask school kind learn child
Topic 6:
robot machine computer build game leg ant intelligence control try car fly problem number design
Topic 7:
socket tk amputate prosthesis amputee dandy leone sierra emperor fusiform gyrus edi blister government problem
Topic 8:
patient cancer health disease doctor medicine medical treatment tumor surgery breast patent bee cure physician
Topic 9:
language word

In [192]:
pyLDAvis.enable_notebook()
panel_15 = pyLDAvis.sklearn.prepare(lda_cv_15, dtm_cv_train, cv, mds='tsne')
panel_15


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





# GridSearch LDA

In [171]:
# Define Search Param
search_params = {'n_components': [15, 20, 25], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation(max_iter = 5, 
                                learning_method = 'online', 
                                random_state = 42, 
                                batch_size = 100, 
                                evaluate_every = -1, 
                                n_jobs = -1)

In [172]:
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(dtm_cv_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=LatentDirichletAllocation(batch_size=100,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='online',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=5,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=-1,
                                                 perp_tol=0.1, random_state=42,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
                                

In [174]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(dtm_cv_train))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 15}
Best Log Likelihood Score:  -2663245.809228584
Model Perplexity:  2729.5166491392133


In [183]:
lda_cv.score(dtm_cv_train)

-12588505.9196547

In [184]:
lda_cv.perplexity(dtm_cv_train)

2694.0187784956825

In [180]:
model.cv_results_

{'mean_fit_time': array([31.29911532, 36.14828281, 41.87749352, 35.78892303, 36.5821672 ,
        44.91563559, 33.91294436, 36.47420406, 42.02768722]),
 'std_fit_time': array([2.64034123, 0.48089545, 2.28763294, 1.90581123, 2.04208753,
        3.15282506, 2.21647108, 1.17721809, 1.10350948]),
 'mean_score_time': array([0.45925689, 0.66473627, 0.52204843, 0.57197323, 0.56140862,
        0.77946038, 0.52758923, 0.67964816, 0.66788373]),
 'std_score_time': array([0.06798872, 0.10029592, 0.02519988, 0.07459057, 0.06073278,
        0.1724118 , 0.03105565, 0.07978363, 0.09849965]),
 'param_learning_decay': masked_array(data=[0.5, 0.5, 0.5, 0.7, 0.7, 0.7, 0.9, 0.9, 0.9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_n_components': masked_array(data=[15, 20, 25, 15, 20, 25, 15, 20, 25],
              mask=[False, False, False, False, False, False, False, False,
                

In [175]:
# Get Log Likelyhoods from Grid Search Output
n_topics = [15, 20, 25]
log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.5]
log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.7]
log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.grid_scores_ if gscore.parameters['learning_decay']==0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

AttributeError: 'GridSearchCV' object has no attribute 'grid_scores_'