# Import Packages

In [37]:
import nltk
from nltk.corpus import wordnet
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import random
import re 
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, train_test_split

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [11]:
talk_df = pd.read_csv('has_transcript_clean.csv', index_col = 0)

In [12]:
talk_df.head()

Unnamed: 0,date,speaker,title,url,length,summ,tags,views,transcript,date_recorded,upload_date,occupation,bio,comments,duration
0,Jan 2020,Ipsita Dasgupta,"To challenge the status quo, find a ""co-conspi...",/talks/ipsita_dasgupta_to_challenge_the_status...,11:03,"In a complex and changing world, how can we ma...","['innovation', 'collaboration', 'society', 'so...",599446.0,So I've been thinking about how to explain thi...,2019-09-24,2020-01-02,"Business executive, ""co-conspirator""",Ipsita Dasgupta drives the consumption of ente...,7.0,663
1,Jan 2020,Rod Phillips,A brief history of alcohol,/talks/rod_phillips_a_brief_history_of_alcohol,4:56,Nobody knows exactly when humans began to crea...,"['TED-Ed', 'education', 'animation', 'history'...",501290.0,This chimpanzee stumbles across a windfall of ...,2020-01-02,2020-01-02,,,,296
2,Jan 2020,Pat Mitchell,Dangerous times call for dangerous women,/talks/pat_mitchell_dangerous_times_call_for_d...,17:14,Pat Mitchell has nothing left to prove and muc...,"['women', 'women in business', 'community', 'a...",461705.0,"Recently, I've been declaring to anyone who wo...",2019-12-04,2020-01-02,Dangerous woman,Pat Mitchell is a lifelong advocate for women ...,14.0,1034
3,Dec 2019,Cara E. Yar Khan,The beautiful balance between courage and fear,/talks/cara_e_yar_khan_the_beautiful_balance_b...,9:55,After being diagnosed with a rare genetic cond...,"['fear', 'personal growth', 'health', 'life', ...",880662.0,"When we're young, we're innocently brave, and ...",2019-12-04,2019-12-23,Human rights and disability activist,Cara E. Yar Khan is an international human rig...,17.0,595
4,Dec 2019,Valorie Kondos Field,Why winning doesn't always equal success,/talks/valorie_kondos_field_why_winning_doesn_...,15:49,Valorie Kondos Field knows a lot about winning...,"['success', 'sports', 'leadership', 'empathy',...",956498.0,"OK, I have a question for all of us. You ready...",2019-12-04,2019-12-20,Gymnastics coach,Valorie Kondos Field is the retired head coach...,18.0,949


# Create Tokenizer

In [13]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def spacy_tokenizer(text):
    
    # Remove audience reactions -- need to revisit (remove )
    no_audience_reacts = text.replace('(Applause)', '').replace('(Laughter)', '').replace('(applause)', '').replace('(laughter)', '')

    # Remove quotation marks
    no_quotes = no_audience_reacts.replace('\"', ' ').replace('”', ' ').replace('’', '')
    
    # Address hyphenation issue -- need to revisit
    no_ism = no_quotes.replace('-ism', 'ism')
    no_dash = no_ism.replace('–', ' ').replace('-', '').replace('—', ' ')
    
    # Remove parentheses
    no_parentheses = no_dash.replace('(', ' ').replace(')', ' ')
    
    # Remove .., ..., ....
    no_ellipses = no_parentheses.replace('....', '').replace('...', '').replace('..','').replace('…', '')
    
    # Remove music notes
    no_notes = no_ellipses.replace('♪', '').replace('♫', '')
    
    # HANDLE NUMBERS!
    
    # If ' ' then any alphanumeric then punctuation then any capitalized word then ' ' (correct for missing space after end of sentence)
    
    # Replace all whitespace with one space
    cleantext = ' '.join(no_notes.split())
    cleantext = cleantext.strip()
    
    # Creating our token object, which is used to create documents with linguistic annotations.
    # we disabled the parser and ner parts of the pipeline in order to speed up parsing
    mytokens = nlp(cleantext.lower(), disable=['parser', 'ner'])

    # Removing words except for nouns, verbs, adjectives, and adverbs
    mytokens = [word for word in mytokens if (word.pos_ == 'NOUN') or (word.pos_ == 'VERB') or (word.pos_ == 'ADJ') or (word.pos_ == 'ADV')]
    
    # Lemmatizing each token and converting each token into lowercase
    mytokens = [word.lemma_.strip() for word in mytokens]

    # Removing stop words
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

    # return preprocessed list of tokens
    return mytokens

In [36]:
lemmatizer = WordNetLemmatizer()

# Test Tokenizer on Sentence-level and Speech-level

In [15]:
sentence_1 = "So we knew the numbers were deplorable, and it was manifesting itself in our own organization.So we decided — actually, I\'ll just give you a quick example of a young engineer, a 25-year-old woman, who told us a great story about her daily life, to just exemplify it for us.People, people, place, person, personally."

In [8]:
lemmatizer.lemmatize(sentence_1)

"So we knew the numbers were deplorable, and it was manifesting itself in our own organization.So we decided — actually, I'll just give you a quick example of a young engineer, a 25-year-old woman, who told us a great story about her daily life, to just exemplify it for us.People, people, place, person, personally."

In [306]:
# remove html tags from all of the text before processing
cleanr = re.compile('[.!?,][A-Z][A-Za-z]+ ')
cleantext = re.search(cleanr, sentence_1)

In [307]:
cleantext.group(0)

'.So '

In [16]:
transcript_1 = talk_df.transcript[0]

In [18]:
tokenized = spacy_tokenizer(sentence_1)

In [21]:
tokenized

['know',
 'number',
 'deplorable',
 'manifest',
 'organization.so',
 'decide',
 'actually',
 'quick',
 'example',
 'young',
 'engineer',
 '25yearold',
 'woman',
 'tell',
 'great',
 'story',
 'daily',
 'life',
 'exemplify',
 'people',
 'place',
 'person',
 'personally']

In [310]:
cv_test = CountVectorizer(tokenizer=spacy_tokenizer, min_df=0, stop_words='english')

In [311]:
sentence_cv = cv_test.fit_transform([sentence_1, transcript_1])

In [312]:
cv_test.get_feature_names()

['20',
 '25yearold',
 '40',
 'able',
 'absorb',
 'accent',
 'accountant',
 'achieve',
 'activity',
 'actually',
 'age',
 'allow',
 'american',
 'analysis',
 'annual',
 'apprehensive',
 'aside',
 'ask',
 'baby',
 'bank',
 'believe',
 'bend',
 'big',
 'bit',
 'boardroom',
 'bomb',
 'brace',
 'break',
 'brilliant',
 'bring',
 'building',
 'bunch',
 'bus',
 'business',
 'buy',
 'buyin',
 'calcutta',
 'care',
 'career',
 'ceo',
 'challenge',
 'champion',
 'change',
 'child',
 'chore',
 'class',
 'close',
 'coconspirator',
 'coconspire',
 'colleague',
 'come',
 'communicate',
 'commute',
 'company',
 'completely',
 'complex',
 'concept',
 'conspire',
 'context',
 'conventional',
 'conversation',
 'corporate',
 'couple',
 'courage',
 'create',
 'crystallize',
 'cup',
 'cure',
 'customer',
 'cute',
 'd',
 'daily',
 'daughtersinlaw',
 'day',
 'decade',
 'decide',
 'deplorable',
 'desert',
 'different',
 'disadvantage',
 'disease',
 'distinct',
 'dry',
 'dynamic',
 'early',
 'earn',
 'easy',
 'e

# Create Vectorizers
### Considerations
* Which vectorizer?
    * CountVectorizer
    * TfidfVectorizer
* Which topic modeling technique?
    * LSA (latent semantic analysis)
    * LDA (latent dirichlet allocation
* How many times can a token appear in the corpus?
    * Start: max_df = 0.9
    * Used:
        * 0.5
* How many tokens to include?
    * Start: all
    * Used:
        * 1000
        * 10000 --> best so far, but way too many as a result (with nouns, verbs, adj, adv)
* How many topics to use?
    * Used:
        * 25
        * 50
* How to evaluate topics


* Issue with lemmatization
* Issue with hyphens
* Consider numbers --> only if n-grams
* Issue with periods
* What is the most common word? for longer speeches?
* Default arguments

In [345]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, max_df=0.50, min_df=3, stop_words='english')
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer, max_features=10000, min_df=3, max_df=0.50)

# Subset for data with views

In [11]:
talk_views = talk_df[talk_df.views.notnull() & talk_df.views > 0]

In [13]:
talk_views.shape

(3599, 15)

In [14]:
talk_views.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3599 entries, 0 to 3648
Data columns (total 15 columns):
date             3599 non-null object
speaker          3599 non-null object
title            3599 non-null object
url              3599 non-null object
length           3599 non-null object
summ             3599 non-null object
tags             3599 non-null object
views            3599 non-null float64
transcript       3599 non-null object
date_recorded    3510 non-null object
upload_date      3510 non-null object
occupation       3181 non-null object
bio              3193 non-null object
comments         2982 non-null float64
duration         3599 non-null int64
dtypes: float64(2), int64(1), object(12)
memory usage: 449.9+ KB


# Train, Test, Split

In [16]:
X = talk_views.transcript
y = talk_views.views

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Preprocess Transcripts

### CountVectorizer

In [99]:
# fit and transform training data
dtm_train_cv = cv.fit_transform(X_train)
dtm_test_cv = cv.transform(X_test)

In [100]:
len(cv.get_feature_names())

18530

In [101]:
dtm_train_cv_df = pd.DataFrame(dtm_train_cv.toarray())
dtm_train_cv_df.columns = cv.get_feature_names()

In [181]:
cv.get_feature_names()

NotFittedError: Vocabulary not fitted or provided

In [186]:
dtm_train_cv_df.columns[8000:8100]

Index(['ideally', 'ideas.and', 'ideas.now', 'ideas.so', 'ideation',
       'identical', 'identically', 'identifiable', 'identification',
       'identifier', 'identify', 'identity', 'identity.now', 'ideo',
       'ideological', 'ideologically', 'ideologue', 'ideology',
       'idiosyncratic', 'idiot', 'idiotic', 'idle', 'idleness', 'idly', 'idol',
       'idyllic', 'ied', 'ifs', 'igloo', 'ignite', 'ignition', 'ignorance',
       'ignorant', 'ignore', 'iguana', 'ikea', 'iliad', 'ill', 'illegal',
       'illegally', 'illegitimate', 'illicit', 'illiteracy', 'illiterate',
       'illness', 'illogical', 'illuminate', 'illuminating', 'illumination',
       'illusion', 'illusionist', 'illusory', 'illustrate', 'illustration',
       'illustrator', 'illustrious', 'image', 'image.so', 'imagery',
       'imaginable', 'imaginary', 'imagination', 'imagination.so',
       'imagination.thank', 'imaginative', 'imaginatively', 'imagine',
       'imagine.thank', 'imagine.this', 'imaging', 'imagining', '

## TfidfVectorizer

In [346]:
# fit and transform training data
dtm_train_tfidf = tfidf.fit_transform(X_train)
dtm_test_tfidf = tfidf.transform(X_test)

In [347]:
len(tfidf.get_feature_names())

10000

In [348]:
dtm_train_tfidf_df = pd.DataFrame(dtm_train_tfidf.toarray())
dtm_train_tfidf_df.columns = tfidf.get_feature_names()

In [349]:
dtm_train_tfidf_df.columns

Index(['10th', '11th', '11yearold', '15th', '15yearold', '1600', '1700',
       '17th', '1800', '1930',
       ...
       'zillion', 'zinc', 'zip', 'zipcar', 'zipper', 'zombie', 'zone',
       'zoning', 'zoo', 'zoom'],
      dtype='object', length=10000)

# Fit TruncatedSVD (LSA)

## CountVectorizer

In [109]:
lsa_cv = TruncatedSVD(n_components = 25, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [110]:
lsa_cv.fit(dtm_train_cv)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=3, random_state=42,
             tol=0.0)

In [111]:
for index,topic in enumerate(lsa_cv.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['kid', 'important', 'love', 'understand', 'man', 'build', 'learn', 'percent', 'brain', 'country', 'child', 'question', 'problem', 'story', 'woman']


THE TOP 15 WORDS FOR TOPIC #1
['friend', 'boy', 'kid', 'community', 'mother', 'young', 'love', 'family', 'school', 'country', 'girl', 'child', 'story', 'man', 'woman']


THE TOP 15 WORDS FOR TOPIC #2
['experience', 'sound', 'cancer', 'memory', 'girl', 'study', 'music', 'sleep', 'love', 'neuron', 'body', 'man', 'cell', 'woman', 'brain']


THE TOP 15 WORDS FOR TOPIC #3
['problem', 'government', 'dollar', 'money', 'drug', 'care', 'cancer', 'patient', 'disease', 'child', 'cell', 'health', 'percent', 'country', 'brain']


THE TOP 15 WORDS FOR TOPIC #4
['sound', 'parent', 'teach', 'love', 'education', 'brain', 'student', 'teacher', 'game', 'learn', 'music', 'play', 'school', 'child', 'kid']


THE TOP 15 WORDS FOR TOPIC #5
['design', 'learn', 'community', 'grow', 'family', 'cancer', 'building', 'city', 'food', 'lig

In [112]:
# Create Document - Topic Matrix
lsa_output = lsa_cv.transform(dtm_train_cv)

# column names
topicnames = ["Topic" + str(i) for i in range(lsa_cv.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lsa_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic
Doc0,8.81,-3.62,-1.64,-1.39,-0.9,-1.1,-1.25,-2.32,-0.2,-1.37,0.42,0.67,-1.65,-0.84,1.46,0.78,1.92,2.76,0.33,-1.27,-0.05,0.13,-0.03,0.97,-0.22,0
Doc1,6.32,1.35,-1.51,1.11,0.99,1.32,-0.26,0.08,-0.84,0.11,-0.13,-0.37,1.61,-1.94,-0.73,-0.83,-0.95,-0.39,1.99,-0.17,0.9,-0.56,-1.07,-0.58,2.21,0
Doc2,4.07,-0.86,1.85,0.53,-0.29,0.56,0.79,0.39,-0.33,0.7,0.16,-0.25,0.7,-1.06,-0.58,0.03,-1.36,-1.08,0.6,1.55,0.78,-0.27,-0.21,-0.02,1.12,0
Doc3,19.45,-6.49,3.59,-6.9,-3.16,2.94,1.83,-3.26,0.55,-0.06,0.39,-1.3,-0.41,0.76,-3.93,0.49,-7.45,2.67,-0.28,3.63,1.91,-3.73,2.2,2.49,2.01,0
Doc4,10.67,3.67,0.6,-1.99,-0.42,-3.08,0.98,3.01,-0.02,-1.15,-0.24,0.34,-0.24,-0.52,-1.51,3.41,0.3,0.26,-1.3,-0.0,-0.42,-2.09,-1.88,-1.74,0.01,0
Doc5,12.12,-0.0,-3.47,-1.23,1.87,-0.37,0.36,1.3,-1.21,3.52,-1.45,-1.52,-1.65,1.68,0.01,-2.33,1.9,-1.36,1.85,-2.42,1.51,2.29,-0.78,-0.12,-1.19,0
Doc6,2.08,-0.29,-0.27,-0.91,-0.02,-0.08,-0.3,-0.21,0.25,-0.38,0.51,0.02,0.14,-0.48,-0.73,-0.06,0.05,-0.22,-0.57,0.3,0.25,0.28,0.24,-0.61,-0.32,0
Doc7,12.16,-0.67,2.34,7.98,4.81,4.25,-3.71,2.18,1.22,1.52,2.41,-0.98,5.33,0.01,-0.17,-0.92,-1.59,1.79,4.51,0.29,-4.05,-7.83,3.8,2.34,-2.44,0
Doc8,10.54,6.48,0.46,-2.62,-0.44,-0.57,1.69,7.51,-4.99,-1.29,3.89,-0.25,-3.32,6.16,2.64,-1.96,1.11,-1.1,-0.09,1.13,1.43,-0.72,1.28,-0.63,1.34,0
Doc9,3.83,-0.15,0.39,-1.96,1.29,-0.79,0.81,-0.17,-0.29,-0.51,-0.19,0.31,0.54,-0.13,-0.08,-0.5,-0.94,0.26,0.98,-0.5,-0.24,0.84,0.44,1.11,-0.07,0


In [113]:
unique, counts = np.unique(dominant_topic, return_counts=True)

In [114]:
print(np.asarray((unique, counts)).T)

[[   0 2535]
 [   1   65]
 [   2   72]
 [   3    2]
 [   4   12]
 [   5    3]
 [   6   44]
 [   7    9]
 [   8   18]
 [   9   71]
 [  10    8]
 [  11    1]
 [  12    1]
 [  13    3]
 [  14    2]
 [  15    5]
 [  16    1]
 [  17    4]
 [  18    1]
 [  20    4]
 [  21    3]
 [  22    5]
 [  23    6]
 [  24    4]]


## TfidfVectorizer

In [350]:
lsa_tfidf = TruncatedSVD(n_components = 25, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [351]:
lsa_tfidf.fit(dtm_train_tfidf)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=3, random_state=42,
             tol=0.0)

In [352]:
for index,topic in enumerate(lsa_tfidf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['city', 'percent', 'question', 'problem', 'love', 'build', 'system', 'school', 'kid', 'man', 'brain', 'country', 'story', 'child', 'woman']


THE TOP 15 WORDS FOR TOPIC #1
['boy', 'parent', 'love', 'community', 'education', 'kid', 'mother', 'country', 'story', 'family', 'school', 'child', 'girl', 'man', 'woman']


THE TOP 15 WORDS FOR TOPIC #2
['thank', 'piano', 'sing', 'video', 'instrument', 'hear', 'art', 'musician', 'piece', 'love', 'brain', 'song', 'sound', 'play', 'music']


THE TOP 15 WORDS FOR TOPIC #3
['memory', 'sleep', 'man', 'doctor', 'blood', 'gene', 'drug', 'disease', 'neuron', 'body', 'patient', 'cancer', 'woman', 'cell', 'brain']


THE TOP 15 WORDS FOR TOPIC #4
['girl', 'sun', 'animal', 'galaxy', 'space', 'story', 'ocean', 'star', 'light', 'man', 'water', 'planet', 'universe', 'earth', 'woman']


THE TOP 15 WORDS FOR TOPIC #5
['carbon', 'plant', 'city', 'percent', 'disease', 'patient', 'drug', 'climate', 'water', 'country', 'health', 'cance

In [342]:
import numpy as np
# Create Document - Topic Matrix
lsa_output = lsa_tfidf.transform(dtm_train_tfidf)

# column names
topicnames = ["Topic" + str(i) for i in range(lsa_tfidf.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lsa_output, 4), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Get next dominant topic for each document
sorted_topics = np.argsort(df_document_topic.values, axis = 1)
secondary_topic = [doc[-2] for doc in sorted_topics]
df_document_topic['seconary_topic'] = secondary_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic,seconary_topic
Doc0,0.2893,-0.1298,-0.0077,-0.0802,-0.0394,-0.1118,-0.0652,0.0798,0.0306,0.0244,0.0031,0.0663,-0.0908,-0.0013,-0.0813,-0.0681,0.0231,0.019,-0.0321,-0.0108,-0.0099,0.0414,0.1027,0.0315,-0.0165,0,22
Doc1,0.2427,0.0602,-0.0721,-0.0224,-0.0415,0.0304,0.0724,0.0359,0.0335,-0.0079,0.0075,-0.0222,0.0132,0.0523,0.0194,0.019,0.0015,-0.0605,-0.017,0.074,-0.0301,0.0775,0.0076,-0.0524,0.0247,0,21
Doc2,0.2028,-0.0519,0.0429,0.1673,0.0173,0.007,0.0354,0.0654,0.0126,-0.0371,-0.002,-0.0509,0.0697,0.0268,0.1105,0.0221,0.0032,-0.0643,-0.0274,-0.0711,-0.0296,0.0202,0.0883,-0.016,0.0104,0,3
Doc3,0.2888,-0.1029,0.0607,0.0238,0.0674,-0.0483,-0.0119,0.046,-0.0443,0.0023,-0.0305,-0.0019,0.0565,0.037,0.0142,0.0232,0.0542,-0.1038,-0.0072,-0.0535,-0.01,0.0316,-0.0722,0.0023,0.0062,0,4
Doc4,0.3206,0.1082,-0.0095,0.0066,0.0681,-0.0531,-0.0616,-0.0697,0.0265,-0.1345,0.0028,0.0352,0.0122,0.0356,-0.0047,-0.0626,0.0339,0.06,0.0132,0.0404,-0.0619,-0.0137,-0.0805,-0.044,-0.0872,0,1
Doc5,0.3464,-0.0044,0.0478,-0.1418,-0.0535,0.1313,0.0058,-0.0792,0.0359,0.0002,-0.0493,0.0704,-0.0004,-0.0318,0.0162,-0.0268,-0.0624,-0.02,0.0087,0.0844,0.033,0.0346,0.0675,-0.0416,0.1044,0,5
Doc6,0.1652,-0.0147,0.0344,-0.0567,0.0439,-0.1132,-0.0281,0.0422,-0.0285,-0.0063,0.0881,0.0021,0.0196,-0.0692,-0.0178,0.006,-0.0976,-0.066,0.0272,-0.0153,0.0197,-0.0481,0.0262,0.0698,0.0446,0,10
Doc7,0.2402,0.0044,-0.0252,0.1227,-0.0789,0.0278,0.1711,-0.0144,0.1132,0.0722,0.0747,-0.0813,0.0206,0.0061,-0.0114,0.0365,0.0551,-0.0024,0.0455,-0.0078,0.0496,0.0525,-0.0752,-0.1014,0.0187,0,6
Doc8,0.3099,0.1684,0.0174,-0.0181,0.0814,0.0666,-0.043,0.0058,0.0031,-0.0756,0.0258,0.0383,0.0492,0.0306,-0.0406,0.1416,-0.0616,0.152,0.0116,0.1237,0.0191,0.0112,-0.0421,0.0537,0.0348,0,1
Doc9,0.1618,0.0232,0.0561,-0.0213,-0.038,-0.1021,-0.0094,-0.0055,-0.0459,-0.0527,-0.0627,-0.0545,0.0124,0.0122,-0.0545,-0.0253,0.0054,-0.0211,-0.0058,0.0553,-0.0127,-0.0185,0.0445,0.0056,-0.0218,0,2


In [159]:
np.argmax(df_document_topic.values, axis=1)

array([ 0,  0,  0, ..., 25, 25,  0])

In [157]:
unique, counts = np.unique(dominant_topic, return_counts=True)

In [158]:
print(np.asarray((unique, counts)).T)

[[   0 2445]
 [   1   26]
 [   2  101]
 [   3   89]
 [   5    2]
 [   6   13]
 [   7    8]
 [   8   28]
 [  10   28]
 [  11   16]
 [  12   26]
 [  13    3]
 [  14    6]
 [  15    5]
 [  16    6]
 [  17    9]
 [  18    4]
 [  19   29]
 [  20    1]
 [  21   13]
 [  22    3]
 [  23    5]
 [  24   13]]


In [343]:
unique, counts = np.unique(secondary_topic, return_counts=True)

In [344]:
print(np.asarray((unique, counts)).T)

[[  1 324]
 [  2 215]
 [  3 276]
 [  4 222]
 [  5 128]
 [  6 159]
 [  7 153]
 [  8  72]
 [  9  54]
 [ 10 103]
 [ 11 114]
 [ 12 116]
 [ 13  78]
 [ 14  96]
 [ 15 114]
 [ 16 121]
 [ 17  87]
 [ 18  85]
 [ 19  52]
 [ 20  48]
 [ 21  65]
 [ 22  72]
 [ 23  72]
 [ 24  53]]


# Fit LDA

## CountVectorizer

## TfidfVectorizer

In [177]:
lda_tfidf = LatentDirichletAllocation(n_components=25,           # Number of topics
                                      max_iter=5,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )

In [178]:
lda_tfidf.fit_transform(dtm_train_tfidf)

array([[0.00297412, 0.00297412, 0.00297412, ..., 0.00297412, 0.92862114,
        0.00297412],
       [0.00470189, 0.00470189, 0.00470189, ..., 0.00470189, 0.88715454,
        0.00470189],
       [0.00448913, 0.00448913, 0.00448913, ..., 0.00448913, 0.89226077,
        0.00448913],
       ...,
       [0.00389272, 0.00389272, 0.00389272, ..., 0.00389272, 0.90657474,
        0.00389272],
       [0.00534758, 0.00534758, 0.00534758, ..., 0.00534758, 0.87165807,
        0.00534758],
       [0.00340726, 0.00340726, 0.00340726, ..., 0.00340726, 0.91822585,
        0.00340726]])

In [179]:
for index,topic in enumerate(lda_tfidf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['process', 'banjo', 'critical', 'light', 'play', 'chant', 'astrophysicist', 'clinical', 'extremist', 'limb', 'phantom', 'trial', 'cancer', 'music', 'map']


THE TOP 15 WORDS FOR TOPIC #1
['transcript', 'director', 'company', 'mobile', 'org', 'conversation', 'chart', 'phone', 'team', 'structure', 'gold', 'innovation', 'rhythm', 'technology', 'beat']


THE TOP 15 WORDS FOR TOPIC #2
['donor', 'arm', 'refine', 'flight', 'dilemma', 'augment', 'consensus', 'patient', 'scaffold', 'liver', 'group', 'transplant', 'stem', 'organ', 'cell']


THE TOP 15 WORDS FOR TOPIC #3
['oil', 'unit', 'planet', 'beetle', 'refrigeration', 'star', 'diplomat', 'antibiotic', 'posture', 'dung', 'fuel', 'cloud', 'joint', 'car', 'knee']


THE TOP 15 WORDS FOR TOPIC #4
['basic', 'labor', 'plank', 'beast', 'mountain', 'exile', 'person', 'door', 'mouth', 'chinese', 'fire', 'sun', 'tree', 'character', 'hercule']


THE TOP 15 WORDS FOR TOPIC #5
['young', 'prison', 'camp', 'bomb', 'suspicious'

In [180]:
# Create Document - Topic Matrix
lda_output = lda_tfidf.transform(dtm_train_tfidf)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_tfidf.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Get next dominant topic for each document
sorted_topics = np.argsort(df_document_topic.values, axis = 1)
secondary_topic = [doc[-2] for doc in sorted_topics]
df_document_topic['seconary_topic'] = secondary_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic,seconary_topic
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93,0.0,23,23
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.0,23,23
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.0,23,23
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.94,0.0,23,23
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,23,23
Doc5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93,0.0,23,23
Doc6,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.88,0.01,23,23
Doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,23,23
Doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.91,0.0,23,23
Doc9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.88,0.0,23,23


In [144]:
unique, counts = np.unique(dominant_topic, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   1 2871]
 [   8    6]
 [  11    2]]
