# Import Packages

In [1]:
import nltk
from nltk.corpus import wordnet
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import random
import re 
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, train_test_split

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [2]:
talk_df = pd.read_csv('has_transcript_clean.csv', index_col = 0)

In [3]:
talk_df.head()

Unnamed: 0,date,speaker,title,url,length,summ,tags,views,transcript,date_recorded,upload_date,occupation,bio,comments,duration
0,Jan 2020,Ipsita Dasgupta,"To challenge the status quo, find a ""co-conspi...",/talks/ipsita_dasgupta_to_challenge_the_status...,11:03,"In a complex and changing world, how can we ma...","['innovation', 'collaboration', 'society', 'so...",599446.0,So I've been thinking about how to explain thi...,2019-09-24,2020-01-02,"Business executive, ""co-conspirator""",Ipsita Dasgupta drives the consumption of ente...,7.0,663
1,Jan 2020,Rod Phillips,A brief history of alcohol,/talks/rod_phillips_a_brief_history_of_alcohol,4:56,Nobody knows exactly when humans began to crea...,"['TED-Ed', 'education', 'animation', 'history'...",501290.0,This chimpanzee stumbles across a windfall of ...,2020-01-02,2020-01-02,,,,296
2,Jan 2020,Pat Mitchell,Dangerous times call for dangerous women,/talks/pat_mitchell_dangerous_times_call_for_d...,17:14,Pat Mitchell has nothing left to prove and muc...,"['women', 'women in business', 'community', 'a...",461705.0,"Recently, I've been declaring to anyone who wo...",2019-12-04,2020-01-02,Dangerous woman,Pat Mitchell is a lifelong advocate for women ...,14.0,1034
3,Dec 2019,Cara E. Yar Khan,The beautiful balance between courage and fear,/talks/cara_e_yar_khan_the_beautiful_balance_b...,9:55,After being diagnosed with a rare genetic cond...,"['fear', 'personal growth', 'health', 'life', ...",880662.0,"When we're young, we're innocently brave, and ...",2019-12-04,2019-12-23,Human rights and disability activist,Cara E. Yar Khan is an international human rig...,17.0,595
4,Dec 2019,Valorie Kondos Field,Why winning doesn't always equal success,/talks/valorie_kondos_field_why_winning_doesn_...,15:49,Valorie Kondos Field knows a lot about winning...,"['success', 'sports', 'leadership', 'empathy',...",956498.0,"OK, I have a question for all of us. You ready...",2019-12-04,2019-12-20,Gymnastics coach,Valorie Kondos Field is the retired head coach...,18.0,949


# Create Tokenizer

In [273]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def spacy_tokenizer(text):
    
    # Remove audience reactions -- need to revisit (remove )
    no_audience_reacts = text.replace('(Applause.', ' ').replace('(Laughter.', ' ').replace('(Applause)', ' ').replace('(Laughter)', ' ')
    
    # Handle missing spaces after sentences
    cleanr = re.compile('[.!?,;][A-Za-z][A-Za-z]')
    no_spaces = re.findall(cleanr, no_audience_reacts)

    new_text = no_audience_reacts

    if len(no_spaces) > 0:
        for match in no_spaces:
            punc = match[0] # get the punctuation mark
            word = match[1:] # get the start of the word
            if punc != '?':
                new_text = re.sub(f"{match}", f"{punc} {word}", new_text)
            else:
                new_text = re.sub(f"[?]{word}", f"? {word}", new_text)
    
    # Remove quotation marks
    no_quotes = new_text.replace('\"', ' ').replace('”', ' ').replace('’', '')
    
    # Address hyphenation issue -- need to revisit
    no_ism = no_quotes.replace('-ism', 'ism')
    # no_ism = new_text.replace('-ism', 'ism')
    no_dash = no_ism.replace('–', ' ').replace('[0-9]*-', '').replace('—', ' ').replace('-', '')
    
    # Remove parentheses
    # no_parentheses = no_dash.replace('(', ' ').replace(')', ' ')
    
    # Remove .., ..., ....
    # no_ellipses = no_parentheses.replace('....', '').replace('...', '').replace('..','').replace('…', '')
    no_ellipses = no_dash.replace('....', '').replace('...', '').replace('..','').replace('…', '')
    # no_ellipses = no_ism.replace('....', '').replace('...', '').replace('..','').replace('…', '')
    
    # Remove music notes
    no_notes = no_ellipses.replace('♪', '').replace('♫', '')
    
    # HANDLE NUMBERS!
    
    # If ' ' then any alphanumeric then punctuation then any capitalized word then ' ' (correct for missing space after end of sentence)
    
    # Replace all whitespace with one space
    cleantext = ' '.join(no_notes.split())
    cleantext = cleantext.strip()
    
    # Creating our token object, which is used to create documents with linguistic annotations.
    # we disabled the parser and ner parts of the pipeline in order to speed up parsing
    mytokens = nlp(cleantext.lower(), disable=['parser', 'ner'])

    # Removing words except for nouns, verbs, adjectives, and adverbs
    mytokens = [word for word in mytokens if (word.pos_ == 'NOUN') or (word.pos_ == 'VERB')] # or (word.pos_ == 'ADJ') or (word.pos_ == 'ADV')]
    
    # Lemmatizing each token and converting each token into lowercase
    lemmas = []
    for word in mytokens:
        if word.pos_ == 'NOUN':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.NOUN))
        elif word.pos_ == 'VERB':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.VERB))
        elif word.pos_ == 'ADV':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.ADV))
        elif word.pos_ == 'ADJ':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.ADJ))
    
#     mytokens = [word.lemma_.lower().strip() for word in mytokens]

#     # Removing stop words
#    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

    lemmas = [word for word in lemmas if word not in stop_words and word not in punctuations]
    
    # return preprocessed list of tokens
    return lemmas

In [274]:
lemmatizer = WordNetLemmatizer()

In [275]:
def get_new_transcript(lemmas):
    transcript = ''
    for lemma in lemmas:
        transcript += lemma + ' '
    return transcript.strip()

# Test Tokenizer on Sentence-level and Speech-level

In [27]:
sentence_1 = "So we knew the numbers were deplorable, and it was manifesting itself in our own organization. So we decided — actually, I\'ll just give you a quick example of a young engineer, a 25-year-old woman, who told us a great story about her daily life, to just exemplify it for us. People, people, place, person, personally."

In [28]:
# remove html tags from all of the text before processing
cleanr = re.compile('[.!?,][A-Z][A-Za-z]+')
cleantext = re.findall(cleanr, sentence_1)

new_sentence = sentence_1
for x in cleantext:
    new_sentence = re.sub(f"{x}", f"{x[0]} {x[1:]}", new_sentence)

In [29]:
new_sentence

"So we knew the numbers were deplorable, and it was manifesting itself in our own organization. So we decided — actually, I'll just give you a quick example of a young engineer, a 25-year-old woman, who told us a great story about her daily life, to just exemplify it for us. People, people, place, person, personally."

In [12]:
new_sentence

"So we knew the numbers were deplorable, and it was manifesting itself in our own organization. So we decided — actually, I'll just give you a quick example of a young engineer, a 25-year-old woman, who told us a great story about her daily life, to just exemplify it for us. People, people, place, person, personally."

In [10]:
for x in cleantext:
    print(x[0])

.
.


In [30]:
transcript_1 = talk_df.transcript[0]

In [31]:
sentence = "This sentence has no problems."
spacy_tokenizer(sentence)

['sentence', 'problem']

In [205]:
tokenized = spacy_tokenizer(transcript_1)

In [206]:
tokenized

['think',
 'explain',
 'concept',
 'decide',
 'start',
 'understand',
 'achieve',
 'height',
 'change',
 'world',
 'need',
 'people',
 'people',
 'universe',
 'conspire',
 'think',
 'need',
 'like',
 'coconspirator',
 'coconspirator',
 'people',
 'need',
 'tend',
 'people',
 'bend',
 'rule',
 'break',
 'challenge',
 'status',
 'quo',
 'stand',
 'norm',
 'describe',
 'experience',
 'crystallize',
 'idea',
 'coconspirator',
 'mind',
 'executive',
 'multinational',
 'face',
 'problem',
 'woman',
 'workforce',
 'context',
 'percent',
 'woman',
 'work',
 'look',
 'number',
 'percent',
 'know',
 'number',
 'manifest',
 'organization',
 'decide',
 'example',
 'engineer',
 'woman',
 'tell',
 'story',
 'life',
 'exemplify',
 'walk',
 'house',
 'morning',
 'run',
 'bunch',
 'chore',
 'motherinlaw',
 'live',
 'inlaws',
 'start',
 'bit',
 'leave',
 'housework',
 'evening',
 'overshot',
 'time',
 'hour',
 'champion',
 'fatherinlaw',
 'husband',
 'start',
 'bit',
 'motherinlaw',
 'care',
 'need',
 '

In [207]:
for talk in talk_views.transcript[0:5]:
    print(spacy_tokenizer(talk))
    print('------')

['think', 'explain', 'concept', 'decide', 'start', 'understand', 'achieve', 'height', 'change', 'world', 'need', 'people', 'people', 'universe', 'conspire', 'think', 'need', 'like', 'coconspirator', 'coconspirator', 'people', 'need', 'tend', 'people', 'bend', 'rule', 'break', 'challenge', 'status', 'quo', 'stand', 'norm', 'describe', 'experience', 'crystallize', 'idea', 'coconspirator', 'mind', 'executive', 'multinational', 'face', 'problem', 'woman', 'workforce', 'context', 'percent', 'woman', 'work', 'look', 'number', 'percent', 'know', 'number', 'manifest', 'organization', 'decide', 'example', 'engineer', 'woman', 'tell', 'story', 'life', 'exemplify', 'walk', 'house', 'morning', 'run', 'bunch', 'chore', 'motherinlaw', 'live', 'inlaws', 'start', 'bit', 'leave', 'housework', 'evening', 'overshot', 'time', 'hour', 'champion', 'fatherinlaw', 'husband', 'start', 'bit', 'motherinlaw', 'care', 'need', 'middle', 'day', 'surround', 'man', 'age', 'expectation', 'society', 'achieve', 'career',

['question', 'win', 'retire', 'head', 'coach', 'woman', 'team', 'position', 'hold', 'year', 'thank', 'tenure', 'experience', 'lot', 'win', 'lead', 'team', 'championship', 'induct', 'vote', 'coach', 'century', 'conference', 'win', 'share', 'insight', 'win', 'success', 'world', 'crisis', 'culture', 'create', 'school', 'business', 'politic', 'win', 'cost', 'society', 'honor', 'people', 'pyramid', 'applaud', 'people', 'win', 'championship', 'election', 'award', 'people', 'leave', 'institution', 'damage', 'kid', 'leave', 'school', 'damage', 'award', 'medal', 'athlete', 'leave', 'team', 'damage', 'profit', 'employee', 'leave', 'company', 'damage', 'end', 'result', 'end', 'result', 'win', 'component', 'sweep', 'rug', 'damage', 'timeout', 'timeout', 'need', 'redefine', 'success', 'success', 'develop', 'champion', 'life', 'world', 'win', 'lose', 'success', 'develop', 'champion', 'life', 'team', 'business', 'tell', 'card', 'bragging', 'right', 'dictate', 'way', 'win', 'dictate', 'way', 'success'

In [208]:
spacy_tokenizer(talk_views.transcript[3])

['dream',
 'life',
 'want',
 'astronaut',
 'rocket',
 'scientist',
 'dream',
 'travel',
 'continent',
 'dream',
 'work',
 'country',
 'world',
 'thank',
 'lot',
 'courage',
 'dream',
 'come',
 'thing',
 'courage',
 'appear',
 'need',
 'result',
 'reflection',
 'work',
 'involve',
 'balance',
 'fear',
 'bravery',
 'fear',
 'thing',
 'courage',
 'step',
 'balance',
 'magic',
 'lie',
 'balance',
 'deal',
 'day',
 'word',
 'wheel',
 'use',
 'wheelchair',
 'grow',
 'run',
 'jump',
 'dancing',
 'love',
 'dance',
 'midtwentie',
 'begin',
 'experience',
 'series',
 'fall',
 'year',
 'diagnose',
 'condition',
 'inclusion',
 'body',
 'muscle',
 'waste',
 'disease',
 'affect',
 'muscle',
 'head',
 'toe',
 'people',
 'diagnose',
 'date',
 'treatment',
 'cure',
 'year',
 'onset',
 'lead',
 'use',
 'wheelchair',
 'diagnose',
 'change',
 'news',
 'experience',
 'illness',
 'disability',
 'idea',
 'disease',
 'progress',
 'dishearten',
 'listen',
 'people',
 'advise',
 'limit',
 'ambition',
 'dream',


In [49]:
talk_views.transcript[3]

'When we\'re young, we\'re innocently brave, and we fearlessly dream about what our lives might be like. Maybe you wanted to be an astronaut or a rocket scientist. Maybe you dreamed of traveling to every continent. Since I was very young, I dreamed of working for the United Nations in some of the most difficult countries in the world. And thanks to a lot of courage that dream came true.But here\'s the thing about courage: it doesn\'t just appear whenever we need it. It\'s the result of tough reflection and real work, involving the balance between fear and bravery. Without fear, we\'ll do foolish things. And without courage, we\'ll never step into the unknown. The balance of the two is where the magic lies, and it\'s a balance we all deal with every day.First, a word about my fancy wheels. I haven\'t always used a wheelchair. I grew up like many of you, running, jumping and dancing. I love to dance. However, in my mid-twenties, I began to experience a series of inexplicable falls. And a

In [76]:
# Handle missing spaces after sentences
cleanr = re.compile('[.!?,;][A-Za-z][A-Za-z]')
no_spaces = re.findall(cleanr, talk_views.transcript[3])

new_text = talk_views.transcript[3]
print(no_spaces)
if len(no_spaces) > 0:
    for match in no_spaces:
        punc = match[0]
        word = match[1:]
        if punc != '?':
            new_text = re.sub(f"{match}", f"{punc} {word}", new_text)
        else:
            new_text = re.sub(f"[?]{word}", f"? {word}", new_text)

['.Bu', '.Fi', '.Wh', '.It', '?Di', '.An', '.Ne', '.No', '.Th']
.
.
.
.
?
.
.
.
.


In [170]:
cv_test = CountVectorizer()

In [168]:
new_transcript = ''
new_sentence = ''
for token in tokens_1:
    new_transcript += token + ' '
for token in tokens_2:
    new_sentence += token + ' '

In [171]:
sentence_cv = cv_test.fit_transform([new_transcript.strip(), new_sentence.strip()])

In [163]:
tokens_1 = spacy_tokenizer(transcript_1)
tokens_2 = spacy_tokenizer(sentence_1 + ' potatoes')

In [159]:
len(cv_test.get_feature_names())

348

In [162]:
len(cv_test.get_feature_names())

24

In [172]:
len(cv_test.get_feature_names())

351

# Create Vectorizers
### Considerations
* Which vectorizer?
    * CountVectorizer
    * TfidfVectorizer
* Which topic modeling technique?
    * LSA (latent semantic analysis)
    * LDA (latent dirichlet allocation
* How many times can a token appear in the corpus?
    * Start: max_df = 0.9
    * Used:
        * 0.5
* How many tokens to include?
    * Start: all
    * Used:
        * 1000
        * 10000 --> best so far, but way too many as a result (with nouns, verbs, adj, adv)
* How many topics to use?
    * Used:
        * 25
        * 50
* How to evaluate topics


* Issue with lemmatization
* Issue with hyphens
* Consider numbers --> only if n-grams
* Issue with periods
* What is the most common word? for longer speeches?
* Default arguments

In [34]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, max_df=0.50, min_df=3, stop_words='english')
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer, max_features=1000, min_df=3, max_df=0.50)

# Subset for data with views

In [20]:
talk_views = talk_df[talk_df.views.notnull() & talk_df.views > 0]

In [13]:
talk_views.shape

(3599, 15)

In [14]:
talk_views.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3599 entries, 0 to 3648
Data columns (total 15 columns):
date             3599 non-null object
speaker          3599 non-null object
title            3599 non-null object
url              3599 non-null object
length           3599 non-null object
summ             3599 non-null object
tags             3599 non-null object
views            3599 non-null float64
transcript       3599 non-null object
date_recorded    3510 non-null object
upload_date      3510 non-null object
occupation       3181 non-null object
bio              3193 non-null object
comments         2982 non-null float64
duration         3599 non-null int64
dtypes: float64(2), int64(1), object(12)
memory usage: 449.9+ KB


# Train, Test, Split

In [245]:
X = talk_views.transcript
y = talk_views.views

In [246]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Preprocess Transcripts

## Tokenize Transcripts

In [276]:
token_obj = X_train.apply(spacy_tokenizer)

In [266]:
X_train_new = token_obj.apply(get_new_transcript)

## Stemming?

In [249]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [252]:
stemmed = []
for doc in token_obj:
    stemmed.append([stemmer.stem(token) for token in doc])

In [256]:
stem_obj = pd.DataFrame(stemmed)

In [259]:
X_train_new = [get_new_transcript(doc) for doc in stemmed]

In [277]:
cv_new = CountVectorizer()

# Find percent match

In [265]:
test = ['anestheseology', 'anestheseologist', 'clinic', 'clinician', 
        'belief', 'believe', 'believer', 'built', 'build', 'builder', 'building']

### CountVectorizer

In [278]:
# fit and transform training data
dtm_train_cv = cv_new.fit_transform(X_train_new)
# dtm_test_cv = cv.transform(X_test)

In [279]:
len(cv_new.get_feature_names())

35066

In [216]:
dtm_train_cv_df = pd.DataFrame(dtm_train_cv.toarray())
dtm_train_cv_df.columns = cv_new.get_feature_names()

In [280]:
cv_new.get_feature_names()

['000',
 '0001',
 '000foot',
 '000letter',
 '000pound',
 '000yearold',
 '0025',
 '10',
 '100',
 '100ths',
 '100yearold',
 '10day',
 '10footby10foot',
 '10th',
 '10ton',
 '10tothe18thpower',
 '10x',
 '10year',
 '11',
 '11th',
 '11yearold',
 '12',
 '1230s',
 '1250s',
 '12bar',
 '12day',
 '12th',
 '1300s',
 '138th',
 '13th',
 '13yearold',
 '1400s',
 '1450s',
 '14th',
 '14to17yearolds',
 '14yearold',
 '14yearolds',
 '15hour',
 '15monthold',
 '15th',
 '15yearold',
 '15yearolds',
 '1600s',
 '16yearold',
 '16yearolds',
 '1700s',
 '1750s',
 '17th',
 '17yearold',
 '1800fightbribes',
 '1800s',
 '1820s',
 '1830s',
 '1840s',
 '1850s',
 '1851',
 '1860s',
 '1863',
 '1870s',
 '1874',
 '1880s',
 '18day',
 '18th',
 '18unit',
 '18yearold',
 '1900s',
 '1920s',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1970s',
 '1970sinspires',
 '1980s',
 '1990s',
 '19hour',
 '19th',
 '19yearold',
 '19yearolds',
 '1it',
 '1k',
 '1l',
 '1st',
 '20',
 '2000s',
 '200yearold',
 '200yearolds',
 '2011',
 '2030s',
 '20day',
 '20

## TfidfVectorizer

In [225]:
tfidf_new = TfidfVectorizer(max_features=1000, min_df=3, max_df=0.50)

In [226]:
# fit and transform training data
dtm_train_tfidf = tfidf_new.fit_transform(X_train_new)
# dtm_test_tfidf = tfidf_new.transform(X_test)

In [228]:
len(tfidf_new.get_feature_names())

1000

In [230]:
dtm_train_tfidf_df = pd.DataFrame(dtm_train_tfidf.toarray())
dtm_train_tfidf_df.columns = tfidf_new.get_feature_names()

In [231]:
dtm_train_tfidf_df.columns

Index(['ability', 'accept', 'access', 'accord', 'account', 'achieve', 'act',
       'action', 'activity', 'actor',
       ...
       'winter', 'wire', 'wish', 'woman', 'wonder', 'word', 'worker', 'worry',
       'write', 'youth'],
      dtype='object', length=1000)

# Fit TruncatedSVD (LSA)

## CountVectorizer

In [218]:
lsa_cv = TruncatedSVD(n_components = 25, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [219]:
lsa_cv.fit(dtm_train_cv)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=3, random_state=42,
             tol=0.0)

In [220]:
for index,topic in enumerate(lsa_cv.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv_new.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['world', 'need', 'come', 'story', 'want', 'way', 'time', 'year', 'work', 'use', 'look', 'thing', 'know', 'think', 'people']


THE TOP 15 WORDS FOR TOPIC #1
['hear', 'therapist', 'write', 'kind', 'mess', 'letter', 'guy', 'know', 'term', 'book', 'life', 'think', 'people', 'tell', 'story']


THE TOP 15 WORDS FOR TOPIC #2
['power', 'save', 'heat', 'sun', 'tell', 'dollar', 'oil', 'organ', 'cost', 'electricity', 'use', 'cell', 'energy', 'engine', 'story']


THE TOP 15 WORDS FOR TOPIC #3
['engineer', 'bridge', 'structure', 'look', 'liver', 'scaffold', 'regenerate', 'know', 'use', 'muscle', 'robot', 'patient', 'body', 'organ', 'cell']


THE TOP 15 WORDS FOR TOPIC #4
['migrant', 'scaffold', 'liver', 'regenerate', 'dollar', 'body', 'percent', 'people', 'patient', 'country', 'remittance', 'send', 'organ', 'cell', 'money']


THE TOP 15 WORDS FOR TOPIC #5
['technology', 'fuel', 'fire', 'carbon', 'people', 'electricity', 'plant', 'save', 'power', 'think', 'grow', 'gard

In [222]:
# Create Document - Topic Matrix
lsa_output = lsa_cv.transform(dtm_train_cv)

# column names
topicnames = ["Topic" + str(i) for i in range(lsa_cv.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train_new))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lsa_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic
Doc0,17.58,-3.14,-1.66,2.68,-2.03,-0.41,1.69,0.93,-0.88,-1.72,-2.44,1.89,-1.4,1.83,-0.7,-2.1,4.72,2.3,-5.64,1.0,-0.04,-6.26,-1.26,-8.5,-3.21,0
Doc1,10.1,-2.2,-0.07,-1.07,1.46,-2.17,-0.01,-0.12,-0.5,-0.65,-0.71,0.89,-1.68,-1.65,-0.82,1.42,-0.98,-0.71,-2.89,-0.62,-0.74,-0.07,1.01,0.92,-0.06,0
Doc2,4.66,-2.17,1.58,3.21,1.04,-0.99,-0.36,0.7,0.76,0.58,-0.05,-1.32,-1.96,-0.41,-0.53,0.97,-0.24,0.29,0.32,-0.68,-0.64,0.04,0.03,0.43,-0.25,0
Doc3,35.88,-5.61,-2.24,13.17,-3.54,-4.39,-3.84,-3.07,-2.41,6.43,0.45,-0.78,-6.98,31.54,-20.91,-21.01,-13.02,-19.32,12.54,13.0,9.73,0.23,18.69,1.19,5.57,0
Doc4,19.62,3.44,-4.48,-0.16,-0.55,-2.09,-0.76,-2.76,0.08,0.19,0.48,-0.58,-2.51,1.16,-0.36,1.02,-1.61,-3.73,3.37,-1.96,-0.07,1.64,-8.35,-0.6,-4.35,0
Doc5,22.39,-0.45,-1.71,1.27,-0.26,3.74,-4.58,-1.38,0.39,2.51,-0.09,2.52,-3.65,-1.45,2.05,4.35,0.23,-0.07,-0.89,-0.25,-2.59,-1.76,2.22,1.27,0.76,0
Doc6,5.01,0.24,-0.86,0.34,-0.77,0.55,-1.23,-0.62,-0.21,0.1,0.74,0.07,0.19,1.08,-0.07,0.07,0.0,-0.15,0.49,-0.55,-0.79,0.25,-0.38,0.42,-0.6,0
Doc7,15.31,-3.28,-0.52,1.14,0.19,-0.57,-5.53,-1.59,-2.01,3.7,-1.05,-0.24,-7.22,1.81,-3.58,12.09,-1.6,1.27,-4.02,7.63,-12.59,9.96,4.14,0.48,1.67,0
Doc8,15.71,11.26,6.05,-1.14,1.77,-1.65,0.07,-0.47,-1.06,0.61,-0.44,3.33,-3.04,-1.7,-1.92,1.05,-0.85,2.51,-0.91,-2.62,0.58,-1.11,0.66,0.72,0.54,0
Doc9,7.72,0.97,-0.42,1.46,-2.12,-1.68,-1.1,-0.87,0.45,-0.21,-0.13,0.06,-1.19,0.04,-1.21,-0.85,1.33,0.31,-0.66,0.04,-1.74,-0.89,-1.44,-0.77,-1.33,0


In [223]:
unique, counts = np.unique(dominant_topic, return_counts=True)

In [224]:
print(np.asarray((unique, counts)).T)

[[ 0 87]
 [ 1  3]
 [ 2  2]
 [ 3  1]
 [ 4  1]
 [ 8  1]
 [ 9  1]
 [11  1]
 [20  1]
 [21  1]
 [23  1]]


## TfidfVectorizer

In [232]:
lsa_tfidf = TruncatedSVD(n_components = 25, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [233]:
lsa_tfidf.fit(dtm_train_tfidf)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=3, random_state=42,
             tol=0.0)

In [235]:
for index,topic in enumerate(lsa_tfidf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf_new.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['create', 'system', 'build', 'city', 'group', 'music', 'job', 'man', 'ask', 'country', 'power', 'idea', 'woman', 'technology', 'story']


THE TOP 15 WORDS FOR TOPIC #1
['love', 'man', 'night', 'remember', 'clock', 'wake', 'song', 'teenager', 'play', 'woman', 'story', 'brother', 'memory', 'music', 'sleep']


THE TOP 15 WORDS FOR TOPIC #2
['fight', 'narrative', 'future', 'democracy', 'book', 'therapist', 'write', 'war', 'country', 'police', 'government', 'letter', 'journalist', 'woman', 'story']


THE TOP 15 WORDS FOR TOPIC #3
['bank', 'body', 'live', 'wear', 'cell', 'size', 'bone', 'boat', 'limb', 'fish', 'cave', 'river', 'dinosaur', 'story', 'animal']


THE TOP 15 WORDS FOR TOPIC #4
['architecture', 'trust', 'stuff', 'play', 'newspaper', 'musician', 'computer', 'song', 'city', 'building', 'augment', 'page', 'design', 'robot', 'music']


THE TOP 15 WORDS FOR TOPIC #5
['fuel', 'carbon', 'cool', 'sun', 'dollar', 'fire', 'power', 'building', 'heat', 'electric

In [237]:
import numpy as np
# Create Document - Topic Matrix
lsa_output = lsa_tfidf.transform(dtm_train_tfidf)

# column names
topicnames = ["Topic" + str(i) for i in range(lsa_tfidf.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train_new))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lsa_output, 4), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Get next dominant topic for each document
sorted_topics = np.argsort(df_document_topic.values, axis = 1)
secondary_topic = [doc[-2] for doc in sorted_topics]
df_document_topic['seconary_topic'] = secondary_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic,seconary_topic
Doc0,0.3185,-0.1447,-0.1185,-0.1885,0.0584,0.0662,-0.018,0.0452,0.1354,0.0936,0.0175,0.1426,0.1293,0.0163,-0.073,0.1414,0.0886,-0.1212,-0.2945,0.074,-0.0497,0.015,-0.033,-0.0737,-0.1298,0,11
Doc1,0.1781,0.0564,0.0767,-0.0114,-0.1101,-0.0469,0.071,0.0047,0.0627,-0.1535,-0.1474,0.0594,0.0371,0.0441,0.0871,-0.0287,0.1274,0.1342,0.1243,0.0016,-0.0019,-0.0368,0.1699,0.1983,-0.1446,23,23
Doc2,0.1942,0.0731,-0.1968,0.1396,0.0138,-0.0796,-0.1588,-0.2729,0.1433,0.0997,-0.037,0.0365,-0.1895,0.0128,-0.0378,0.1095,-0.0907,0.0037,-0.0079,0.1741,0.0522,-0.03,0.0017,0.1834,0.02,0,23
Doc3,0.2993,-0.0835,-0.1507,0.1329,0.1059,-0.0388,-0.074,0.0203,0.0574,-0.0933,0.0645,-0.1012,-0.0248,0.1208,0.0318,0.1059,0.1258,-0.0016,-0.1672,0.0419,-0.1023,0.0125,0.0857,-0.1532,-0.1048,0,3
Doc4,0.2785,0.0486,0.1424,0.0645,-0.0711,0.0141,-0.0062,-0.0278,-0.0269,0.0594,0.1367,-0.0607,-0.0703,-0.0932,-0.0448,-0.0198,-0.1395,0.0043,0.0366,0.0399,-0.0619,0.071,0.1296,-0.0342,0.1255,0,2
Doc5,0.3682,-0.0327,0.0709,-0.0335,0.008,0.0504,0.1336,-0.0034,-0.1071,-0.0316,-0.1519,0.0098,0.1847,-0.0693,-0.0661,0.0638,0.06,-0.1552,0.08,0.0029,0.0938,-0.1655,0.1987,0.1434,0.1491,0,22
Doc6,0.1683,-0.016,-0.0352,0.0584,0.072,0.0314,0.0161,0.0399,0.3987,0.2473,0.1274,-0.1865,-0.1301,-0.2909,-0.0572,0.0817,0.1432,0.0,0.2124,0.0819,0.2578,-0.0966,0.0463,-0.1217,-0.0292,8,8
Doc7,0.2389,0.5902,-0.3353,-0.1774,-0.2573,-0.0401,-0.089,0.1392,-0.1133,0.0272,-0.0598,-0.0211,0.0399,0.0173,-0.0018,0.0474,-0.0654,0.0071,0.0445,0.1035,0.0369,-0.0171,-0.0304,-0.0452,0.0178,1,1
Doc8,0.3256,0.1424,0.3006,0.1777,-0.019,0.2715,-0.1461,-0.0307,-0.1672,0.0216,0.0483,0.0959,0.0809,-0.021,0.0029,0.0912,0.0678,0.0447,0.0618,0.0414,0.1102,0.1132,-0.0727,-0.1017,-0.077,0,2
Doc9,0.2075,0.0706,0.0452,0.001,0.1904,0.0649,-0.1936,0.0614,0.0766,0.0112,-0.2194,-0.0685,-0.0325,-0.0128,0.3567,0.0407,0.0219,0.1591,-0.0994,0.0343,-0.0918,-0.0577,-0.0339,0.2049,0.0616,14,14


In [238]:
np.argmax(df_document_topic.values, axis=1)

array([26, 25, 26, 26, 26, 26, 25, 25, 26, 25, 26, 26, 25, 25, 25, 25, 25,
       26, 25, 26, 25, 26, 26, 26, 26, 26, 25, 26, 26, 25, 25, 25, 26, 26,
       26, 25, 25, 26, 25, 26, 25, 26, 26, 25, 25, 25, 25, 26, 25, 25, 26,
       25, 26, 25, 26, 25, 25, 26, 25, 25, 25, 26, 26, 26, 25, 25, 26, 26,
       26, 26, 26, 25, 25, 26, 26, 25, 26, 25, 26, 25, 25, 25, 26, 25, 25,
       26, 25, 25, 26, 26, 26, 25, 26, 25, 25, 25, 26, 26, 26, 25])

In [239]:
unique, counts = np.unique(dominant_topic, return_counts=True)

In [240]:
print(np.asarray((unique, counts)).T)

[[ 0 50]
 [ 1  4]
 [ 2  2]
 [ 3  7]
 [ 4  3]
 [ 5  4]
 [ 6  2]
 [ 7  2]
 [ 8  2]
 [10  1]
 [11  1]
 [12  2]
 [13  3]
 [14  3]
 [16  3]
 [17  1]
 [18  2]
 [19  1]
 [20  1]
 [21  2]
 [22  1]
 [23  2]
 [24  1]]


In [241]:
unique, counts = np.unique(secondary_topic, return_counts=True)

In [242]:
print(np.asarray((unique, counts)).T)

[[ 1  5]
 [ 2  6]
 [ 3 10]
 [ 4  6]
 [ 5  4]
 [ 6  2]
 [ 7  3]
 [ 8  4]
 [ 9  3]
 [10  3]
 [11  8]
 [12  3]
 [13  5]
 [14  4]
 [15  2]
 [16  3]
 [17  6]
 [18  2]
 [19  3]
 [20  4]
 [21  5]
 [22  3]
 [23  4]
 [24  2]]


# Fit LDA

## CountVectorizer

## TfidfVectorizer

In [177]:
lda_tfidf = LatentDirichletAllocation(n_components=25,           # Number of topics
                                      max_iter=5,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )

In [178]:
lda_tfidf.fit_transform(dtm_train_tfidf)

array([[0.00297412, 0.00297412, 0.00297412, ..., 0.00297412, 0.92862114,
        0.00297412],
       [0.00470189, 0.00470189, 0.00470189, ..., 0.00470189, 0.88715454,
        0.00470189],
       [0.00448913, 0.00448913, 0.00448913, ..., 0.00448913, 0.89226077,
        0.00448913],
       ...,
       [0.00389272, 0.00389272, 0.00389272, ..., 0.00389272, 0.90657474,
        0.00389272],
       [0.00534758, 0.00534758, 0.00534758, ..., 0.00534758, 0.87165807,
        0.00534758],
       [0.00340726, 0.00340726, 0.00340726, ..., 0.00340726, 0.91822585,
        0.00340726]])

In [179]:
for index,topic in enumerate(lda_tfidf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['process', 'banjo', 'critical', 'light', 'play', 'chant', 'astrophysicist', 'clinical', 'extremist', 'limb', 'phantom', 'trial', 'cancer', 'music', 'map']


THE TOP 15 WORDS FOR TOPIC #1
['transcript', 'director', 'company', 'mobile', 'org', 'conversation', 'chart', 'phone', 'team', 'structure', 'gold', 'innovation', 'rhythm', 'technology', 'beat']


THE TOP 15 WORDS FOR TOPIC #2
['donor', 'arm', 'refine', 'flight', 'dilemma', 'augment', 'consensus', 'patient', 'scaffold', 'liver', 'group', 'transplant', 'stem', 'organ', 'cell']


THE TOP 15 WORDS FOR TOPIC #3
['oil', 'unit', 'planet', 'beetle', 'refrigeration', 'star', 'diplomat', 'antibiotic', 'posture', 'dung', 'fuel', 'cloud', 'joint', 'car', 'knee']


THE TOP 15 WORDS FOR TOPIC #4
['basic', 'labor', 'plank', 'beast', 'mountain', 'exile', 'person', 'door', 'mouth', 'chinese', 'fire', 'sun', 'tree', 'character', 'hercule']


THE TOP 15 WORDS FOR TOPIC #5
['young', 'prison', 'camp', 'bomb', 'suspicious'

In [180]:
# Create Document - Topic Matrix
lda_output = lda_tfidf.transform(dtm_train_tfidf)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_tfidf.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Get next dominant topic for each document
sorted_topics = np.argsort(df_document_topic.values, axis = 1)
secondary_topic = [doc[-2] for doc in sorted_topics]
df_document_topic['seconary_topic'] = secondary_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic,seconary_topic
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93,0.0,23,23
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.0,23,23
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.0,23,23
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.94,0.0,23,23
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,23,23
Doc5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93,0.0,23,23
Doc6,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.88,0.01,23,23
Doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,23,23
Doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.91,0.0,23,23
Doc9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.88,0.0,23,23


In [144]:
unique, counts = np.unique(dominant_topic, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   1 2871]
 [   8    6]
 [  11    2]]
