# Import Packages

In [1]:
import nltk
from nltk.corpus import wordnet
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import random
import re 
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, train_test_split

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [2]:
talk_df = pd.read_csv('has_transcript_clean.csv', index_col = 0)

In [3]:
talk_df.head()

Unnamed: 0,date,speaker,title,url,length,summ,tags,views,transcript,date_recorded,upload_date,occupation,bio,comments,duration
0,Jan 2020,Ipsita Dasgupta,"To challenge the status quo, find a ""co-conspi...",/talks/ipsita_dasgupta_to_challenge_the_status...,11:03,"In a complex and changing world, how can we ma...","['innovation', 'collaboration', 'society', 'so...",599446.0,So I've been thinking about how to explain thi...,2019-09-24,2020-01-02,"Business executive, ""co-conspirator""",Ipsita Dasgupta drives the consumption of ente...,7.0,663
1,Jan 2020,Rod Phillips,A brief history of alcohol,/talks/rod_phillips_a_brief_history_of_alcohol,4:56,Nobody knows exactly when humans began to crea...,"['TED-Ed', 'education', 'animation', 'history'...",501290.0,This chimpanzee stumbles across a windfall of ...,2020-01-02,2020-01-02,,,,296
2,Jan 2020,Pat Mitchell,Dangerous times call for dangerous women,/talks/pat_mitchell_dangerous_times_call_for_d...,17:14,Pat Mitchell has nothing left to prove and muc...,"['women', 'women in business', 'community', 'a...",461705.0,"Recently, I've been declaring to anyone who wo...",2019-12-04,2020-01-02,Dangerous woman,Pat Mitchell is a lifelong advocate for women ...,14.0,1034
3,Dec 2019,Cara E. Yar Khan,The beautiful balance between courage and fear,/talks/cara_e_yar_khan_the_beautiful_balance_b...,9:55,After being diagnosed with a rare genetic cond...,"['fear', 'personal growth', 'health', 'life', ...",880662.0,"When we're young, we're innocently brave, and ...",2019-12-04,2019-12-23,Human rights and disability activist,Cara E. Yar Khan is an international human rig...,17.0,595
4,Dec 2019,Valorie Kondos Field,Why winning doesn't always equal success,/talks/valorie_kondos_field_why_winning_doesn_...,15:49,Valorie Kondos Field knows a lot about winning...,"['success', 'sports', 'leadership', 'empathy',...",956498.0,"OK, I have a question for all of us. You ready...",2019-12-04,2019-12-20,Gymnastics coach,Valorie Kondos Field is the retired head coach...,18.0,949


# Create Tokenizer

In [206]:
# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def spacy_tokenizer(text):
    
    # Handle other parenthetical phrases
    clean_parentheses = re.compile('\(.*\)')
    no_parentheses = re.sub(clean_parentheses, ' ', text)
    
    # Remove audience reactions -- need to revisit (remove )
    no_audience_reacts = no_parentheses.replace('(Applause.', '. ').replace('(Laughter.', '. ')
    
    # Handle missing spaces after sentences
    cleanr = re.compile('[.!?,;][A-Za-z][A-Za-z]')
    no_spaces = re.findall(cleanr, no_audience_reacts)

    new_text = no_audience_reacts

    if len(no_spaces) > 0:
        for match in no_spaces:
            punc = match[0] # get the punctuation mark
            word = match[1:] # get the start of the word
            if punc != '?':
                new_text = re.sub(f"{match}", f"{punc} {word}", new_text)
            else:
                new_text = re.sub(f"[?]{word}", f"? {word}", new_text)
    
    # Handle numbers with commas
    clean_commas = re.compile('[0-9]+,[0-9][0-9][0-9]')
    num_commas = re.findall(clean_commas, new_text)
    
    no_commas = new_text

    if len(num_commas) > 0:
        for match in num_commas:
            replace_str = match.replace(',', '')
            no_commas = re.sub(f"{match}", f"{replace_str}", no_commas)
    
    # Handle numbers with hyphens
    clean_numbers = re.compile('[0-9]+-')
    numbers = re.findall(clean_numbers, no_commas)
    
    no_numbers = no_commas

    if len(numbers) > 0:
        for match in numbers:
            no_numbers = re.sub(f"{match}", "# ", no_numbers)
    
    # Remove quotation marks
    no_quotes = no_numbers.replace('\"', ' ').replace('”', ' ').replace('’', '')
    
    # Address hyphenation issue -- need to revisit
    no_ism = no_quotes.replace('-ism', 'ism')
    # no_ism = new_text.replace('-ism', 'ism')
    no_dash = no_ism.replace('–', ' ').replace('—', ' ')
    
    # Remove parentheses
    no_parentheses = no_dash.replace('(', ' ').replace(')', ' ')
    
    # Remove .., ..., ....
    no_ellipses = no_parentheses.replace('....', '').replace('...', '').replace('..','').replace('…', '')
    
    # Remove music notes
    no_notes = no_ellipses.replace('♪', '').replace('♫', '')
    
    # SPECIFIC RULE
    no_spec = no_notes.replace('R and D', 'research and development').replace('R & D', 'research and development')
    
    # Replace all whitespace with one space
    cleantext = ' '.join(no_spec.split())
    cleantext = cleantext.strip()
    
    # Lemmatize here
    # Creating our token object, which is used to create documents with linguistic annotations.
    # we disabled the parser and ner parts of the pipeline in order to speed up parsing
    mytokens = nlp(cleantext.lower(), disable=['parser', 'ner'])

    # Lemmatizing each token and converting each token into lowercase
    lemmas = []
    for word in mytokens:
        if word.pos_ == 'NOUN':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.NOUN))
        elif word.pos_ == 'VERB':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.VERB))
        elif word.pos_ == 'ADV':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.ADV))
        elif word.pos_ == 'ADJ':
            lemmas.append(lemmatizer.lemmatize(word.text.lower().strip(), wordnet.ADJ))

    lemmas = [word for word in lemmas if word not in stop_words and word not in punctuations]
    
    # return preprocessed list of tokens
    return lemmas

  clean_parentheses = re.compile('\(.*\)')


In [144]:
lemmatizer = WordNetLemmatizer()

In [145]:
def get_new_transcript(lemmas):
    transcript = ''
    for lemma in lemmas:
        transcript += lemma + ' '
    return transcript.strip()

In [164]:
test_sentence = "(Applause.) One year, a 5,000-year-old man ate a peach."

In [168]:
test_sentence.replace('(', '&')

'&Applause.) One year, a 5,000-year-old man ate a peach.'

In [191]:
spacy_tokenizer(test_sentence)

  One year, a # year-old man ate a peach.


['year', 'yearold', 'man', 'eat', 'peach']

# Test Tokenizer on Sentence-level and Speech-level

In [131]:
sentence_1 = "So we knew the numbers were deplorable, and it was manifesting itself in our own organization. So we decided — actually, I\'ll just give you a quick example of a young engineer, a 25-year-old woman, who told us a great story about her daily life, to just exemplify it for us. People, people, place, person, personally."

In [28]:
# remove html tags from all of the text before processing
cleanr = re.compile('[.!?,][A-Z][A-Za-z]+')
cleantext = re.findall(cleanr, sentence_1)

new_sentence = sentence_1
for x in cleantext:
    new_sentence = re.sub(f"{x}", f"{x[0]} {x[1:]}", new_sentence)

In [29]:
new_sentence

"So we knew the numbers were deplorable, and it was manifesting itself in our own organization. So we decided — actually, I'll just give you a quick example of a young engineer, a 25-year-old woman, who told us a great story about her daily life, to just exemplify it for us. People, people, place, person, personally."

In [172]:
spacy_tokenizer(sentence_1)

['so',
 'know',
 'number',
 'deplorable',
 'manifest',
 'own',
 'organization',
 'so',
 'decide',
 'actually',
 "'ll",
 'just',
 'give',
 'quick',
 'example',
 'young',
 'engineer',
 'yearold',
 'woman',
 'tell',
 'great',
 'story',
 'daily',
 'life',
 'just',
 'exemplify',
 'people',
 'people',
 'place',
 'person',
 'personally']

In [173]:
transcript_1 = talk_df.transcript[0]

In [174]:
sentence = "This sentence has no problems."
spacy_tokenizer(sentence)

['sentence', 'problem']

In [175]:
transcript_1

'So I\'ve been thinking about how to explain this concept to you, and I\'ve decided I\'m just going to start with something we all understand. To achieve great heights or change the world, no matter how smart we are, we all need people. And for conventional people, the universe seems to conspire to make them successful. For the unconventional, I think we need something that I like to call "co-conspirators."Co-conspirators are different not because they\'re different themselves, but because of the people who need them. They tend to be people who are willing to bend the rules — actually even break them sometimes — and challenge the status quo to stand beside someone who is going against societal norms. I\'m actually going to describe an experience that I had that first crystallized the idea of co-conspirators in my mind.In 2014, I was a corporate executive with an American multinational in India, and we were actually faced with an interesting problem: we didn\'t have enough women in the 

In [207]:
tokenized = spacy_tokenizer(transcript_1)

In [208]:
tokenized

['think',
 'explain',
 'concept',
 'decide',
 'start',
 'understand',
 'achieve',
 'great',
 'height',
 'change',
 'world',
 'matter',
 'smart',
 'need',
 'people',
 'conventional',
 'people',
 'universe',
 'conspire',
 'successful',
 'unconventional',
 'think',
 'need',
 'like',
 'co',
 'conspirator',
 'co',
 'conspirator',
 'different',
 'different',
 'people',
 'need',
 'tend',
 'people',
 'willing',
 'bend',
 'rule',
 'actually',
 'break',
 'challenge',
 'status',
 'quo',
 'stand',
 'societal',
 'norm',
 'actually',
 'describe',
 'experience',
 'crystallize',
 'idea',
 'co',
 'conspirator',
 'mind',
 'corporate',
 'executive',
 'american',
 'multinational',
 'actually',
 'face',
 'interesting',
 'problem',
 'woman',
 'workforce',
 'context',
 'percent',
 'woman',
 'work',
 'look',
 'number',
 'percent',
 'know',
 'number',
 'deplorable',
 'manifest',
 'organization',
 'decide',
 'actually',
 'quick',
 'example',
 'young',
 'engineer',
 'year',
 'old',
 'woman',
 'tell',
 'great',
 

In [216]:
from nltk.corpus import wordnet as wn
possible_adj = []
for ss in wn.synsets('accessibility'):
    for lemmas in ss.lemmas(): # all possible lemmas
        print(lemmas)
        for ps in lemmas.pertainyms(): # all possible pertainyms
            possible_adj.append(ps.name())

Lemma('handiness.n.02.handiness')
Lemma('handiness.n.02.accessibility')
Lemma('handiness.n.02.availability')
Lemma('handiness.n.02.availableness')
Lemma('approachability.n.01.approachability')
Lemma('approachability.n.01.accessibility')


In [215]:
possible_adj

[]

# Create Vectorizers
### Considerations
* Which vectorizer?
    * CountVectorizer
    * TfidfVectorizer
* Which topic modeling technique?
    * LSA (latent semantic analysis)
    * LDA (latent dirichlet allocation
* How many times can a token appear in the corpus?
    * Start: max_df = 0.9
    * Used:
        * 0.5
* How many tokens to include?
    * Start: all
    * Used:
        * 1000
        * 10000 --> best so far, but way too many as a result (with nouns, verbs, adj, adv)
* How many topics to use?
    * Used:
        * 25
        * 50
* How to evaluate topics


* Issue with lemmatization
* Issue with hyphens
* Consider numbers --> only if n-grams
* Issue with periods
* What is the most common word? for longer speeches?
* Default arguments

In [178]:
cv = CountVectorizer(tokenizer=spacy_tokenizer, max_df=0.50, min_df=3, stop_words='english')
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer, max_features=1000, min_df=3, max_df=0.50)

# Subset for data with views

In [32]:
talk_views = talk_df[talk_df.views.notnull() & talk_df.views > 0]

In [33]:
talk_views.shape

(3599, 15)

In [34]:
talk_views.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3599 entries, 0 to 3648
Data columns (total 15 columns):
date             3599 non-null object
speaker          3599 non-null object
title            3599 non-null object
url              3599 non-null object
length           3599 non-null object
summ             3599 non-null object
tags             3599 non-null object
views            3599 non-null float64
transcript       3599 non-null object
date_recorded    3510 non-null object
upload_date      3510 non-null object
occupation       3181 non-null object
bio              3193 non-null object
comments         2982 non-null float64
duration         3599 non-null int64
dtypes: float64(2), int64(1), object(12)
memory usage: 449.9+ KB


# Train, Test, Split

In [195]:
X = talk_views.transcript
y = talk_views.views

In [196]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Preprocess Transcripts

## Tokenize Transcripts

In [210]:
token_obj = X_train.apply(spacy_tokenizer)

In [211]:
X_train_new = token_obj.apply(get_new_transcript)

In [199]:
X_train_new.shape

(2879,)

In [49]:
X_train_new

2172    ecologist physicist study network meet couple ...
2425    year serve year deployment find portauprince l...
1653    tell stand scold slouch family dinner comment ...
3357    try think sync connect happiness occur reason ...
1165    come today speak liar lawsuit laughter time he...
181     winter visit grandmother house place way mosqu...
873     object timeless object think work life masterp...
1150    o'clock morning pitch son bed sleep sleep teen...
1452                                  trip time life wall
2576    want try tell story page lesson learn result s...
375     decision today week year intelligence build li...
2673    role try explain people technology come think ...
760     gage love ace geography exam remember location...
629     fish trouble cod population collapse 1990s fis...
3508    listen scientist morning talk experiment reali...
1864    think rhythm element music find world ocean ti...
2586    energy conversation boil question die oil war ...
2224    joke c

In [63]:
X_train[2172]

'Eric Berlow: I\'m an ecologist, and Sean\'s a physicist, and we both study complex networks. And we met a couple years ago when we discovered that we had both given a short TED Talk about the ecology of war, and we realized that we were connected by the ideas we shared before we ever met. And then we thought, you know, there are thousands of other talks out there, especially TEDx Talks, that are popping up all over the world. How are they connected, and what does that global conversation look like? So Sean\'s going to tell you a little bit about how we did that.Sean Gourley: Exactly. So we took 24,000 TEDx Talks from around the world, 147 different countries, and we took these talks and we wanted to find the mathematical structures that underly the ideas behind them. And we wanted to do that so we could see how they connected with each other.And so, of course, if you\'re going to do this kind of stuff, you need a lot of data. So the data that you\'ve got is a great thing called YouTub

In [61]:
for tokens in X_train_new:
    print(tokens)

ecologist physicist study network meet couple year discover talk ecology war realize connect idea share meet think know thousand talk talk pop world connect conversation look tell bit talk world country talk want find structure idea want connect course kind stuff need lot data data thing youtube pull information comment view watch watch comment pull use translation pull transcript work people accent transcript thing language processing algorithm read computer line line extract concept concept form structure idea memeome memeome know mathematics underlie idea analysis want share idea memeome idea course idea borrow steal build memeome talk compare memeome talk similarity create link represent graph connect theory let work practice footprint talk year explode world way corner analyze percent start connection occur connect talk image beauty connect conversation radiate outwards data footprint conversation happen run limit projection computer technology allow space network projection apply

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Stemming?

In [249]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [252]:
stemmed = []
for doc in token_obj:
    stemmed.append([stemmer.stem(token) for token in doc])

In [256]:
stem_obj = pd.DataFrame(stemmed)

In [259]:
X_train_new = [get_new_transcript(doc) for doc in stemmed]

In [41]:
cv_new = CountVectorizer()

# Find percent match

In [265]:
test = ['anestheseology', 'anestheseologist', 'clinic', 'clinician', 
        'belief', 'believe', 'believer', 'built', 'build', 'builder', 'building']

### CountVectorizer

In [209]:
# fit and transform training data
dtm_train_cv = cv_new.fit_transform(X_train_new)

In [201]:
len(cv_new.get_feature_names())

28702

In [182]:
dtm_train_cv_df = pd.DataFrame(dtm_train_cv.toarray())
dtm_train_cv_df.columns = cv_new.get_feature_names()

In [202]:
cv_new.get_feature_names()[0:1000]

['0025',
 '100',
 '100th',
 '10th',
 '10x',
 '11',
 '11th',
 '12',
 '1230s',
 '1250s',
 '125th',
 '12th',
 '1300s',
 '13th',
 '1400s',
 '143rd',
 '1450s',
 '14th',
 '15th',
 '15thcentury',
 '1600s',
 '16th',
 '1700s',
 '17th',
 '1800s',
 '1820s',
 '1830s',
 '1840s',
 '1850s',
 '1851',
 '1863',
 '1870s',
 '1874',
 '1880s',
 '18th',
 '18thcentury',
 '1900s',
 '1920s',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1968digital',
 '1970s',
 '1980s',
 '1990s',
 '19th',
 '1blindly',
 '1it',
 '1l',
 '1st',
 '20',
 '2000s',
 '200th',
 '2011',
 '2030s',
 '20th',
 '21st',
 '21stcentury',
 '22nd',
 '23rd',
 '24th',
 '25th',
 '26th',
 '27th',
 '28th',
 '29th',
 '2d',
 '2n',
 '2nd',
 '30',
 '30th',
 '360s',
 '37th',
 '3d',
 '3dprinted',
 '3rd',
 '3x3',
 '40',
 '40th',
 '44th',
 '49ers',
 '4d',
 '4th',
 '4ths',
 '50',
 '50th',
 '52ndfreest',
 '57th',
 '5k',
 '5th',
 '60',
 '60k',
 '60s',
 '60th',
 '61st',
 '67th',
 '69th',
 '6th',
 '70',
 '701',
 '75th',
 '78',
 '7th',
 '80',
 '8th',
 '90',
 '95th',
 '9s

## TfidfVectorizer

In [225]:
tfidf_new = TfidfVectorizer(max_features=1000, min_df=3, max_df=0.50)

In [226]:
# fit and transform training data
dtm_train_tfidf = tfidf_new.fit_transform(X_train_new)
# dtm_test_tfidf = tfidf_new.transform(X_test)

In [228]:
len(tfidf_new.get_feature_names())

1000

In [230]:
dtm_train_tfidf_df = pd.DataFrame(dtm_train_tfidf.toarray())
dtm_train_tfidf_df.columns = tfidf_new.get_feature_names()

In [231]:
dtm_train_tfidf_df.columns

Index(['ability', 'accept', 'access', 'accord', 'account', 'achieve', 'act',
       'action', 'activity', 'actor',
       ...
       'winter', 'wire', 'wish', 'woman', 'wonder', 'word', 'worker', 'worry',
       'write', 'youth'],
      dtype='object', length=1000)

# Fit TruncatedSVD (LSA)

## CountVectorizer

In [52]:
lsa_cv = TruncatedSVD(n_components = 25, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [53]:
lsa_cv.fit(dtm_train_cv)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=3, random_state=42,
             tol=0.0)

In [54]:
for index,topic in enumerate(lsa_cv.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv_new.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['life', 'need', 'come', 'use', 'want', 'work', 'way', 'world', 'look', 'time', 'year', 'thing', 'know', 'think', 'people']


THE TOP 15 WORDS FOR TOPIC #1
['technology', 'find', 'design', 'water', 'animal', 'system', 'light', 'energy', 'cancer', 'look', 'neuron', 'body', 'use', 'cell', 'brain']


THE TOP 15 WORDS FOR TOPIC #2
['stuff', 'talk', 'book', 'way', 'bit', 'computer', 'design', 'lot', 'kind', 'idea', 'want', 'look', 'know', 'thing', 'think']


THE TOP 15 WORDS FOR TOPIC #3
['life', 'patient', 'want', 'story', 'feel', 'tell', 'child', 'body', 'neuron', 'woman', 'know', 'think', 'cell', 'people', 'brain']


THE TOP 15 WORDS FOR TOPIC #4
['lot', 'energy', 'build', 'company', 'drug', 'think', 'power', 'world', 'neuron', 'city', 'technology', 'use', 'cell', 'brain', 'people']


THE TOP 15 WORDS FOR TOPIC #5
['look', 'care', 'car', 'doctor', 'disease', 'start', 'system', 'data', 'use', 'health', 'work', 'drug', 'cell', 'patient', 'cancer']


THE TOP 15

In [55]:
# Create Document - Topic Matrix
lsa_output = lsa_cv.transform(dtm_train_cv)

# column names
topicnames = ["Topic" + str(i) for i in range(lsa_cv.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train_new))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lsa_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic
Doc0,18.1,1.57,7.17,-2.29,1.74,-0.34,2.62,3.66,-1.5,-2.63,-0.79,-0.65,2.17,-2.48,-2.97,2.59,1.34,2.01,6.22,-2.39,3.71,0.31,0.78,0.87,-4.18,0
Doc1,9.98,0.2,-1.42,-1.36,-1.18,1.33,0.88,-2.16,-1.8,0.97,0.71,1.9,-1.06,-1.06,-0.35,-0.39,-0.69,0.82,-0.38,-0.51,1.79,1.17,1.18,0.12,2.73,0
Doc2,4.75,3.31,-0.71,0.58,-0.16,0.43,-0.04,-1.22,-2.49,-1.26,-0.12,-0.55,0.11,-0.15,-0.12,0.47,-0.06,1.04,-0.17,0.18,1.09,-1.14,-0.27,0.87,2.67,0
Doc3,1.6,0.13,1.16,0.49,-0.15,-0.02,0.2,0.69,0.57,-0.1,-0.24,0.36,0.33,0.11,0.32,-0.07,-0.62,-0.76,0.04,0.01,-0.02,-0.35,0.27,-0.07,0.37,0
Doc4,7.06,-2.82,1.12,2.3,1.05,-1.12,-1.08,1.03,-0.46,-0.82,1.84,2.73,0.2,-0.18,1.04,-0.89,-1.66,-0.61,-0.89,0.56,-0.79,-1.01,0.67,-0.25,0.5,0
Doc5,0.28,0.09,-0.03,0.0,-0.04,-0.04,-0.08,0.07,-0.08,0.03,-0.02,0.01,0.05,-0.18,0.05,0.02,-0.2,-0.01,0.1,0.01,0.15,-0.15,-0.13,-0.04,0.26,0
Doc6,4.94,-0.06,1.98,0.65,-0.28,-0.79,-0.18,1.43,-0.26,-0.46,0.43,1.19,-0.05,0.3,1.39,-1.44,-1.46,-1.25,0.13,0.8,-0.95,-0.88,-0.21,-0.02,0.33,0
Doc7,3.68,0.07,-0.85,0.44,-1.94,0.38,2.08,0.64,-0.46,1.06,0.18,0.07,-0.29,-0.12,1.13,-1.25,-0.19,-0.19,-0.33,0.74,-1.24,-1.29,-0.41,-0.29,0.66,0
Doc8,0.33,0.08,-0.08,0.01,-0.32,-0.18,-0.36,-0.01,-0.22,0.06,0.01,0.05,-0.21,0.39,0.15,-0.42,0.04,0.01,0.23,0.15,-0.2,0.08,-0.31,-0.18,-0.04,13
Doc9,2.72,0.16,1.7,0.88,-0.73,0.71,0.63,0.7,-0.29,-0.06,0.36,0.6,0.03,-0.33,-0.17,-0.21,0.32,0.55,-0.91,0.09,1.0,0.03,-0.12,0.04,-0.15,0


In [56]:
unique, counts = np.unique(dominant_topic, return_counts=True)

In [57]:
print(np.asarray((unique, counts)).T)

[[   0 2605]
 [   1   65]
 [   2    5]
 [   3    8]
 [   4    8]
 [   5    7]
 [   6    9]
 [   7    7]
 [   8   10]
 [   9    4]
 [  10   25]
 [  11   14]
 [  12    1]
 [  13    6]
 [  14   24]
 [  15    3]
 [  16    8]
 [  17   14]
 [  18    8]
 [  19    2]
 [  20   21]
 [  21    8]
 [  22    1]
 [  23    6]
 [  24   10]]


## TfidfVectorizer

In [232]:
lsa_tfidf = TruncatedSVD(n_components = 25, algorithm = 'randomized', n_iter = 3, random_state = 42)

In [233]:
lsa_tfidf.fit(dtm_train_tfidf)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=3, random_state=42,
             tol=0.0)

In [235]:
for index,topic in enumerate(lsa_tfidf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf_new.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['create', 'system', 'build', 'city', 'group', 'music', 'job', 'man', 'ask', 'country', 'power', 'idea', 'woman', 'technology', 'story']


THE TOP 15 WORDS FOR TOPIC #1
['love', 'man', 'night', 'remember', 'clock', 'wake', 'song', 'teenager', 'play', 'woman', 'story', 'brother', 'memory', 'music', 'sleep']


THE TOP 15 WORDS FOR TOPIC #2
['fight', 'narrative', 'future', 'democracy', 'book', 'therapist', 'write', 'war', 'country', 'police', 'government', 'letter', 'journalist', 'woman', 'story']


THE TOP 15 WORDS FOR TOPIC #3
['bank', 'body', 'live', 'wear', 'cell', 'size', 'bone', 'boat', 'limb', 'fish', 'cave', 'river', 'dinosaur', 'story', 'animal']


THE TOP 15 WORDS FOR TOPIC #4
['architecture', 'trust', 'stuff', 'play', 'newspaper', 'musician', 'computer', 'song', 'city', 'building', 'augment', 'page', 'design', 'robot', 'music']


THE TOP 15 WORDS FOR TOPIC #5
['fuel', 'carbon', 'cool', 'sun', 'dollar', 'fire', 'power', 'building', 'heat', 'electric

In [237]:
import numpy as np
# Create Document - Topic Matrix
lsa_output = lsa_tfidf.transform(dtm_train_tfidf)

# column names
topicnames = ["Topic" + str(i) for i in range(lsa_tfidf.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train_new))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lsa_output, 4), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Get next dominant topic for each document
sorted_topics = np.argsort(df_document_topic.values, axis = 1)
secondary_topic = [doc[-2] for doc in sorted_topics]
df_document_topic['seconary_topic'] = secondary_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic,seconary_topic
Doc0,0.3185,-0.1447,-0.1185,-0.1885,0.0584,0.0662,-0.018,0.0452,0.1354,0.0936,0.0175,0.1426,0.1293,0.0163,-0.073,0.1414,0.0886,-0.1212,-0.2945,0.074,-0.0497,0.015,-0.033,-0.0737,-0.1298,0,11
Doc1,0.1781,0.0564,0.0767,-0.0114,-0.1101,-0.0469,0.071,0.0047,0.0627,-0.1535,-0.1474,0.0594,0.0371,0.0441,0.0871,-0.0287,0.1274,0.1342,0.1243,0.0016,-0.0019,-0.0368,0.1699,0.1983,-0.1446,23,23
Doc2,0.1942,0.0731,-0.1968,0.1396,0.0138,-0.0796,-0.1588,-0.2729,0.1433,0.0997,-0.037,0.0365,-0.1895,0.0128,-0.0378,0.1095,-0.0907,0.0037,-0.0079,0.1741,0.0522,-0.03,0.0017,0.1834,0.02,0,23
Doc3,0.2993,-0.0835,-0.1507,0.1329,0.1059,-0.0388,-0.074,0.0203,0.0574,-0.0933,0.0645,-0.1012,-0.0248,0.1208,0.0318,0.1059,0.1258,-0.0016,-0.1672,0.0419,-0.1023,0.0125,0.0857,-0.1532,-0.1048,0,3
Doc4,0.2785,0.0486,0.1424,0.0645,-0.0711,0.0141,-0.0062,-0.0278,-0.0269,0.0594,0.1367,-0.0607,-0.0703,-0.0932,-0.0448,-0.0198,-0.1395,0.0043,0.0366,0.0399,-0.0619,0.071,0.1296,-0.0342,0.1255,0,2
Doc5,0.3682,-0.0327,0.0709,-0.0335,0.008,0.0504,0.1336,-0.0034,-0.1071,-0.0316,-0.1519,0.0098,0.1847,-0.0693,-0.0661,0.0638,0.06,-0.1552,0.08,0.0029,0.0938,-0.1655,0.1987,0.1434,0.1491,0,22
Doc6,0.1683,-0.016,-0.0352,0.0584,0.072,0.0314,0.0161,0.0399,0.3987,0.2473,0.1274,-0.1865,-0.1301,-0.2909,-0.0572,0.0817,0.1432,0.0,0.2124,0.0819,0.2578,-0.0966,0.0463,-0.1217,-0.0292,8,8
Doc7,0.2389,0.5902,-0.3353,-0.1774,-0.2573,-0.0401,-0.089,0.1392,-0.1133,0.0272,-0.0598,-0.0211,0.0399,0.0173,-0.0018,0.0474,-0.0654,0.0071,0.0445,0.1035,0.0369,-0.0171,-0.0304,-0.0452,0.0178,1,1
Doc8,0.3256,0.1424,0.3006,0.1777,-0.019,0.2715,-0.1461,-0.0307,-0.1672,0.0216,0.0483,0.0959,0.0809,-0.021,0.0029,0.0912,0.0678,0.0447,0.0618,0.0414,0.1102,0.1132,-0.0727,-0.1017,-0.077,0,2
Doc9,0.2075,0.0706,0.0452,0.001,0.1904,0.0649,-0.1936,0.0614,0.0766,0.0112,-0.2194,-0.0685,-0.0325,-0.0128,0.3567,0.0407,0.0219,0.1591,-0.0994,0.0343,-0.0918,-0.0577,-0.0339,0.2049,0.0616,14,14


In [238]:
np.argmax(df_document_topic.values, axis=1)

array([26, 25, 26, 26, 26, 26, 25, 25, 26, 25, 26, 26, 25, 25, 25, 25, 25,
       26, 25, 26, 25, 26, 26, 26, 26, 26, 25, 26, 26, 25, 25, 25, 26, 26,
       26, 25, 25, 26, 25, 26, 25, 26, 26, 25, 25, 25, 25, 26, 25, 25, 26,
       25, 26, 25, 26, 25, 25, 26, 25, 25, 25, 26, 26, 26, 25, 25, 26, 26,
       26, 26, 26, 25, 25, 26, 26, 25, 26, 25, 26, 25, 25, 25, 26, 25, 25,
       26, 25, 25, 26, 26, 26, 25, 26, 25, 25, 25, 26, 26, 26, 25])

In [239]:
unique, counts = np.unique(dominant_topic, return_counts=True)

In [240]:
print(np.asarray((unique, counts)).T)

[[ 0 50]
 [ 1  4]
 [ 2  2]
 [ 3  7]
 [ 4  3]
 [ 5  4]
 [ 6  2]
 [ 7  2]
 [ 8  2]
 [10  1]
 [11  1]
 [12  2]
 [13  3]
 [14  3]
 [16  3]
 [17  1]
 [18  2]
 [19  1]
 [20  1]
 [21  2]
 [22  1]
 [23  2]
 [24  1]]


In [241]:
unique, counts = np.unique(secondary_topic, return_counts=True)

In [242]:
print(np.asarray((unique, counts)).T)

[[ 1  5]
 [ 2  6]
 [ 3 10]
 [ 4  6]
 [ 5  4]
 [ 6  2]
 [ 7  3]
 [ 8  4]
 [ 9  3]
 [10  3]
 [11  8]
 [12  3]
 [13  5]
 [14  4]
 [15  2]
 [16  3]
 [17  6]
 [18  2]
 [19  3]
 [20  4]
 [21  5]
 [22  3]
 [23  4]
 [24  2]]


# Fit LDA

## CountVectorizer

## TfidfVectorizer

In [177]:
lda_tfidf = LatentDirichletAllocation(n_components=25,           # Number of topics
                                      max_iter=5,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=100,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )

In [178]:
lda_tfidf.fit_transform(dtm_train_tfidf)

array([[0.00297412, 0.00297412, 0.00297412, ..., 0.00297412, 0.92862114,
        0.00297412],
       [0.00470189, 0.00470189, 0.00470189, ..., 0.00470189, 0.88715454,
        0.00470189],
       [0.00448913, 0.00448913, 0.00448913, ..., 0.00448913, 0.89226077,
        0.00448913],
       ...,
       [0.00389272, 0.00389272, 0.00389272, ..., 0.00389272, 0.90657474,
        0.00389272],
       [0.00534758, 0.00534758, 0.00534758, ..., 0.00534758, 0.87165807,
        0.00534758],
       [0.00340726, 0.00340726, 0.00340726, ..., 0.00340726, 0.91822585,
        0.00340726]])

In [179]:
for index,topic in enumerate(lda_tfidf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['process', 'banjo', 'critical', 'light', 'play', 'chant', 'astrophysicist', 'clinical', 'extremist', 'limb', 'phantom', 'trial', 'cancer', 'music', 'map']


THE TOP 15 WORDS FOR TOPIC #1
['transcript', 'director', 'company', 'mobile', 'org', 'conversation', 'chart', 'phone', 'team', 'structure', 'gold', 'innovation', 'rhythm', 'technology', 'beat']


THE TOP 15 WORDS FOR TOPIC #2
['donor', 'arm', 'refine', 'flight', 'dilemma', 'augment', 'consensus', 'patient', 'scaffold', 'liver', 'group', 'transplant', 'stem', 'organ', 'cell']


THE TOP 15 WORDS FOR TOPIC #3
['oil', 'unit', 'planet', 'beetle', 'refrigeration', 'star', 'diplomat', 'antibiotic', 'posture', 'dung', 'fuel', 'cloud', 'joint', 'car', 'knee']


THE TOP 15 WORDS FOR TOPIC #4
['basic', 'labor', 'plank', 'beast', 'mountain', 'exile', 'person', 'door', 'mouth', 'chinese', 'fire', 'sun', 'tree', 'character', 'hercule']


THE TOP 15 WORDS FOR TOPIC #5
['young', 'prison', 'camp', 'bomb', 'suspicious'

In [180]:
# Create Document - Topic Matrix
lda_output = lda_tfidf.transform(dtm_train_tfidf)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_tfidf.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(X_train))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Get next dominant topic for each document
sorted_topics = np.argsort(df_document_topic.values, axis = 1)
secondary_topic = [doc[-2] for doc in sorted_topics]
df_document_topic['seconary_topic'] = secondary_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(100).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,dominant_topic,seconary_topic
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93,0.0,23,23
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.0,23,23
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.89,0.0,23,23
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.94,0.0,23,23
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,23,23
Doc5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.93,0.0,23,23
Doc6,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.88,0.01,23,23
Doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,23,23
Doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.91,0.0,23,23
Doc9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.88,0.0,23,23


In [144]:
unique, counts = np.unique(dominant_topic, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   1 2871]
 [   8    6]
 [  11    2]]
