# Topic Modeling - Mallet LDA


Author: Ryan Timbrook (RTIMBROO)<br>
DATE: 12/7/2019<br>
Topic: Perform Topic Modeling on NFL Tweet Text

## 1. Objective
_____________________________________________________________________________________________
Topic Modeling

LDA is an algorithm that can “summarize” the main topics of a text collection.

Topic modeling will be performed at the NFL Type level collected by NFL Game Schedule Week.
* Coach
* Team
* Player


### Topic Modeling References
* [sklearn LatentDirichletAllocation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)
* [Topic Modeling with SciKit Learn](https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730)
* [Complete Guide to Topic Modeling](https://nlpforhackers.io/topic-modeling/)
* [Topic Modelling with SciKit-learn -- Derek Greene University College Dublin](http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf)


______________________________________________________________________________________________
### Coding Environment Setup
Import packages

In [1]:
# import packages for analysis and modeling
import pandas as pd #data frame operations
import numpy as np #arrays and math functions
np.random.seed(42)

import matplotlib.pyplot as plt #2D plotting
%matplotlib inline
import seaborn as sns #
import os
import sys
import io
from os import path
import re
import random
import json
from datetime import date
from datetime import time
from datetime import datetime
import warnings
from timeit import default_timer               # performance processing time
import logging                                 # logging framework
warnings.filterwarnings('ignore')

# 
import nltk
nltk.download('wordnet')
from nltk import PorterStemmer
from nltk.stem import PorterStemmer 
stemmer = PorterStemmer()

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem.porter import *
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import itemfreq

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from scipy.stats import zscore


## IMPORTANT - you must install gensim first ##
## conda install -c anaconda gensim
from gensim import models, corpora
from gensim.utils import simple_preprocess, lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

## conda install -c conda-forge pyldavis
import pyLDAvis.sklearn as LDAvis
import pyLDAvis
pyLDAvis.enable_notebook()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rt310\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  """


In [2]:
# custome python packages
import rtimbroo_utils as br             # custome python helper functions

In [3]:
# get current date
now = datetime.utcnow().isoformat()
collection_date = re.findall('^[0-9]{4}-[0-9]{2}-[0-9]{2}',now)
collection_date

['2019-12-12']

In [4]:
# set global properties
notebook_file_name = 'nfl_tweets_topic_modeling'
report_file_name = 'nfl_tweets_topic_modeling'
app_name = 'nfl_tweets_topic_modeling'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# setup working directory structure
# set global properties
dataDir = './data'
outputDir = './output'
configDir = './config'
logOutDir = './logs'
imageDir = './images'
modelDir = './models'
corpusDir = './corpus'


In [5]:
# create base output directories if they don't exist
if not os.path.exists(outputDir): os.mkdir(outputDir)
if not os.path.exists(logOutDir): os.mkdir(logOutDir)
if not os.path.exists(imageDir): os.mkdir(imageDir)
if not os.path.exists(modelDir): os.mkdir(modelDir)
if not os.path.exists(corpusDir): os.mkdir(corpusDir)

In [6]:
# get a logger for troubleshooting / data exploration
logger = br.getFileLogger(logOutDir+'/',app_name+'_'+collection_date[0],level=log_level)

[NLTK 3.4.5](https://www.nltk.org/index.html)
* [nltk.stem.wordnet](https://www.nltk.org/_modules/nltk/stem/wordnet.html)
* [WordNetLemmatizer lemmatize](https://www.nltk.org/api/nltk.stem.html?highlight=wordnetlemmatizer#nltk.stem.wordnet.WordNetLemmatizer.lemmatize)

In [7]:
# gensim lemmatize function -- requires pattern package
# lemmatize each word to its root form, keeping only nouns(NN), adjectives(JJ), verbs(VB) and adverbs(RB).
# We keep only these POS tags because they are the ones contributing the most to the meaning of the sentences
# this implementation allows Nouns(NN), Adjectives(JJ), and Pronouns(RB) - topic keyword choices
def gen_lemmatize(text):
        stems = []
        tokens = text.split(' ')
        for i,word in enumerate(tokens):
            try:
            
                lem_word = lemmatize(word,allowed_tags=re.compile('(NN|VB|JJ|RB)'))

                if lem_word:
                    stems = stems + [lem_word[0].split(b'/')[0].decode('utf-8')]
                    
            except BaseException as be:
                logger.error(f'***ERROR***: Caught BaseException: {be} | word_idx:[{i}] | word:[{word}]')
                continue

        return ' '.join(stems)

## 2. OBTAIN the data   
________________________________________________________________________________________________


In [8]:
# pre-processed data file
train = pd.read_csv(f'{dataDir}/nfl_master_sent_merged_timeseries.csv', encoding='latin')

logger.info(f'train shape: {train.shape}')
logger.info(f'train size: {train.size}')
logger.info(f'train len: {len(train)}')
logger.info(f'train info: {train.info()}')



# nfl foster list to use for custom stop word removal
nfl_roster = pd.read_csv(f'{dataDir}/nfl_teams_roster_data.csv', encoding='utf8')
nfl_coaches_df = pd.read_csv(f'{dataDir}/nfl_coaches_list.csv', encoding='utf8')

train shape: (9928, 17)
train size: 168776
train len: 9928
train info: None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9928 entries, 0 to 9927
Data columns (total 17 columns):
id                  9928 non-null int64
created_at          9928 non-null object
date                9928 non-null object
time                9928 non-null object
user                9928 non-null object
favorite_count      9928 non-null float64
year                9928 non-null int64
month               9928 non-null int64
day_of_month        9928 non-null int64
day_of_week         9928 non-null int64
nfl_type            9928 non-null object
nfl_schedule_wk     9928 non-null int64
text                9870 non-null object
text_clean          9708 non-null object
sentiment_scores    9928 non-null object
sentiment           9928 non-null object
sentiment_class     9928 non-null int64
dtypes: float64(1), int64(7), object(9)
memory usage: 1.3+ MB


In [9]:
train.head()

Unnamed: 0,id,created_at,date,time,user,favorite_count,year,month,day_of_month,day_of_week,nfl_type,nfl_schedule_wk,text,text_clean,sentiment_scores,sentiment,sentiment_class
0,1200925546487504897,Sat Nov 30 23:52:17 +0000 2019,2019-11-30,23:52:17,miamidolphin12,1398.0,2019,11,30,5,team,13,RT SEVENTEEN!,seventeen,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral,1
1,1200925257407827968,Sat Nov 30 23:51:08 +0000 2019,2019-11-30,23:51:08,JT_Evans97,28503.0,2019,11,30,5,player,13,You mean the same fan experts who could see La...,mean fan experts could see would great,"{'neg': 0.0, 'neu': 0.439, 'pos': 0.561, 'comp...",positive,2
2,1200924548700495872,Sat Nov 30 23:48:19 +0000 2019,2019-11-30,23:48:19,MigiziLaFern,161.0,2019,11,30,5,player,13,Deshaun Watson or Sam Darnold? WhoShouldIStart,whoshouldistart,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral,1
3,1200924401002303498,Sat Nov 30 23:47:43 +0000 2019,2019-11-30,23:47:43,DelindaTierney,3148.0,2019,11,30,5,team,13,"Since 2012, Tom Brady has won 7 straight games...",since straight games ppg ypg ypa intÃÂ¢ÃÂÃ...,"{'neg': 0.0, 'neu': 0.808, 'pos': 0.192, 'comp...",positive,2
4,1200922963375280129,Sat Nov 30 23:42:01 +0000 2019,2019-11-30,23:42:01,TifdanyBrooks,10198.0,2019,11,30,5,player,13,RT Top 5 current NFL quarterbacks 1. Tom Bra...,top current quarterbacks russel watsonÃÂ¢ÃÂ...,"{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compou...",positive,2


In [10]:
nfl_roster.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
team      32 non-null object
year      32 non-null int64
roster    32 non-null object
text      32 non-null object
dtypes: int64(1), object(3)
memory usage: 1.1+ KB


In [11]:
nfl_roster.head()

Unnamed: 0,team,year,roster,text
0,green_bay_packers,2019,"['Jaire Alexander', 'Davante Adams', 'Yosuah N...",{'team_roster_news': {'2019-12-07T01:47:01.090...
1,arizona_cardinals,2019,"['Chandler Jones', 'Larry Fitzgerald', 'Keesea...",{'team_roster_news': {'2019-12-07T01:47:01.927...
2,baltimore_ravens,2019,"['Denzel Rice', 'Randin Crecelius', 'Kalil Mor...",{'team_roster_news': {'2019-12-07T01:47:03.116...
3,chicago_bears,2019,"['Khalil Mack', 'Josh Caldwell', 'Bobby Massie...",{'team_roster_news': {'2019-12-07T01:47:04.113...
4,cincinnati_bengals,2019,"['LaRoy Reynolds', 'Dre Kirkpatrick', 'Andy Da...",{'team_roster_news': {'2019-12-07T01:47:05.368...


In [12]:
nfl_coaches_df.head()

Unnamed: 0,Team,Coach,Season,W,L,T,W%,W.1,L.1,T.1,W%.1
0,Arizona Cardinals,Kliff Kingsbury,2019,3,6,1,0.35,3,6,1,0.35
1,Atlanta Falcons,Dan Quinn,2015,37,34,0,0.521,37,34,0,0.521
2,Baltimore Ravens,John Harbaugh,2008,109,74,0,0.596,109,74,0,0.596
3,Buffalo Bills,Sean McDermott,2017,20,18,0,0.526,20,18,0,0.526
4,Carolina Panthers,Perry Fewell*,2019,0,0,0,0.0,0,0,0,0.0


## Create NFL Stop Words Lists
----------------------------------------------------------------------------------------------------------

In [13]:
# NFL Teams Stop Words
nfl_teams = [t.replace('_', ' ') for t in nfl_roster['team']]
#logger.info(f'{nfl_teams}')

In [14]:
# NFL Players Stop Words
nfl_teams_players = [re.sub(r'\'|"|-|\.','',p.lower()) for p in nfl_roster['roster']]

nfl_players = []
for i, team_list in enumerate(nfl_teams_players):
    team_players = team_list.split(',')
    for player in team_players:
        nfl_players.append(re.sub(r'\[|\]','',player.strip()))
    #if i == 2: break
        
#logger.info(f'team count: {i+1} | player count: {len(nfl_players)} | \n{nfl_players}')

In [15]:
# NFL Coaches Stop Words
nfl_coaches = nfl_coaches_df['Coach']
nfl_coaches = [re.sub(r'\*|\'','',c.lower()) for c in nfl_coaches]

In [16]:
# dump stop words to file to be used by other modules
with open(f'{dataDir}/nfl_stop_words_config.txt', 'w+') as f:
    
    nfl_stop_words = {'coaches':nfl_coaches,'players':nfl_players,'teams':nfl_teams}
    json.dump(nfl_stop_words,f)
    

In [17]:
nfl_types_stop_words = []

for coach in nfl_coaches:
    nfl_types_stop_words.append(coach)
    
for player in nfl_players:
    nfl_types_stop_words.append(player)
    
for team in nfl_teams:
    nfl_types_stop_words.append(team)

In [18]:
nfl_types_stop_words_v2 = [name for names in nfl_types_stop_words for name in names.split()]
nfl_types_stop_words_v2[:10]

['kliff',
 'kingsbury',
 'dan',
 'quinn',
 'john',
 'harbaugh',
 'sean',
 'mcdermott',
 'perry',
 'fewell']

### 3.2 Topic Modeling - SciKit Learn package
--------------------------------------------------------------------------------------------------


Resources:
* [sklearn.decomposition.LatentDirichletAllocation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)
* [Topic Modelling with SciKit-learn - Insight, Derek Greene -- University College Dublin](http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf)
* [sklearn.feature_extraction.text.CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
* [sklearn.feature_extraction.text.TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)

In [19]:
# function to display the topic model top 10 feature words
def display_topics(model, feature_names, no_top_words):
    top_ids = []
    top_words = []
    
    for topic_idx, topic in enumerate(model.components_):
        top_ids.append(str(topic_idx))
        logger.debug("Topic %d:" % (topic_idx))
        top_10 = " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        top_words.append(top_10)
        logger.debug(top_10)
    
    top_df = pd.DataFrame()
    top_df['Topic']=top_ids
    top_df['Top_10_Features']=top_words
    
    return top_df
no_top_words = 10

In [20]:
# vectorization parameters
no_features = 1000
max_df = 0.95
min_df = 2
max_features = no_features # None
vect_input = 'content'
encoding = 'latin'
decode_error = 'strict'
stop_words = nfl_types_stop_words_v2

#### Build a CountVectorizer data model

In [21]:
# this block is just to filter out non-utf8 text from the data set. it was causing the Vectorizer classes to fail
train_dict = train[['id','text_clean','sentiment_class','nfl_type']]
text_tokens = []
text_ids = []
text_classes = []
text_nfl_type = []

for index, row in train_dict.iterrows():
    try:
        # add lemitization
        tokens = word_tokenize(gen_lemmatize(str(row['text_clean'])))
        
        text_tokens.append(tokens)
        text_ids.append(row['id'])
        text_classes.append(row['sentiment_class'])
        text_nfl_type.append(row['nfl_type'])
    except BaseException as be:
        logger.warning(f'****WARNING***: Caught Exception Tokenizing text: {be}')
        #pass


***ERROR***: Caught BaseException: generator raised StopIteration | word_idx:[0] | word:[seventeen]
***ERROR***: Caught BaseException: generator raised StopIteration | word_idx:[0] | word:[mean]
***ERROR***: Caught BaseException: generator raised StopIteration | word_idx:[3] | word:[could]


In [22]:
train_clean_df = pd.DataFrame()
text = []
for doc in text_tokens:
    text.append(' '.join(doc))


train_clean_df['id'] = text_ids
train_clean_df['text'] = text
train_clean_df['sentiment_class'] = text_classes
train_clean_df['nfl_type'] = text_nfl_type
train_clean_df.shape

(9928, 4)

In [23]:
coaches_text = train_clean_df[train_clean_df['nfl_type']=='coach'].text
players_text = train_clean_df[train_clean_df['nfl_type']=='player'].text
teams_text = train_clean_df[train_clean_df['nfl_type']=='team'].text

In [24]:
# ---- TRAIN VECTORIZER COACHES-----#
# build vectorizer from content
#train_documents or test_documents - in-memory content for training versus pulling from hard disk
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
coach_train_tf_vectorizer = CountVectorizer(input='content',stop_words=stop_words, max_df=max_df, min_df=min_df, max_features=max_features,encoding=encoding)
coach_train_tf_vec_model = coach_train_tf_vectorizer.fit_transform(coaches_text)
coach_train_tf_vec_feature_names = coach_train_tf_vectorizer.get_feature_names()
logger.info(f'coach_train_tf_vec_model shape: [{coach_train_tf_vec_model.shape}]')

coach_train_tf_vec_model shape: [(288, 222)]


In [25]:
# ---- TRAIN VECTORIZER PLAYERS-----#
# build vectorizer from content
#train_documents or test_documents - in-memory content for training versus pulling from hard disk
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
player_train_tf_vectorizer = CountVectorizer(input='content',stop_words=stop_words, max_df=max_df, min_df=min_df, max_features=max_features,encoding=encoding)
player_train_tf_vec_model = player_train_tf_vectorizer.fit_transform(players_text)
player_train_tf_vec_feature_names = player_train_tf_vectorizer.get_feature_names()
logger.info(f'player_train_tf_vec_model shape: [{player_train_tf_vec_model.shape}]')

player_train_tf_vec_model shape: [(4828, 1000)]


In [26]:
# ---- TRAIN VECTORIZER TEAMS-----#
# build vectorizer from content
#train_documents or test_documents - in-memory content for training versus pulling from hard disk
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
team_train_tf_vectorizer = CountVectorizer(input='content',stop_words=stop_words, max_df=max_df, min_df=min_df, max_features=max_features,encoding=encoding)
team_train_tf_vec_model = team_train_tf_vectorizer.fit_transform(teams_text)
team_train_tf_vec_feature_names = team_train_tf_vectorizer.get_feature_names()
logger.info(f'team_train_tf_vec_model shape: [{team_train_tf_vec_model.shape}]')

team_train_tf_vec_model shape: [(4812, 1000)]


## Build Topic Models
* [LDA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)

**from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD**

#### Build a Latent Dirichlet Allocation Model
object parameters:
* LatentDirichletAllocation(n_components=10, doc_topic_prior=None, topic_word_prior=None, learning_method=’batch’, learning_decay=0.7, learning_offset=10.0, max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0, perp_tol=0.1, mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=None, verbose=0, random_state=None)

Latent Dirichlet Allocation with online variational Bayes algorithm

In [27]:
# LDA algorithm parameters
n_topics = 10            # int, optional (default=10) - Number of Topics
max_iter=5
learning_method='online' # ‘batch’ | ‘online’, default=’batch’ -- Method used to update _component. Only used in fit method. In general, if the data size is large, the online update will be much faster than the batch update.
learning_offset=50.
random_state=42

In [28]:
# COACH --- instantiate the lda object
coach_lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=max_iter, 
                                learning_method=learning_method, 
                                learning_offset=learning_offset,
                                random_state=random_state)
# build the lda model
with br.elapsed_timer() as elapsed:
    coach_train_cong_lda_model = coach_lda.fit_transform(coach_train_tf_vec_model)
    logger.info(f'Coach Elapsed time training LDA mode: [{elapsed()}]')
    logger.info(f'Coach train_cong_lda_model shape: [{coach_train_cong_lda_model.shape}]') #(NO_DOCUMENTS, NO_TOPICS)

Coach Elapsed time training LDA mode: [0.21228719999999868]
Coach train_cong_lda_model shape: [(288, 10)]


In [29]:
# PLAYER --- instantiate the lda object
player_lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=max_iter, 
                                learning_method=learning_method, 
                                learning_offset=learning_offset,
                                random_state=random_state)
# build the lda model
with br.elapsed_timer() as elapsed:
    player_train_cong_lda_model = player_lda.fit_transform(player_train_tf_vec_model)
    logger.info(f'Player Elapsed time training LDA mode: [{elapsed()}]')
    logger.info(f'Player train_cong_lda_model shape: [{player_train_cong_lda_model.shape}]') #(NO_DOCUMENTS, NO_TOPICS)

Player Elapsed time training LDA mode: [2.6423711999999995]
Player train_cong_lda_model shape: [(4828, 10)]


In [30]:
# TEAM ---- instantiate the lda object
team_lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=max_iter, 
                                learning_method=learning_method, 
                                learning_offset=learning_offset,
                                random_state=random_state)
# build the lda model
with br.elapsed_timer() as elapsed:
    team_train_cong_lda_model = team_lda.fit_transform(team_train_tf_vec_model)
    logger.info(f'Team Elapsed time training LDA mode: [{elapsed()}]')
    logger.info(f'Team train_cong_lda_model shape: [{team_train_cong_lda_model.shape}]') #(NO_DOCUMENTS, NO_TOPICS)

Team Elapsed time training LDA mode: [2.7854177000000035]
Team train_cong_lda_model shape: [(4812, 10)]


### Evaluate Topic Models

In [31]:
# First document of each of the three model types
logger.info(f'COACH LDA --- 1st doc in corpus:\n {coach_train_cong_lda_model[0]}')
logger.info(f'PLAYER LDA --- 1st doc in corpus:\n {player_train_cong_lda_model[0]}')
logger.info(f'TEAM LDA --- 1st doc in corpus:\n {team_train_cong_lda_model[0]}')

COACH LDA --- 1st doc in corpus:
 [0.03333345 0.03333354 0.03333358 0.03333349 0.03333344 0.03333373
 0.03333694 0.03333344 0.69999497 0.03333342]
PLAYER LDA --- 1st doc in corpus:
 [0.02500255 0.025      0.02500161 0.02500004 0.02500659 0.025
 0.52662897 0.025      0.025      0.27336022]
TEAM LDA --- 1st doc in corpus:
 [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]


In [32]:
# display top 10 words fitted to each of the topic model spaces
coach_lda_top_10 = display_topics(coach_lda, coach_train_tf_vec_feature_names, no_top_words)
coach_lda_top_10.to_csv(f'{outputDir}/coach_lda_top_10.csv', index=False)
coach_lda_top_10.head()

Unnamed: 0,Topic,Top_10_Features
0,0,stand always espn cornerback chronicle release...
1,1,week check seasonã start head coachgm letã blo...
2,2,trading away quite left franchise askingã what...
3,3,office oversee front rumble offseason fly some...
4,4,longterm move follow donã seem coach base week...


In [33]:
player_lda_top_10 = display_topics(player_lda, player_train_tf_vec_feature_names, no_top_words)
player_lda_top_10.to_csv(f'{outputDir}/player_lda_top_10.csv', index=False)
player_lda_top_10.head()

Unnamed: 0,Topic,Top_10_Features
0,0,draft trade season electric pick accuracy tco ...
1,1,need hof hou det reach attention marino fewest...
2,2,int start starter national tds come make back ...
3,3,qbs defense face last next falk take break run...
4,4,calculate valuation recent clemson birthday hi...


In [34]:
team_lda_top_10 = display_topics(team_lda, team_train_tf_vec_feature_names, no_top_words)
team_lda_top_10.to_csv(f'{outputDir}/team_lda_top_10.csv', index=False)
team_lda_top_10.head()

Unnamed: 0,Topic,Top_10_Features
0,0,game report team injury week fan friday amp st...
1,1,sign roster place active practice list squad r...
2,2,october come join welcome detail conroe saturd...
3,3,week game texas preview beat matchup tomorrow ...
4,4,work get texas want back drop democrat stop fe...


### Evaluate Topic Models Visually


In [35]:
## COACH --- LDA TOPIC MODEL VISUALIZATION
coach_lda_panel = LDAvis.prepare(coach_lda, coach_train_tf_vec_model, coach_train_tf_vectorizer, mds='tsne')
pyLDAvis.display(coach_lda_panel)

In [36]:
## Player --- LDA TOPIC MODEL VISUALIZATION
player_lda_panel = LDAvis.prepare(player_lda, player_train_tf_vec_model, player_train_tf_vectorizer, mds='tsne')
pyLDAvis.display(player_lda_panel)

In [37]:
## Team --- LDA TOPIC MODEL VISUALIZATION
team_lda_panel = LDAvis.prepare(team_lda, team_train_tf_vec_model, team_train_tf_vectorizer, mds='tsne')
pyLDAvis.display(team_lda_panel)

In [38]:
del coach_lda
del coach_train_tf_vec_model
del coach_train_tf_vectorizer
del player_lda
del player_train_tf_vec_model
del player_train_tf_vectorizer
del team_lda
del team_train_tf_vec_model
del team_train_tf_vectorizer
