In [1]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import spacy
# NOTE(review): the return value of this load is discarded, so nothing below
# uses the loaded pipeline; presumably it only checks that the 'en' model is
# installed -- confirm whether this call is still needed.
spacy.load('en')
from spacy.lang.en import English
# Language object that tokenize() calls on raw text to obtain tokens.
parser = English()
def tokenize(text):
    """Split ``text`` into normalized tokens using the module-level ``parser``.

    Whitespace-only tokens are dropped, URL-like tokens become the
    placeholder ``'URL'``, tokens starting with ``'@'`` become
    ``'SCREEN_NAME'``, and every other token is lower-cased.

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def _normalize(tok):
        # The URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalize(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [3]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet ``morphy`` base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a newly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rupamacharyya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Fetch the NLTK stopwords corpus (the captured output shows it was already
# present on the author's machine).
nltk.download('stopwords')
# English stopwords held in a set; prepare_text_for_lda tests membership
# against it for every token.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rupamacharyya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def prepare_text_for_lda(text):
    """Turn one piece of text into the token list used for LDA.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``en_stop`` are discarded; each surviving token
    is mapped through ``get_lemma``.

    Returns:
        list of str (possibly empty).
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

In [6]:
import glob
import pickle
def load_pickle(pickle_file):
    """Load and return the object stored in ``pickle_file``.

    The file is first unpickled with the default settings. If that raises
    ``UnicodeDecodeError``, the file is re-opened and unpickled with
    ``encoding='latin1'``.

    Args:
        pickle_file: path of the pickle file to read.

    Returns:
        The unpickled object.

    Raises:
        Exception: any error from opening or unpickling the file is reported
            on stdout and then re-raised unchanged. This includes errors
            raised by the latin1 fallback attempt.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this
    on files from a trusted source.
    """
    try:
        try:
            with open(pickle_file, 'rb') as f:
                pickle_data = pickle.load(f)
        except UnicodeDecodeError:
            # Retry inside the outer ``try`` so that a failure of the fallback
            # is reported by the handler below instead of bypassing it.
            with open(pickle_file, 'rb') as f:
                pickle_data = pickle.load(f, encoding='latin1')
    except Exception as e:
        print('Unable to load data ', pickle_file, ':', e)
        raise
    return pickle_data

def text_prepare(talk):
    """Tokenize every sentence of a talk transcript.

    ``talk`` is iterated as a sequence of sentence groups. Groups equal to
    ``['(Laughter)']`` or ``['(Applause)']`` are skipped. Every sentence of
    the remaining groups is passed through ``prepare_text_for_lda``, and only
    non-empty token lists are kept.

    Returns:
        list of token lists, one per sentence that produced any tokens.
    """
    per_sentence_tokens = (
        prepare_text_for_lda(sentence)
        for group in talk
        if not (group == ['(Laughter)'] or group == ['(Applause)'])
        for sentence in group
    )
    return [toks for toks in per_sentence_tokens if toks]

# Tokenize every talk pickle and collect the per-sentence token lists in
# ted_dic, keyed by the talk id found under talk['talk_meta']['id'].
# NOTE(review): the glob pattern is an absolute path on the author's machine;
# point it at your own copy of the data before running.
ted_dic = {}
doc_num = 0
for file in glob.glob("/Users/rupamacharyya/DeepFairness/Data/TED_meta/*.pkl"):
    talk = load_pickle(file)
    ted_dic[talk['talk_meta']['id']] = text_prepare(talk['talk_transcript'])
    # Progress marker every 100 files.
    if doc_num % 100 == 0:
        print("Processing Document: ", doc_num)
    doc_num += 1

# Persist the tokenized talks; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('tokenized_text.pkl', 'wb') as out_file:
    pickle.dump(ted_dic, out_file)


Processing Document:  0
Processing Document:  100
Processing Document:  200
Processing Document:  300
Processing Document:  400
Processing Document:  500
Processing Document:  600
Processing Document:  700
Processing Document:  800
Processing Document:  900
Processing Document:  1000
Processing Document:  1100
Processing Document:  1200
Processing Document:  1300
Processing Document:  1400
Processing Document:  1500
Processing Document:  1600
Processing Document:  1700
Processing Document:  1800
Processing Document:  1900
Processing Document:  2000
Processing Document:  2100
Processing Document:  2200
Processing Document:  2300
Processing Document:  2400


In [8]:
# NOTE(review): unpinned install run from inside the notebook; the captured
# output below shows boto 2.49.0 was installed. No cell visible here imports
# boto directly -- presumably it is needed by another dependency; confirm and
# consider pinning the version in an environment file instead.
!pip install boto

Collecting boto
[?25l  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 1.4MB/s eta 0:00:01
[?25hInstalling collected packages: boto
Successfully installed boto-2.49.0


In [9]:
from gensim import corpora
def create_dict_and_corpus(text_data):
    """Build a gensim dictionary and bag-of-words corpus from token lists.

    Args:
        text_data: iterable of token lists; it is traversed twice (once to
            build the dictionary, once to build the corpus).

    Returns:
        (dictionary, corpus): the ``corpora.Dictionary`` built from
        ``text_data`` and a list holding ``dictionary.doc2bow(text)`` for
        each entry of ``text_data``.
    """
    vocab = corpora.Dictionary(text_data)
    bow_corpus = list(map(vocab.doc2bow, text_data))
    return vocab, bow_corpus
#     import pickle
#     pickle.dump(corpus, open('corpus.pkl', 'wb'))
#     dictionary.save('dictionary.gensim')
  

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [14]:
import gensim
# Number of topics: passed to LdaModel and to show_topics in create_lda_model.
NUM_TOPICS = 5
# Number of top words requested per topic from show_topics.
NUM_WORDS = 10
def create_lda_model(corpus, dictionary, num_topics=None, num_words=None,
                     passes=15, random_state=None):
    """Train a gensim LDA model and return its topics together with the model.

    Args:
        corpus: bag-of-words corpus passed to ``LdaModel``.
        dictionary: id-to-word mapping passed as ``id2word``.
        num_topics: number of topics to fit and to show; defaults to the
            module-level ``NUM_TOPICS`` when ``None``.
        num_words: number of words to show per topic; defaults to the
            module-level ``NUM_WORDS`` when ``None``.
        passes: number of training passes over the corpus (default 15).
        random_state: seed forwarded to ``LdaModel``. LDA training is
            stochastic, so pass an int to get repeatable topics; the default
            ``None`` keeps the unseeded behaviour.

    Returns:
        (topics, ldamodel): the result of ``ldamodel.show_topics(...)`` with
        ``formatted=False`` and the trained model.
    """
    # Resolve the defaults at call time so later changes to the module-level
    # constants are still honoured.
    if num_topics is None:
        num_topics = NUM_TOPICS
    if num_words is None:
        num_words = NUM_WORDS

    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    topics = ldamodel.show_topics(num_topics=num_topics, num_words=num_words, log=False, formatted=False)
    return topics, ldamodel


In [15]:
# Build one gensim dictionary and one bag-of-words corpus per talk, keyed by
# the same keys as ted_dic.
corpus_dic, dictionary_dic = {}, {}
topics_dic = {}
doc_num = 0
for key in ted_dic:
    # Progress marker every 100 talks.
    if doc_num % 100 == 0:
        print("Processing Document: ", doc_num)
    doc_num += 1
    dictionary, corpus = create_dict_and_corpus(ted_dic[key])
    corpus_dic[key] = corpus
    dictionary_dic[key] = dictionary

# Persist both mappings as a single (corpus_dic, dictionary_dic) tuple; the
# context manager guarantees the file is flushed and closed.
with open('corpus_dictionary.pkl', 'wb') as out_file:
    pickle.dump((corpus_dic, dictionary_dic), out_file)

Processing Document:  0
Processing Document:  100
Processing Document:  200
Processing Document:  300
Processing Document:  400
Processing Document:  500
Processing Document:  600
Processing Document:  700
Processing Document:  800
Processing Document:  900
Processing Document:  1000
Processing Document:  1100
Processing Document:  1200
Processing Document:  1300
Processing Document:  1400
Processing Document:  1500
Processing Document:  1600
Processing Document:  1700
Processing Document:  1800
Processing Document:  1900
Processing Document:  2000
Processing Document:  2100
Processing Document:  2200
Processing Document:  2300
Processing Document:  2400


In [12]:
# Reload the (corpus_dic, dictionary_dic) tuple from 'corpus_dictionary.pkl'.
# NOTE(review): this cell's execution count (12) is lower than that of the
# cell above that writes the file (15), so the cells were run out of order.
corpus_dic, dictionary_dic = load_pickle('corpus_dictionary.pkl')

In [24]:
# Train one LDA model per talk and keep both the model and its topic list,
# keyed by the same keys as ted_dic.
topics_dic = {}
ldamodel_dic = {}
doc_num = 0
for key in ted_dic:
    # Progress marker every 100 talks.
    if doc_num % 100 == 0:
        print("Processing Document: ", doc_num)
    doc_num += 1
    corpus, dictionary = corpus_dic[key], dictionary_dic[key]
    topics, ldamodel = create_lda_model(corpus, dictionary)
    topics_dic[key] = topics
    ldamodel_dic[key] = ldamodel

# Persist the results; context managers guarantee each file is flushed and
# closed. The topics file name encodes NUM_TOPICS and NUM_WORDS.
with open('ldamodel_dic.pkl', 'wb') as out_file:
    pickle.dump(ldamodel_dic, out_file)
with open('topics_dic_' + str(NUM_TOPICS) + '_' + str(NUM_WORDS) + '.pkl', 'wb') as out_file:
    pickle.dump(topics_dic, out_file)

Processing Document:  0
Processing Document:  100
Processing Document:  200
Processing Document:  300
Processing Document:  400
Processing Document:  500
Processing Document:  600
Processing Document:  700
Processing Document:  800
Processing Document:  900
Processing Document:  1000
Processing Document:  1100
Processing Document:  1200
Processing Document:  1300
Processing Document:  1400
Processing Document:  1500
Processing Document:  1600
Processing Document:  1700
Processing Document:  1800
Processing Document:  1900
Processing Document:  2000
Processing Document:  2100
Processing Document:  2200
Processing Document:  2300
Processing Document:  2400


In [19]:
# Reload the per-talk topics from the file whose name encodes the current
# NUM_TOPICS and NUM_WORDS values.
topics_dic = load_pickle('topics_dic_'+str(NUM_TOPICS)+'_'+str(NUM_WORDS)+'.pkl')

In [22]:
# Show the topics of one talk: the key at index 3 of ldamodel_dic.
# NOTE(review): ldamodel_dic is only assigned in the training cell, so this
# cell needs that cell to have run in the current kernel session.
key = list(ldamodel_dic.keys())[3]
topics_dic[key]

[(0,
  [('important', 0.075964384),
   ('artist', 0.036761),
   ('culture', 0.028892092),
   ('become', 0.026511902),
   ('people', 0.021444738),
   ('muslim', 0.016338008),
   ('within', 0.016337728),
   ('together', 0.016328879),
   ('discuss', 0.011260906),
   ('would', 0.01123882)]),
 (1,
  [('could', 0.018614808),
   ('abaya', 0.01859119),
   ('people', 0.018582525),
   ('woman', 0.018580401),
   ('culture', 0.015836418),
   ('cultural', 0.013703359),
   ('institution', 0.013144027),
   ('filmmaker', 0.01279239),
   ('qatari', 0.012781894),
   ('learning', 0.012781889)]),
 (2,
  [('cultural', 0.053667907),
   ('people', 0.03318813),
   ('qatar', 0.027860152),
   ('nation', 0.027789775),
   ('think', 0.022478098),
   ('young', 0.022466417),
   ('development', 0.021928204),
   ('identity', 0.020136856),
   ('region', 0.017188113),
   ('happening', 0.011773157)]),
 (3,
  [('idea', 0.01988492),
   ('welcome', 0.019869674),
   ('understand', 0.019869242),
   ('voice', 0.019846061),
   

In [23]:
# Reload the persisted models into ldamodel_dic, the name this cell actually
# reads, so the visualization does not depend on the training cell having run
# in the current kernel session.
ldamodel_dic = load_pickle('ldamodel_dic.pkl')
# The original cell bound the loaded dict to this name; kept as an alias.
lda_model = ldamodel_dic
# Visualize the talk at index 3 of ldamodel_dic.
key = list(ldamodel_dic.keys())[3]
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel_dic[key], corpus_dic[key], dictionary_dic[key])
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
